This change adjusts the cost modeling used when the target does not have a schedule model with individual instruction latencies. After this change, we use the default latency information available from TargetSchedule. The default latency information essentially ends up treating most instructions as latency 1, with a few "expensive" ones getting a higher cost.
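As a rough sketch of what that fallback amounts to (an illustration only, not the actual TargetSchedule implementation), the per-instruction cost in the absence of a scheduling model looks roughly like:

  // Illustrative only: with no per-instruction scheduling model, most
  // instructions are costed at one cycle, while a few "expensive" kinds
  // (loads are used here as an assumed example) get a higher default.
  enum class Kind { Simple, Load };

  unsigned defaultLatency(Kind K) {
    return K == Kind::Load ? 4 : 1; // 4 is a hypothetical default load latency
  }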
Previously, we unconditionally applied the first legal pattern, without any consideration of profitability. As a result, this change both prevents some patterns from being applied and changes which patterns are exercised (i.e. previously the first pattern was always applied; now the second one may be chosen instead because the first wasn't profitable).
The motivation here is twofold.
First, this brings the default behavior in line with the behavior when -mcpu or -mtune is specified. This improves test coverage, and generally makes it less likely we will have bad surprises when providing more information to the compiler.
Second, this enables some reassociation for ILP by default. Despite being unconditionally enabled, the prior code tended to "reassociate" repeatedly through an entire chain, simply moving the first operand to the end. The result was still a serial chain, just a different one. With this change, one of the intermediate transforms is judged unprofitable and we end up with a partially flattened tree.
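To make the effect concrete (a hypothetical C++ example, not taken from the test diffs): for a chain of dependent additions, the old behavior produced a different but still serial chain, whereas rebalancing part of the expression exposes independent additions that can execute in parallel.

  // Hypothetical illustration of reassociation for ILP.
  long serialChain(long a, long b, long c, long d) {
    return ((a + b) + c) + d;   // critical path of three dependent adds
  }

  long partiallyFlattened(long a, long b, long c, long d) {
    return (a + b) + (c + d);   // two independent adds, then one combine:
  }                             // critical path of two adds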
Note that the resulting code diffs show significant room for improvement in the basic algorithm. I am intentionally excluding those improvements from this patch.
For the test diffs, I don't see any concerning regressions. I took a fairly close look at the RISCV ones, but only skimmed the x86 (particularly vector x86) changes.
Differential Revision: https://reviews.llvm.org/D141017
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineTraceMetrics::Trace BlockTrace) {
SmallVector<unsigned, 16> InstrDepth;
- assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
- "Missing machine model\n");
-
// For each instruction in the new sequence compute the depth based on the
// operands. Use the trace information when possible. For new operands which
// are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
MachineTraceMetrics::Trace BlockTrace) {
- assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
- "Missing machine model\n");
-
// Check each definition in NewRoot and compute the latency
unsigned NewRootLatency = 0;
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineCombinerPattern Pattern,
bool SlackIsAccurate) {
- assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
- "Missing machine model\n");
// Get depth and latency of NewRoot and Root.
unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
// Eagerly stop after the first pattern fires.
Changed = true;
break;
- } else if (!TSchedModel.hasInstrSchedModelOrItineraries()) {
- LLVM_DEBUG(dbgs() << "\t Replacing due to lack of schedule model\n");
- insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
- RegUnits, TII, P, IncrementalUpdate);
- // Eagerly stop after the first pattern fires.
- Changed = true;
- break;
} else {
// For big basic blocks, we only compute the full trace the first time
// we hit this. We do not invalidate the trace, but instead update the
define i64 @addc_adde(i64 %a, i64 %b) nounwind {
; RV32I-LABEL: addc_adde:
; RV32I: # %bb.0:
+; RV32I-NEXT: add a1, a1, a3
; RV32I-NEXT: add a2, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
-; RV32I-NEXT: add a0, a3, a0
; RV32I-NEXT: add a1, a1, a0
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
; RV32I-LABEL: subc_sube:
; RV32I: # %bb.0:
; RV32I-NEXT: sltu a4, a0, a2
-; RV32I-NEXT: add a3, a3, a4
; RV32I-NEXT: sub a1, a1, a3
+; RV32I-NEXT: sub a1, a1, a4
; RV32I-NEXT: sub a0, a0, a2
; RV32I-NEXT: ret
%1 = sub i64 %a, %b
; RISCV32-NEXT: sltu a5, a6, a5
; RISCV32-NEXT: mulhu a6, a0, a3
; RISCV32-NEXT: mulhu t0, a1, a2
-; RISCV32-NEXT: add a5, a5, t0
-; RISCV32-NEXT: add a5, a5, a7
-; RISCV32-NEXT: mul a7, a1, a3
-; RISCV32-NEXT: add a5, a5, a7
+; RISCV32-NEXT: add a6, a6, t0
; RISCV32-NEXT: add a5, a6, a5
+; RISCV32-NEXT: add a5, a5, a7
+; RISCV32-NEXT: mul a6, a1, a3
+; RISCV32-NEXT: add a5, a5, a6
; RISCV32-NEXT: bgez a1, .LBB0_2
; RISCV32-NEXT: # %bb.1:
; RISCV32-NEXT: sub a5, a5, a2
; RV32IMB-NEXT: li a2, 29
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: addi a0, a2, 1073
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
-; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_accept_a3:
; RV32IMB-NEXT: li a2, 23
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 50
; RV32IMB-NEXT: addi a0, a0, 1119
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
-; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_accept_b3:
; RV32IMB-NEXT: li a2, 29
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 14
; RV32IMB-NEXT: addi a0, a0, -185
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
-; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_a3:
; RV32IMB-NEXT: li a2, 73
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 18
; RV32IMB-NEXT: addi a0, a0, -728
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
-; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_c3:
; RV32IMB-NEXT: mulhu a2, a0, a2
; RV32IMB-NEXT: sh1add a1, a1, a1
; RV32IMB-NEXT: slli a1, a1, 6
+; RV32IMB-NEXT: add a1, a2, a1
; RV32IMB-NEXT: sh1add a0, a0, a0
-; RV32IMB-NEXT: slli a3, a0, 6
+; RV32IMB-NEXT: slli a2, a0, 6
; RV32IMB-NEXT: lui a0, 47
; RV32IMB-NEXT: addi a0, a0, -512
-; RV32IMB-NEXT: add a0, a3, a0
-; RV32IMB-NEXT: sltu a3, a0, a3
-; RV32IMB-NEXT: add a1, a1, a3
-; RV32IMB-NEXT: add a1, a2, a1
+; RV32IMB-NEXT: add a0, a2, a0
+; RV32IMB-NEXT: sltu a2, a0, a2
+; RV32IMB-NEXT: add a1, a1, a2
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_d3:
; RV32IMB-NEXT: li a2, 29
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 14
; RV32IMB-NEXT: addi a0, a0, -185
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
-; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_e3:
; RV32IMB-NEXT: li a2, 29
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 14
; RV32IMB-NEXT: addi a0, a0, -145
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
-; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_f3:
; RV32IMB-NEXT: li a2, 73
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 2
; RV32IMB-NEXT: addi a0, a0, -882
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
-; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_g3:
; RV32IMB-NEXT: addi a2, a2, -1096
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 2
; RV32IMB-NEXT: addi a0, a0, 798
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
-; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: mul3000_add8990_c:
; RV32IMB-NEXT: addi a2, a2, -1096
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 1048574
; RV32IMB-NEXT: addi a0, a0, -798
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
-; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: addi a1, a1, -1
; RV32IMB-NEXT: ret
;
; RV32IMB-NEXT: addi a2, a2, 1096
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
-; RV32IMB-NEXT: sub a1, a0, a1
+; RV32IMB-NEXT: sub a3, a3, a0
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 2
; RV32IMB-NEXT: addi a0, a0, 798
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
-; RV32IMB-NEXT: sub a1, a1, a2
-; RV32IMB-NEXT: sub a1, a3, a1
+; RV32IMB-NEXT: add a1, a1, a2
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: mulneg3000_add8990_c:
; RV32IMB-NEXT: addi a2, a2, 1096
; RV32IMB-NEXT: mul a1, a1, a2
; RV32IMB-NEXT: mulhu a3, a0, a2
-; RV32IMB-NEXT: sub a1, a0, a1
+; RV32IMB-NEXT: sub a3, a3, a0
+; RV32IMB-NEXT: add a1, a3, a1
; RV32IMB-NEXT: mul a2, a0, a2
; RV32IMB-NEXT: lui a0, 1048574
; RV32IMB-NEXT: addi a0, a0, -798
; RV32IMB-NEXT: add a0, a2, a0
; RV32IMB-NEXT: sltu a2, a0, a2
-; RV32IMB-NEXT: sub a1, a1, a2
-; RV32IMB-NEXT: sub a1, a3, a1
+; RV32IMB-NEXT: add a1, a1, a2
; RV32IMB-NEXT: addi a1, a1, -1
; RV32IMB-NEXT: ret
;
;
; RV32I-LABEL: add:
; RV32I: # %bb.0:
+; RV32I-NEXT: add a1, a1, a3
; RV32I-NEXT: add a2, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
-; RV32I-NEXT: add a0, a3, a0
; RV32I-NEXT: add a1, a1, a0
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
; RV32I-LABEL: sub:
; RV32I: # %bb.0:
; RV32I-NEXT: sltu a4, a0, a2
-; RV32I-NEXT: add a3, a3, a4
; RV32I-NEXT: sub a1, a1, a3
+; RV32I-NEXT: sub a1, a1, a4
; RV32I-NEXT: sub a0, a0, a2
; RV32I-NEXT: ret
%1 = sub i64 %a, %b
; RV32I-NEXT: and a2, a0, a2
; RV32I-NEXT: slli a2, a2, 8
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: ret
;
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 8
; RV64I-NEXT: slliw a0, a0, 24
-; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
; RV32I-NEXT: and a4, a1, a3
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a2, a4, a2
+; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: or a2, a1, a2
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a3, a0, a3
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a1, a0, a1
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
; RV64I-NEXT: srli a5, a0, 8
; RV64I-NEXT: srliw a5, a5, 24
; RV64I-NEXT: slli a5, a5, 24
+; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: or a1, a3, a1
-; RV64I-NEXT: or a1, a5, a1
; RV64I-NEXT: and a4, a0, a4
; RV64I-NEXT: slli a4, a4, 24
; RV64I-NEXT: srliw a3, a0, 24
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 40
; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: or a2, a2, a3
-; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
; RV32I-NEXT: and a2, a0, a2
; RV32I-NEXT: slli a2, a2, 8
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: lui a2, 61681
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 8
; RV64I-NEXT: slliw a0, a0, 24
-; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: lui a2, 61681
; RV32I-NEXT: and a4, a1, a3
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a2, a4, a2
+; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: srli a2, a1, 4
; RV32I-NEXT: lui a4, 61681
; RV32I-NEXT: and a3, a0, a3
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: and a1, a1, a4
; RV64I-NEXT: srli a5, a0, 8
; RV64I-NEXT: srliw a5, a5, 24
; RV64I-NEXT: slli a5, a5, 24
+; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: or a1, a3, a1
-; RV64I-NEXT: or a1, a5, a1
; RV64I-NEXT: and a4, a0, a4
; RV64I-NEXT: slli a4, a4, 24
; RV64I-NEXT: srliw a3, a0, 24
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 40
; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: or a2, a2, a3
-; RV64I-NEXT: lui a3, %hi(.LCPI6_0)
-; RV64I-NEXT: ld a3, %lo(.LCPI6_0)(a3)
-; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: lui a2, %hi(.LCPI6_0)
+; RV64I-NEXT: ld a2, %lo(.LCPI6_0)(a2)
+; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: and a1, a1, a3
-; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: lui a2, %hi(.LCPI6_1)
; RV64I-NEXT: ld a2, %lo(.LCPI6_1)(a2)
; RV64I-NEXT: slli a0, a0, 4
; RV32I-FPELIM-LABEL: callee_aligned_stack:
; RV32I-FPELIM: # %bb.0:
; RV32I-FPELIM-NEXT: lw a0, 0(a2)
-; RV32I-FPELIM-NEXT: lw a1, 0(sp)
-; RV32I-FPELIM-NEXT: lw a2, 8(sp)
-; RV32I-FPELIM-NEXT: lw a3, 16(sp)
-; RV32I-FPELIM-NEXT: lw a4, 20(sp)
-; RV32I-FPELIM-NEXT: add a1, a7, a1
-; RV32I-FPELIM-NEXT: add a1, a1, a2
-; RV32I-FPELIM-NEXT: add a1, a1, a3
-; RV32I-FPELIM-NEXT: add a1, a1, a4
+; RV32I-FPELIM-NEXT: lw a1, 8(sp)
+; RV32I-FPELIM-NEXT: lw a2, 0(sp)
+; RV32I-FPELIM-NEXT: lw a3, 20(sp)
+; RV32I-FPELIM-NEXT: lw a4, 16(sp)
+; RV32I-FPELIM-NEXT: add a0, a0, a7
+; RV32I-FPELIM-NEXT: add a1, a2, a1
; RV32I-FPELIM-NEXT: add a0, a0, a1
+; RV32I-FPELIM-NEXT: add a3, a4, a3
+; RV32I-FPELIM-NEXT: add a0, a0, a3
; RV32I-FPELIM-NEXT: ret
;
; RV32I-WITHFP-LABEL: callee_aligned_stack:
; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-WITHFP-NEXT: addi s0, sp, 16
; RV32I-WITHFP-NEXT: lw a0, 0(a2)
-; RV32I-WITHFP-NEXT: lw a1, 0(s0)
-; RV32I-WITHFP-NEXT: lw a2, 8(s0)
-; RV32I-WITHFP-NEXT: lw a3, 16(s0)
-; RV32I-WITHFP-NEXT: lw a4, 20(s0)
-; RV32I-WITHFP-NEXT: add a1, a7, a1
-; RV32I-WITHFP-NEXT: add a1, a1, a2
-; RV32I-WITHFP-NEXT: add a1, a1, a3
-; RV32I-WITHFP-NEXT: add a1, a1, a4
+; RV32I-WITHFP-NEXT: lw a1, 8(s0)
+; RV32I-WITHFP-NEXT: lw a2, 0(s0)
+; RV32I-WITHFP-NEXT: lw a3, 20(s0)
+; RV32I-WITHFP-NEXT: lw a4, 16(s0)
+; RV32I-WITHFP-NEXT: add a0, a0, a7
+; RV32I-WITHFP-NEXT: add a1, a2, a1
; RV32I-WITHFP-NEXT: add a0, a0, a1
+; RV32I-WITHFP-NEXT: add a3, a4, a3
+; RV32I-WITHFP-NEXT: add a0, a0, a3
; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: addi sp, sp, 16
; RV32I-FPELIM-NEXT: andi a0, a0, 255
; RV32I-FPELIM-NEXT: slli a1, a1, 16
; RV32I-FPELIM-NEXT: srli a1, a1, 16
-; RV32I-FPELIM-NEXT: add a1, a1, a2
-; RV32I-FPELIM-NEXT: xor a2, a4, t1
-; RV32I-FPELIM-NEXT: xor a3, a3, a7
-; RV32I-FPELIM-NEXT: or a2, a3, a2
-; RV32I-FPELIM-NEXT: seqz a2, a2
-; RV32I-FPELIM-NEXT: add a1, a2, a1
+; RV32I-FPELIM-NEXT: add a0, a0, a2
+; RV32I-FPELIM-NEXT: add a0, a0, a1
+; RV32I-FPELIM-NEXT: xor a1, a4, t1
+; RV32I-FPELIM-NEXT: xor a2, a3, a7
+; RV32I-FPELIM-NEXT: or a1, a2, a1
+; RV32I-FPELIM-NEXT: seqz a1, a1
; RV32I-FPELIM-NEXT: add a0, a0, a5
; RV32I-FPELIM-NEXT: add a0, a0, a6
; RV32I-FPELIM-NEXT: add a0, a0, t0
; RV32I-WITHFP-NEXT: andi a0, a0, 255
; RV32I-WITHFP-NEXT: slli a1, a1, 16
; RV32I-WITHFP-NEXT: srli a1, a1, 16
-; RV32I-WITHFP-NEXT: add a1, a1, a2
-; RV32I-WITHFP-NEXT: xor a2, a4, t1
-; RV32I-WITHFP-NEXT: xor a3, a3, a7
-; RV32I-WITHFP-NEXT: or a2, a3, a2
-; RV32I-WITHFP-NEXT: seqz a2, a2
-; RV32I-WITHFP-NEXT: add a1, a2, a1
+; RV32I-WITHFP-NEXT: add a0, a0, a2
+; RV32I-WITHFP-NEXT: add a0, a0, a1
+; RV32I-WITHFP-NEXT: xor a1, a4, t1
+; RV32I-WITHFP-NEXT: xor a2, a3, a7
+; RV32I-WITHFP-NEXT: or a1, a2, a1
+; RV32I-WITHFP-NEXT: seqz a1, a1
; RV32I-WITHFP-NEXT: add a0, a0, a5
; RV32I-WITHFP-NEXT: add a0, a0, a6
; RV32I-WITHFP-NEXT: add a0, a0, t0
; RV32I-FPELIM-NEXT: or a4, a4, a5
; RV32I-FPELIM-NEXT: xor a0, a0, a1
; RV32I-FPELIM-NEXT: xor a2, a3, a2
-; RV32I-FPELIM-NEXT: or a0, a0, a4
; RV32I-FPELIM-NEXT: or a0, a2, a0
+; RV32I-FPELIM-NEXT: or a0, a0, a4
; RV32I-FPELIM-NEXT: seqz a0, a0
; RV32I-FPELIM-NEXT: ret
;
; RV32I-WITHFP-NEXT: or a4, a4, a5
; RV32I-WITHFP-NEXT: xor a0, a0, a1
; RV32I-WITHFP-NEXT: xor a2, a3, a2
-; RV32I-WITHFP-NEXT: or a0, a0, a4
; RV32I-WITHFP-NEXT: or a0, a2, a0
+; RV32I-WITHFP-NEXT: or a0, a0, a4
; RV32I-WITHFP-NEXT: seqz a0, a0
; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-FPELIM-NEXT: or a3, a3, a4
; RV32I-FPELIM-NEXT: xor a0, a7, a0
; RV32I-FPELIM-NEXT: xor a1, a2, a1
-; RV32I-FPELIM-NEXT: or a0, a0, a3
; RV32I-FPELIM-NEXT: or a0, a1, a0
+; RV32I-FPELIM-NEXT: or a0, a0, a3
; RV32I-FPELIM-NEXT: seqz a0, a0
; RV32I-FPELIM-NEXT: ret
;
; RV32I-WITHFP-NEXT: or a3, a3, a4
; RV32I-WITHFP-NEXT: xor a0, a7, a0
; RV32I-WITHFP-NEXT: xor a1, a2, a1
-; RV32I-WITHFP-NEXT: or a0, a0, a3
; RV32I-WITHFP-NEXT: or a0, a1, a0
+; RV32I-WITHFP-NEXT: or a0, a0, a3
; RV32I-WITHFP-NEXT: seqz a0, a0
; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-FPELIM-LABEL: callee_aligned_stack:
; RV32I-FPELIM: # %bb.0:
; RV32I-FPELIM-NEXT: lw a0, 0(a2)
-; RV32I-FPELIM-NEXT: lw a1, 0(sp)
-; RV32I-FPELIM-NEXT: lw a2, 8(sp)
-; RV32I-FPELIM-NEXT: lw a3, 16(sp)
-; RV32I-FPELIM-NEXT: lw a4, 20(sp)
-; RV32I-FPELIM-NEXT: add a1, a7, a1
-; RV32I-FPELIM-NEXT: add a1, a1, a2
-; RV32I-FPELIM-NEXT: add a1, a1, a3
-; RV32I-FPELIM-NEXT: add a1, a1, a4
+; RV32I-FPELIM-NEXT: lw a1, 8(sp)
+; RV32I-FPELIM-NEXT: lw a2, 0(sp)
+; RV32I-FPELIM-NEXT: lw a3, 20(sp)
+; RV32I-FPELIM-NEXT: lw a4, 16(sp)
+; RV32I-FPELIM-NEXT: add a0, a0, a7
+; RV32I-FPELIM-NEXT: add a1, a2, a1
; RV32I-FPELIM-NEXT: add a0, a0, a1
+; RV32I-FPELIM-NEXT: add a3, a4, a3
+; RV32I-FPELIM-NEXT: add a0, a0, a3
; RV32I-FPELIM-NEXT: ret
;
; RV32I-WITHFP-LABEL: callee_aligned_stack:
; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-WITHFP-NEXT: addi s0, sp, 16
; RV32I-WITHFP-NEXT: lw a0, 0(a2)
-; RV32I-WITHFP-NEXT: lw a1, 0(s0)
-; RV32I-WITHFP-NEXT: lw a2, 8(s0)
-; RV32I-WITHFP-NEXT: lw a3, 16(s0)
-; RV32I-WITHFP-NEXT: lw a4, 20(s0)
-; RV32I-WITHFP-NEXT: add a1, a7, a1
-; RV32I-WITHFP-NEXT: add a1, a1, a2
-; RV32I-WITHFP-NEXT: add a1, a1, a3
-; RV32I-WITHFP-NEXT: add a1, a1, a4
+; RV32I-WITHFP-NEXT: lw a1, 8(s0)
+; RV32I-WITHFP-NEXT: lw a2, 0(s0)
+; RV32I-WITHFP-NEXT: lw a3, 20(s0)
+; RV32I-WITHFP-NEXT: lw a4, 16(s0)
+; RV32I-WITHFP-NEXT: add a0, a0, a7
+; RV32I-WITHFP-NEXT: add a1, a2, a1
; RV32I-WITHFP-NEXT: add a0, a0, a1
+; RV32I-WITHFP-NEXT: add a3, a4, a3
+; RV32I-WITHFP-NEXT: add a0, a0, a3
; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: addi sp, sp, 16
; RV64I-NEXT: andi a0, a0, 255
; RV64I-NEXT: slli a1, a1, 48
; RV64I-NEXT: srli a1, a1, 48
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: xor a2, a4, t1
-; RV64I-NEXT: xor a3, a3, a7
-; RV64I-NEXT: or a2, a3, a2
-; RV64I-NEXT: seqz a2, a2
-; RV64I-NEXT: add a1, a2, a1
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: xor a1, a4, t1
+; RV64I-NEXT: xor a2, a3, a7
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: seqz a1, a1
; RV64I-NEXT: add a0, a0, a5
; RV64I-NEXT: add a0, a0, a6
; RV64I-NEXT: add a0, a0, t0
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: xor a2, a3, a2
-; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: ret
%1 = icmp eq i256 %a, %b
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: xor a0, a7, a0
; RV64I-NEXT: xor a1, a2, a1
-; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: ret
%1 = icmp eq i256 %h, %j
; should only be 8-byte aligned
; RV64I-LABEL: callee_aligned_stack:
; RV64I: # %bb.0:
-; RV64I-NEXT: ld a0, 0(sp)
-; RV64I-NEXT: ld a1, 16(sp)
-; RV64I-NEXT: ld a2, 32(sp)
+; RV64I-NEXT: ld a0, 32(sp)
+; RV64I-NEXT: ld a1, 0(sp)
+; RV64I-NEXT: ld a2, 16(sp)
; RV64I-NEXT: ld a3, 40(sp)
-; RV64I-NEXT: add a0, a7, a0
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: add a5, a5, a7
+; RV64I-NEXT: add a1, a5, a1
+; RV64I-NEXT: add a0, a2, a0
+; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: add a0, a0, a3
-; RV64I-NEXT: add a0, a5, a0
; RV64I-NEXT: ret
%f_trunc = trunc i128 %f to i64
%1 = add i64 %f_trunc, %g
; RV32IF-NEXT: srli a1, a1, 16
; RV32IF-NEXT: slli a0, a0, 17
; RV32IF-NEXT: srli a0, a0, 17
-; RV32IF-NEXT: lui a2, 1048560
-; RV32IF-NEXT: or a1, a1, a2
+; RV32IF-NEXT: or a0, a0, a1
+; RV32IF-NEXT: lui a1, 1048560
; RV32IF-NEXT: or a0, a0, a1
; RV32IF-NEXT: fmv.w.x fa0, a0
; RV32IF-NEXT: ret
; RV32IFD-NEXT: srli a1, a1, 16
; RV32IFD-NEXT: slli a0, a0, 17
; RV32IFD-NEXT: srli a0, a0, 17
-; RV32IFD-NEXT: lui a2, 1048560
-; RV32IFD-NEXT: or a1, a1, a2
+; RV32IFD-NEXT: or a0, a0, a1
+; RV32IFD-NEXT: lui a1, 1048560
; RV32IFD-NEXT: or a0, a0, a1
; RV32IFD-NEXT: fmv.w.x fa0, a0
; RV32IFD-NEXT: ret
; RV64IFD-NEXT: srli a1, a1, 16
; RV64IFD-NEXT: slli a0, a0, 49
; RV64IFD-NEXT: srli a0, a0, 49
-; RV64IFD-NEXT: lui a2, 1048560
-; RV64IFD-NEXT: or a1, a1, a2
+; RV64IFD-NEXT: or a0, a0, a1
+; RV64IFD-NEXT: lui a1, 1048560
; RV64IFD-NEXT: or a0, a0, a1
; RV64IFD-NEXT: fmv.w.x fa0, a0
; RV64IFD-NEXT: ret
; RV32IF-NEXT: srli a1, a1, 16
; RV32IF-NEXT: slli a0, a0, 17
; RV32IF-NEXT: srli a0, a0, 17
-; RV32IF-NEXT: lui a2, 1048560
-; RV32IF-NEXT: or a1, a1, a2
+; RV32IF-NEXT: or a0, a0, a1
+; RV32IF-NEXT: lui a1, 1048560
; RV32IF-NEXT: or a0, a0, a1
; RV32IF-NEXT: fmv.w.x fa0, a0
; RV32IF-NEXT: ret
; RV32IFD-NEXT: slli a1, a1, 17
; RV32IFD-NEXT: srli a1, a1, 17
; RV32IFD-NEXT: lui a2, 1048560
-; RV32IFD-NEXT: or a0, a0, a2
+; RV32IFD-NEXT: or a1, a1, a2
; RV32IFD-NEXT: or a0, a1, a0
; RV32IFD-NEXT: fmv.w.x fa0, a0
; RV32IFD-NEXT: addi sp, sp, 16
; RV64IFD-NEXT: slli a0, a0, 63
; RV64IFD-NEXT: srli a0, a0, 48
; RV64IFD-NEXT: lui a2, 1048560
-; RV64IFD-NEXT: or a0, a0, a2
+; RV64IFD-NEXT: or a1, a1, a2
; RV64IFD-NEXT: or a0, a1, a0
; RV64IFD-NEXT: fmv.w.x fa0, a0
; RV64IFD-NEXT: ret
; RV32-NEXT: addi a3, a3, -820
; RV32-NEXT: mul a3, a5, a3
; RV32-NEXT: mulhu a6, a5, a4
+; RV32-NEXT: add a3, a6, a3
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a0, a1, a4
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: add a1, a6, a0
+; RV32-NEXT: mul a1, a1, a4
+; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: mul a0, a5, a4
; RV32-NEXT: ret
;
; RV32I-NEXT: neg a0, a3
; RV32I-NEXT: snez a2, a3
; RV32I-NEXT: srai a1, a1, 1
-; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: sub a1, a2, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sub a1, a1, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: sdiv64_pow2_negative_2:
; RV32I-NEXT: neg a0, a3
; RV32I-NEXT: snez a2, a3
; RV32I-NEXT: srai a1, a1, 11
-; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: sub a1, a2, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sub a1, a1, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: sdiv64_pow2_negative_2048:
; RV32I-NEXT: neg a0, a3
; RV32I-NEXT: snez a2, a3
; RV32I-NEXT: srai a1, a1, 12
-; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: sub a1, a2, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sub a1, a1, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: sdiv64_pow2_negative_4096:
; RV32I-NEXT: neg a0, a3
; RV32I-NEXT: snez a2, a3
; RV32I-NEXT: srai a1, a1, 16
-; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: sub a1, a2, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sub a1, a1, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: sdiv64_pow2_negative_65536:
; RV32I-LABEL: sdiv64_pow2_8589934592:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: srli a2, a1, 31
-; RV32I-NEXT: srai a3, a1, 31
-; RV32I-NEXT: add a3, a0, a3
-; RV32I-NEXT: sltu a0, a3, a0
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a1, a0
+; RV32I-NEXT: add a2, a1, a2
+; RV32I-NEXT: srai a1, a1, 31
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
+; RV32I-NEXT: add a1, a2, a0
; RV32I-NEXT: srai a0, a1, 1
; RV32I-NEXT: srai a1, a1, 31
; RV32I-NEXT: ret
; RV32I-LABEL: sdiv64_pow2_negative_8589934592:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: srli a2, a1, 31
-; RV32I-NEXT: srai a3, a1, 31
-; RV32I-NEXT: add a3, a0, a3
-; RV32I-NEXT: sltu a0, a3, a0
+; RV32I-NEXT: add a2, a1, a2
+; RV32I-NEXT: srai a1, a1, 31
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srai a1, a0, 31
; RV32I-NEXT: srai a0, a0, 1
; RV32I-NEXT: snez a2, a0
-; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: sub a1, a2, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sub a1, a1, a2
; RV32I-NEXT: neg a0, a0
; RV32I-NEXT: ret
;
; RV32IM-NEXT: addi a3, a3, -820
; RV32IM-NEXT: mul a3, a5, a3
; RV32IM-NEXT: mulhu a6, a5, a4
+; RV32IM-NEXT: add a3, a6, a3
; RV32IM-NEXT: sltu a0, a0, a2
; RV32IM-NEXT: sub a1, a1, a0
-; RV32IM-NEXT: mul a0, a1, a4
-; RV32IM-NEXT: add a0, a3, a0
-; RV32IM-NEXT: add a1, a6, a0
+; RV32IM-NEXT: mul a1, a1, a4
+; RV32IM-NEXT: add a1, a3, a1
; RV32IM-NEXT: mul a0, a5, a4
; RV32IM-NEXT: ret
;
; RV32IF-NEXT: or a4, a1, a0
; RV32IF-NEXT: snez a4, a4
; RV32IF-NEXT: addi a4, a4, -1
+; RV32IF-NEXT: and a3, a4, a3
; RV32IF-NEXT: xori a0, a0, 1
; RV32IF-NEXT: or a0, a0, a1
; RV32IF-NEXT: seqz a0, a0
; RV32IF-NEXT: addi a1, a0, -1
; RV32IF-NEXT: and a0, a1, a3
-; RV32IF-NEXT: and a0, a0, a4
+; RV32IF-NEXT: and a2, a4, a2
; RV32IF-NEXT: and a1, a1, a2
-; RV32IF-NEXT: and a1, a1, a4
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 32
; RV32IF-NEXT: ret
; RV64-NEXT: call __fixunsdfti@plt
; RV64-NEXT: snez a2, a1
; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: and a0, a2, a0
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: seqz a1, a1
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: and a0, a1, a0
-; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
; RV32IFD-NEXT: or a4, a1, a0
; RV32IFD-NEXT: snez a4, a4
; RV32IFD-NEXT: addi a4, a4, -1
+; RV32IFD-NEXT: and a3, a4, a3
; RV32IFD-NEXT: xori a0, a0, 1
; RV32IFD-NEXT: or a0, a0, a1
; RV32IFD-NEXT: seqz a0, a0
; RV32IFD-NEXT: addi a1, a0, -1
; RV32IFD-NEXT: and a0, a1, a3
-; RV32IFD-NEXT: and a0, a0, a4
+; RV32IFD-NEXT: and a2, a4, a2
; RV32IFD-NEXT: and a1, a1, a2
-; RV32IFD-NEXT: and a1, a1, a4
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: addi sp, sp, 32
; RV32IFD-NEXT: ret
; RV32IF-NEXT: seqz a6, a1
; RV32IF-NEXT: .LBB47_7: # %entry
; RV32IF-NEXT: neg a6, a6
+; RV32IF-NEXT: and a3, a6, a3
; RV32IF-NEXT: xori a1, a1, 1
; RV32IF-NEXT: or a1, a1, a0
; RV32IF-NEXT: seqz a1, a1
; RV32IF-NEXT: addi a1, a1, -1
; RV32IF-NEXT: and a3, a1, a3
-; RV32IF-NEXT: and a3, a3, a6
+; RV32IF-NEXT: and a4, a6, a4
; RV32IF-NEXT: and a1, a1, a4
-; RV32IF-NEXT: and a1, a1, a6
; RV32IF-NEXT: neg a4, a5
; RV32IF-NEXT: and a4, a4, a0
; RV32IF-NEXT: mv a0, a3
; RV64-NEXT: .LBB47_2: # %entry
; RV64-NEXT: slti a3, a1, 1
; RV64-NEXT: neg a3, a3
+; RV64-NEXT: and a0, a3, a0
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: seqz a1, a1
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: and a0, a1, a0
-; RV64-NEXT: and a0, a0, a3
; RV64-NEXT: beqz a2, .LBB47_4
; RV64-NEXT: # %bb.3: # %entry
; RV64-NEXT: sgtz a1, a2
; RV32IFD-NEXT: seqz a6, a1
; RV32IFD-NEXT: .LBB47_7: # %entry
; RV32IFD-NEXT: neg a6, a6
+; RV32IFD-NEXT: and a3, a6, a3
; RV32IFD-NEXT: xori a1, a1, 1
; RV32IFD-NEXT: or a1, a1, a0
; RV32IFD-NEXT: seqz a1, a1
; RV32IFD-NEXT: addi a1, a1, -1
; RV32IFD-NEXT: and a3, a1, a3
-; RV32IFD-NEXT: and a3, a3, a6
+; RV32IFD-NEXT: and a4, a6, a4
; RV32IFD-NEXT: and a1, a1, a4
-; RV32IFD-NEXT: and a1, a1, a6
; RV32IFD-NEXT: neg a4, a5
; RV32IFD-NEXT: and a4, a4, a0
; RV32IFD-NEXT: mv a0, a3
; RV32-NEXT: or a4, a1, a0
; RV32-NEXT: snez a4, a4
; RV32-NEXT: addi a4, a4, -1
+; RV32-NEXT: and a3, a4, a3
; RV32-NEXT: xori a0, a0, 1
; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: seqz a0, a0
; RV32-NEXT: addi a1, a0, -1
; RV32-NEXT: and a0, a1, a3
-; RV32-NEXT: and a0, a0, a4
+; RV32-NEXT: and a2, a4, a2
; RV32-NEXT: and a1, a1, a2
-; RV32-NEXT: and a1, a1, a4
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
; RV64-NEXT: call __fixunssfti@plt
; RV64-NEXT: snez a2, a1
; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: and a0, a2, a0
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: seqz a1, a1
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: and a0, a1, a0
-; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
; RV32-NEXT: seqz a6, a1
; RV32-NEXT: .LBB50_7: # %entry
; RV32-NEXT: neg a6, a6
+; RV32-NEXT: and a3, a6, a3
; RV32-NEXT: xori a1, a1, 1
; RV32-NEXT: or a1, a1, a0
; RV32-NEXT: seqz a1, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a3, a1, a3
-; RV32-NEXT: and a3, a3, a6
+; RV32-NEXT: and a4, a6, a4
; RV32-NEXT: and a1, a1, a4
-; RV32-NEXT: and a1, a1, a6
; RV32-NEXT: neg a4, a5
; RV32-NEXT: and a4, a4, a0
; RV32-NEXT: mv a0, a3
; RV64-NEXT: .LBB50_2: # %entry
; RV64-NEXT: slti a3, a1, 1
; RV64-NEXT: neg a3, a3
+; RV64-NEXT: and a0, a3, a0
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: seqz a1, a1
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: and a0, a1, a0
-; RV64-NEXT: and a0, a0, a3
; RV64-NEXT: beqz a2, .LBB50_4
; RV64-NEXT: # %bb.3: # %entry
; RV64-NEXT: sgtz a1, a2
; RV32-NEXT: or a4, a1, a0
; RV32-NEXT: snez a4, a4
; RV32-NEXT: addi a4, a4, -1
+; RV32-NEXT: and a3, a4, a3
; RV32-NEXT: xori a0, a0, 1
; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: seqz a0, a0
; RV32-NEXT: addi a1, a0, -1
; RV32-NEXT: and a0, a1, a3
-; RV32-NEXT: and a0, a0, a4
+; RV32-NEXT: and a2, a4, a2
; RV32-NEXT: and a1, a1, a2
-; RV32-NEXT: and a1, a1, a4
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
; RV64-NEXT: call __fixunssfti@plt
; RV64-NEXT: snez a2, a1
; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: and a0, a2, a0
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: seqz a1, a1
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: and a0, a1, a0
-; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
; RV32-NEXT: seqz a6, a1
; RV32-NEXT: .LBB53_7: # %entry
; RV32-NEXT: neg a6, a6
+; RV32-NEXT: and a3, a6, a3
; RV32-NEXT: xori a1, a1, 1
; RV32-NEXT: or a1, a1, a0
; RV32-NEXT: seqz a1, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a3, a1, a3
-; RV32-NEXT: and a3, a3, a6
+; RV32-NEXT: and a4, a6, a4
; RV32-NEXT: and a1, a1, a4
-; RV32-NEXT: and a1, a1, a6
; RV32-NEXT: neg a4, a5
; RV32-NEXT: and a4, a4, a0
; RV32-NEXT: mv a0, a3
; RV64-NEXT: .LBB53_2: # %entry
; RV64-NEXT: slti a3, a1, 1
; RV64-NEXT: neg a3, a3
+; RV64-NEXT: and a0, a3, a0
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: seqz a1, a1
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: and a0, a1, a0
-; RV64-NEXT: and a0, a0, a3
; RV64-NEXT: beqz a2, .LBB53_4
; RV64-NEXT: # %bb.3: # %entry
; RV64-NEXT: sgtz a1, a2
; CHECK-NOV-NEXT: call __fixunsdfti@plt
; CHECK-NOV-NEXT: snez a2, a1
; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a0, a2, a0
; CHECK-NOV-NEXT: addi a1, a1, -1
; CHECK-NOV-NEXT: seqz a1, a1
; CHECK-NOV-NEXT: addi a1, a1, -1
; CHECK-NOV-NEXT: and a0, a1, a0
-; CHECK-NOV-NEXT: and a0, a0, a2
; CHECK-NOV-NEXT: snez a1, s1
; CHECK-NOV-NEXT: addi a1, a1, -1
+; CHECK-NOV-NEXT: and a1, a1, s0
; CHECK-NOV-NEXT: addi s1, s1, -1
; CHECK-NOV-NEXT: seqz a2, s1
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a2, a2, s0
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: call __fixunsdfti@plt
; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: addi a2, a2, -1
+; CHECK-V-NEXT: and a2, a2, s0
; CHECK-V-NEXT: addi s1, s1, -1
; CHECK-V-NEXT: seqz a3, s1
; CHECK-V-NEXT: addi a3, a3, -1
-; CHECK-V-NEXT: and a3, a3, s0
; CHECK-V-NEXT: and a2, a3, a2
; CHECK-V-NEXT: snez a3, a1
; CHECK-V-NEXT: addi a3, a3, -1
+; CHECK-V-NEXT: and a0, a3, a0
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: seqz a1, a1
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: and a0, a1, a0
-; CHECK-V-NEXT: and a0, a0, a3
; CHECK-V-NEXT: sd a0, 24(sp)
; CHECK-V-NEXT: sd a2, 32(sp)
; CHECK-V-NEXT: addi a0, sp, 24
; CHECK-NOV-NEXT: addi a1, a1, -1
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
+; CHECK-NOV-NEXT: and a0, a0, s0
; CHECK-NOV-NEXT: addi s1, s1, -1
; CHECK-NOV-NEXT: seqz a5, s1
; CHECK-NOV-NEXT: addi a5, a5, -1
-; CHECK-NOV-NEXT: and a5, a5, s0
; CHECK-NOV-NEXT: and a0, a5, a0
; CHECK-NOV-NEXT: beqz a4, .LBB47_6
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: call __fixunssfti@plt
; CHECK-NOV-NEXT: snez a2, a1
; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a0, a2, a0
; CHECK-NOV-NEXT: addi a1, a1, -1
; CHECK-NOV-NEXT: seqz a1, a1
; CHECK-NOV-NEXT: addi a1, a1, -1
; CHECK-NOV-NEXT: and a0, a1, a0
-; CHECK-NOV-NEXT: and a0, a0, a2
; CHECK-NOV-NEXT: snez a1, s1
; CHECK-NOV-NEXT: addi a1, a1, -1
+; CHECK-NOV-NEXT: and a1, a1, s0
; CHECK-NOV-NEXT: addi s1, s1, -1
; CHECK-NOV-NEXT: seqz a2, s1
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a2, a2, s0
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: call __fixunssfti@plt
; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: addi a2, a2, -1
+; CHECK-V-NEXT: and a2, a2, s0
; CHECK-V-NEXT: addi s1, s1, -1
; CHECK-V-NEXT: seqz a3, s1
; CHECK-V-NEXT: addi a3, a3, -1
-; CHECK-V-NEXT: and a3, a3, s0
; CHECK-V-NEXT: and a2, a3, a2
; CHECK-V-NEXT: snez a3, a1
; CHECK-V-NEXT: addi a3, a3, -1
+; CHECK-V-NEXT: and a0, a3, a0
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: seqz a1, a1
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: and a0, a1, a0
-; CHECK-V-NEXT: and a0, a0, a3
; CHECK-V-NEXT: sd a0, 24(sp)
; CHECK-V-NEXT: sd a2, 32(sp)
; CHECK-V-NEXT: addi a0, sp, 24
; CHECK-NOV-NEXT: addi a1, a1, -1
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
+; CHECK-NOV-NEXT: and a0, a0, s0
; CHECK-NOV-NEXT: addi s1, s1, -1
; CHECK-NOV-NEXT: seqz a5, s1
; CHECK-NOV-NEXT: addi a5, a5, -1
-; CHECK-NOV-NEXT: and a5, a5, s0
; CHECK-NOV-NEXT: and a0, a5, a0
; CHECK-NOV-NEXT: beqz a4, .LBB50_6
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: call __fixunssfti@plt
; CHECK-NOV-NEXT: snez a2, a1
; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a0, a2, a0
; CHECK-NOV-NEXT: addi a1, a1, -1
; CHECK-NOV-NEXT: seqz a1, a1
; CHECK-NOV-NEXT: addi a1, a1, -1
; CHECK-NOV-NEXT: and a0, a1, a0
-; CHECK-NOV-NEXT: and a0, a0, a2
; CHECK-NOV-NEXT: snez a1, s2
; CHECK-NOV-NEXT: addi a1, a1, -1
+; CHECK-NOV-NEXT: and a1, a1, s1
; CHECK-NOV-NEXT: addi s2, s2, -1
; CHECK-NOV-NEXT: seqz a2, s2
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a2, a2, s1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: call __fixunssfti@plt
; CHECK-V-NEXT: snez a2, a1
; CHECK-V-NEXT: addi a2, a2, -1
+; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: seqz a1, a1
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: and a0, a1, a0
-; CHECK-V-NEXT: and a0, a0, a2
; CHECK-V-NEXT: snez a1, s2
; CHECK-V-NEXT: addi a1, a1, -1
+; CHECK-V-NEXT: and a1, a1, s1
; CHECK-V-NEXT: addi s2, s2, -1
; CHECK-V-NEXT: seqz a2, s2
; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a2, a2, s1
; CHECK-V-NEXT: and a1, a2, a1
; CHECK-V-NEXT: sd a1, 8(sp)
; CHECK-V-NEXT: sd a0, 0(sp)
; CHECK-NOV-NEXT: addi a1, a1, -1
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
+; CHECK-NOV-NEXT: and a0, a0, s0
; CHECK-NOV-NEXT: addi s1, s1, -1
; CHECK-NOV-NEXT: seqz a5, s1
; CHECK-NOV-NEXT: addi a5, a5, -1
-; CHECK-NOV-NEXT: and a5, a5, s0
; CHECK-NOV-NEXT: and a0, a5, a0
; CHECK-NOV-NEXT: beqz a4, .LBB53_6
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: slti a0, s1, 1
; CHECK-V-NEXT: neg a0, a0
+; CHECK-V-NEXT: and a0, a0, s0
; CHECK-V-NEXT: addi s1, s1, -1
; CHECK-V-NEXT: seqz a5, s1
; CHECK-V-NEXT: addi a5, a5, -1
-; CHECK-V-NEXT: and a5, a5, s0
; CHECK-V-NEXT: and a0, a5, a0
; CHECK-V-NEXT: beqz a4, .LBB53_6
; CHECK-V-NEXT: # %bb.5: # %entry
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: snez a2, a0
; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: sub a1, a2, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sub a1, a1, a2
; RV32I-NEXT: .LBB6_2:
; RV32I-NEXT: ret
;
; RV32ZBB-NEXT: # %bb.1:
; RV32ZBB-NEXT: snez a2, a0
; RV32ZBB-NEXT: neg a0, a0
-; RV32ZBB-NEXT: neg a2, a2
-; RV32ZBB-NEXT: sub a1, a2, a1
+; RV32ZBB-NEXT: neg a1, a1
+; RV32ZBB-NEXT: sub a1, a1, a2
; RV32ZBB-NEXT: .LBB6_2:
; RV32ZBB-NEXT: ret
;
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: snez a2, a0
; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: sub a1, a2, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sub a1, a1, a2
; RV32I-NEXT: .LBB7_2:
; RV32I-NEXT: ret
;
; RV32ZBB-NEXT: # %bb.1:
; RV32ZBB-NEXT: snez a2, a0
; RV32ZBB-NEXT: neg a0, a0
-; RV32ZBB-NEXT: neg a2, a2
-; RV32ZBB-NEXT: sub a1, a2, a1
+; RV32ZBB-NEXT: neg a1, a1
+; RV32ZBB-NEXT: sub a1, a1, a2
; RV32ZBB-NEXT: .LBB7_2:
; RV32ZBB-NEXT: ret
;
define i128 @abs128(i128 %x) {
; RV32I-LABEL: abs128:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a4, 0(a1)
-; RV32I-NEXT: lw a3, 4(a1)
-; RV32I-NEXT: lw a2, 12(a1)
-; RV32I-NEXT: snez a5, a4
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a2, 4(a1)
+; RV32I-NEXT: lw a4, 12(a1)
+; RV32I-NEXT: snez a5, a3
; RV32I-NEXT: mv a6, a5
-; RV32I-NEXT: beqz a3, .LBB8_2
+; RV32I-NEXT: beqz a2, .LBB8_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: snez a6, a3
+; RV32I-NEXT: snez a6, a2
; RV32I-NEXT: .LBB8_2:
; RV32I-NEXT: lw a1, 8(a1)
-; RV32I-NEXT: bgez a2, .LBB8_4
+; RV32I-NEXT: bgez a4, .LBB8_4
; RV32I-NEXT: # %bb.3:
; RV32I-NEXT: neg a7, a1
; RV32I-NEXT: sltu t0, a7, a6
; RV32I-NEXT: snez a1, a1
+; RV32I-NEXT: add a1, a4, a1
; RV32I-NEXT: add a1, a1, t0
-; RV32I-NEXT: neg a1, a1
-; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: neg a4, a1
; RV32I-NEXT: sub a1, a7, a6
-; RV32I-NEXT: add a3, a3, a5
+; RV32I-NEXT: add a2, a2, a5
+; RV32I-NEXT: neg a2, a2
; RV32I-NEXT: neg a3, a3
-; RV32I-NEXT: neg a4, a4
; RV32I-NEXT: .LBB8_4:
-; RV32I-NEXT: sw a4, 0(a0)
+; RV32I-NEXT: sw a3, 0(a0)
; RV32I-NEXT: sw a1, 8(a0)
-; RV32I-NEXT: sw a3, 4(a0)
-; RV32I-NEXT: sw a2, 12(a0)
+; RV32I-NEXT: sw a2, 4(a0)
+; RV32I-NEXT: sw a4, 12(a0)
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: abs128:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a4, 0(a1)
-; RV32ZBB-NEXT: lw a3, 4(a1)
-; RV32ZBB-NEXT: lw a2, 12(a1)
-; RV32ZBB-NEXT: snez a5, a4
+; RV32ZBB-NEXT: lw a3, 0(a1)
+; RV32ZBB-NEXT: lw a2, 4(a1)
+; RV32ZBB-NEXT: lw a4, 12(a1)
+; RV32ZBB-NEXT: snez a5, a3
; RV32ZBB-NEXT: mv a6, a5
-; RV32ZBB-NEXT: beqz a3, .LBB8_2
+; RV32ZBB-NEXT: beqz a2, .LBB8_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: snez a6, a3
+; RV32ZBB-NEXT: snez a6, a2
; RV32ZBB-NEXT: .LBB8_2:
; RV32ZBB-NEXT: lw a1, 8(a1)
-; RV32ZBB-NEXT: bgez a2, .LBB8_4
+; RV32ZBB-NEXT: bgez a4, .LBB8_4
; RV32ZBB-NEXT: # %bb.3:
; RV32ZBB-NEXT: neg a7, a1
; RV32ZBB-NEXT: sltu t0, a7, a6
; RV32ZBB-NEXT: snez a1, a1
+; RV32ZBB-NEXT: add a1, a4, a1
; RV32ZBB-NEXT: add a1, a1, t0
-; RV32ZBB-NEXT: neg a1, a1
-; RV32ZBB-NEXT: sub a2, a1, a2
+; RV32ZBB-NEXT: neg a4, a1
; RV32ZBB-NEXT: sub a1, a7, a6
-; RV32ZBB-NEXT: add a3, a3, a5
+; RV32ZBB-NEXT: add a2, a2, a5
+; RV32ZBB-NEXT: neg a2, a2
; RV32ZBB-NEXT: neg a3, a3
-; RV32ZBB-NEXT: neg a4, a4
; RV32ZBB-NEXT: .LBB8_4:
-; RV32ZBB-NEXT: sw a4, 0(a0)
+; RV32ZBB-NEXT: sw a3, 0(a0)
; RV32ZBB-NEXT: sw a1, 8(a0)
-; RV32ZBB-NEXT: sw a3, 4(a0)
-; RV32ZBB-NEXT: sw a2, 12(a0)
+; RV32ZBB-NEXT: sw a2, 4(a0)
+; RV32ZBB-NEXT: sw a4, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64I-LABEL: abs128:
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: snez a2, a0
; RV64I-NEXT: neg a0, a0
-; RV64I-NEXT: neg a2, a2
-; RV64I-NEXT: sub a1, a2, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: sub a1, a1, a2
; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: ret
;
; RV64ZBB-NEXT: # %bb.1:
; RV64ZBB-NEXT: snez a2, a0
; RV64ZBB-NEXT: neg a0, a0
-; RV64ZBB-NEXT: neg a2, a2
-; RV64ZBB-NEXT: sub a1, a2, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: sub a1, a1, a2
; RV64ZBB-NEXT: .LBB8_2:
; RV64ZBB-NEXT: ret
%abs = tail call i128 @llvm.abs.i128(i128 %x, i1 true)
define i128 @select_abs128(i128 %x) {
; RV32I-LABEL: select_abs128:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a4, 0(a1)
-; RV32I-NEXT: lw a3, 4(a1)
-; RV32I-NEXT: lw a2, 12(a1)
-; RV32I-NEXT: snez a5, a4
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a2, 4(a1)
+; RV32I-NEXT: lw a4, 12(a1)
+; RV32I-NEXT: snez a5, a3
; RV32I-NEXT: mv a6, a5
-; RV32I-NEXT: beqz a3, .LBB9_2
+; RV32I-NEXT: beqz a2, .LBB9_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: snez a6, a3
+; RV32I-NEXT: snez a6, a2
; RV32I-NEXT: .LBB9_2:
; RV32I-NEXT: lw a1, 8(a1)
-; RV32I-NEXT: bgez a2, .LBB9_4
+; RV32I-NEXT: bgez a4, .LBB9_4
; RV32I-NEXT: # %bb.3:
; RV32I-NEXT: neg a7, a1
; RV32I-NEXT: sltu t0, a7, a6
; RV32I-NEXT: snez a1, a1
+; RV32I-NEXT: add a1, a4, a1
; RV32I-NEXT: add a1, a1, t0
-; RV32I-NEXT: neg a1, a1
-; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: neg a4, a1
; RV32I-NEXT: sub a1, a7, a6
-; RV32I-NEXT: add a3, a3, a5
+; RV32I-NEXT: add a2, a2, a5
+; RV32I-NEXT: neg a2, a2
; RV32I-NEXT: neg a3, a3
-; RV32I-NEXT: neg a4, a4
; RV32I-NEXT: .LBB9_4:
-; RV32I-NEXT: sw a4, 0(a0)
+; RV32I-NEXT: sw a3, 0(a0)
; RV32I-NEXT: sw a1, 8(a0)
-; RV32I-NEXT: sw a3, 4(a0)
-; RV32I-NEXT: sw a2, 12(a0)
+; RV32I-NEXT: sw a2, 4(a0)
+; RV32I-NEXT: sw a4, 12(a0)
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: select_abs128:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a4, 0(a1)
-; RV32ZBB-NEXT: lw a3, 4(a1)
-; RV32ZBB-NEXT: lw a2, 12(a1)
-; RV32ZBB-NEXT: snez a5, a4
+; RV32ZBB-NEXT: lw a3, 0(a1)
+; RV32ZBB-NEXT: lw a2, 4(a1)
+; RV32ZBB-NEXT: lw a4, 12(a1)
+; RV32ZBB-NEXT: snez a5, a3
; RV32ZBB-NEXT: mv a6, a5
-; RV32ZBB-NEXT: beqz a3, .LBB9_2
+; RV32ZBB-NEXT: beqz a2, .LBB9_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: snez a6, a3
+; RV32ZBB-NEXT: snez a6, a2
; RV32ZBB-NEXT: .LBB9_2:
; RV32ZBB-NEXT: lw a1, 8(a1)
-; RV32ZBB-NEXT: bgez a2, .LBB9_4
+; RV32ZBB-NEXT: bgez a4, .LBB9_4
; RV32ZBB-NEXT: # %bb.3:
; RV32ZBB-NEXT: neg a7, a1
; RV32ZBB-NEXT: sltu t0, a7, a6
; RV32ZBB-NEXT: snez a1, a1
+; RV32ZBB-NEXT: add a1, a4, a1
; RV32ZBB-NEXT: add a1, a1, t0
-; RV32ZBB-NEXT: neg a1, a1
-; RV32ZBB-NEXT: sub a2, a1, a2
+; RV32ZBB-NEXT: neg a4, a1
; RV32ZBB-NEXT: sub a1, a7, a6
-; RV32ZBB-NEXT: add a3, a3, a5
+; RV32ZBB-NEXT: add a2, a2, a5
+; RV32ZBB-NEXT: neg a2, a2
; RV32ZBB-NEXT: neg a3, a3
-; RV32ZBB-NEXT: neg a4, a4
; RV32ZBB-NEXT: .LBB9_4:
-; RV32ZBB-NEXT: sw a4, 0(a0)
+; RV32ZBB-NEXT: sw a3, 0(a0)
; RV32ZBB-NEXT: sw a1, 8(a0)
-; RV32ZBB-NEXT: sw a3, 4(a0)
-; RV32ZBB-NEXT: sw a2, 12(a0)
+; RV32ZBB-NEXT: sw a2, 4(a0)
+; RV32ZBB-NEXT: sw a4, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64I-LABEL: select_abs128:
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: snez a2, a0
; RV64I-NEXT: neg a0, a0
-; RV64I-NEXT: neg a2, a2
-; RV64I-NEXT: sub a1, a2, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: sub a1, a1, a2
; RV64I-NEXT: .LBB9_2:
; RV64I-NEXT: ret
;
; RV64ZBB-NEXT: # %bb.1:
; RV64ZBB-NEXT: snez a2, a0
; RV64ZBB-NEXT: neg a0, a0
-; RV64ZBB-NEXT: neg a2, a2
-; RV64ZBB-NEXT: sub a1, a2, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: sub a1, a1, a2
; RV64ZBB-NEXT: .LBB9_2:
; RV64ZBB-NEXT: ret
%1 = icmp slt i128 %x, 0
; RV32IM: # %bb.0:
; RV32IM-NEXT: mul a3, a0, a3
; RV32IM-NEXT: mulhu a4, a0, a2
+; RV32IM-NEXT: add a3, a4, a3
; RV32IM-NEXT: mul a1, a1, a2
; RV32IM-NEXT: add a1, a3, a1
-; RV32IM-NEXT: add a1, a4, a1
; RV32IM-NEXT: mul a0, a0, a2
; RV32IM-NEXT: ret
;
; RV32I-NEXT: srli a0, a0, 30
; RV32I-NEXT: slli a4, a1, 2
; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: add a1, a0, a3
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
;
; RV32IM-NEXT: li a2, 5
; RV32IM-NEXT: mulhu a2, a0, a2
; RV32IM-NEXT: slli a3, a1, 2
+; RV32IM-NEXT: add a1, a3, a1
; RV32IM-NEXT: add a1, a2, a1
-; RV32IM-NEXT: add a1, a1, a3
; RV32IM-NEXT: slli a2, a0, 2
; RV32IM-NEXT: add a0, a2, a0
; RV32IM-NEXT: ret
; RV32I-NEXT: srli a0, a0, 30
; RV32I-NEXT: slli a3, a1, 2
; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: add a1, a1, a2
; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: ret
;
; RV32IM-LABEL: mulhs_positive_constant:
; RV32I-NEXT: srli a0, a0, 30
; RV32I-NEXT: slli a4, a1, 2
; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: add a1, a1, a2
-; RV32I-NEXT: snez a2, a3
-; RV32I-NEXT: add a1, a1, a2
-; RV32I-NEXT: neg a1, a1
-; RV32I-NEXT: sub a0, a1, a0
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: snez a1, a3
+; RV32I-NEXT: add a1, a2, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: neg a0, a0
; RV32I-NEXT: ret
;
; RV32IM-LABEL: mulhs_negative_constant:
; RV32I-NEXT: srli a0, a0, 26
; RV32I-NEXT: slli a4, a1, 6
; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: add a1, a0, a3
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
;
; RV32IM-NEXT: li a2, 65
; RV32IM-NEXT: mulhu a2, a0, a2
; RV32IM-NEXT: slli a3, a1, 6
+; RV32IM-NEXT: add a1, a3, a1
; RV32IM-NEXT: add a1, a2, a1
-; RV32IM-NEXT: add a1, a1, a3
; RV32IM-NEXT: slli a2, a0, 6
; RV32IM-NEXT: add a0, a2, a0
; RV32IM-NEXT: ret
; RV32I-NEXT: srli a4, a0, 26
; RV32I-NEXT: slli a5, a1, 6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: add a1, a1, a3
; RV32I-NEXT: sub a1, a4, a1
+; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: sub a0, a2, a0
; RV32I-NEXT: ret
;
; RV32IM-NEXT: li a2, 63
; RV32IM-NEXT: mulhu a2, a0, a2
; RV32IM-NEXT: slli a3, a1, 6
-; RV32IM-NEXT: sub a1, a2, a1
-; RV32IM-NEXT: add a1, a1, a3
+; RV32IM-NEXT: sub a1, a3, a1
+; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: slli a2, a0, 6
; RV32IM-NEXT: sub a0, a2, a0
; RV32IM-NEXT: ret
; RV32I-NEXT: srli a4, a0, 26
; RV32I-NEXT: slli a5, a1, 6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: add a3, a4, a3
+; RV32I-NEXT: sub a1, a1, a4
; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: sub a0, a0, a2
; RV32I-NEXT: ret
; RV32IM-NEXT: sub a1, a1, a2
; RV32IM-NEXT: li a2, -63
; RV32IM-NEXT: mulhu a2, a0, a2
-; RV32IM-NEXT: sub a1, a0, a1
-; RV32IM-NEXT: sub a1, a2, a1
+; RV32IM-NEXT: sub a2, a2, a0
+; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: slli a2, a0, 6
; RV32IM-NEXT: sub a0, a0, a2
; RV32IM-NEXT: ret
; RV32I-NEXT: srli a0, a0, 26
; RV32I-NEXT: slli a4, a1, 6
; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: add a1, a1, a2
-; RV32I-NEXT: snez a2, a3
-; RV32I-NEXT: add a1, a1, a2
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: add a0, a0, a2
+; RV32I-NEXT: snez a1, a3
; RV32I-NEXT: neg a1, a1
; RV32I-NEXT: sub a1, a1, a0
; RV32I-NEXT: neg a0, a3
; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: li a2, -65
; RV32IM-NEXT: mulhu a2, a0, a2
-; RV32IM-NEXT: add a1, a0, a1
+; RV32IM-NEXT: sub a2, a2, a0
; RV32IM-NEXT: sub a1, a2, a1
; RV32IM-NEXT: slli a2, a0, 6
; RV32IM-NEXT: neg a0, a0
; RV32I-NEXT: srli a3, a0, 20
; RV32I-NEXT: slli a1, a1, 12
; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: slli a4, a0, 12
-; RV32I-NEXT: add a0, a4, a3
-; RV32I-NEXT: sltu a3, a0, a4
-; RV32I-NEXT: add a2, a2, a3
+; RV32I-NEXT: add a1, a1, a2
+; RV32I-NEXT: slli a2, a0, 8
+; RV32I-NEXT: slli a3, a0, 12
+; RV32I-NEXT: add a0, a3, a2
+; RV32I-NEXT: sltu a2, a0, a3
; RV32I-NEXT: add a1, a1, a2
; RV32I-NEXT: ret
;
; RV32I-NEXT: srli a3, a0, 20
; RV32I-NEXT: slli a1, a1, 12
; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: slli a0, a0, 12
-; RV32I-NEXT: sltu a4, a0, a3
-; RV32I-NEXT: add a2, a2, a4
; RV32I-NEXT: sub a1, a1, a2
-; RV32I-NEXT: sub a0, a0, a3
+; RV32I-NEXT: slli a2, a0, 8
+; RV32I-NEXT: slli a0, a0, 12
+; RV32I-NEXT: sltu a3, a0, a2
+; RV32I-NEXT: sub a1, a1, a3
+; RV32I-NEXT: sub a0, a0, a2
; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli64_p3840:
; RV32IM-NEXT: slli a2, a2, 8
; RV32IM-NEXT: mul a1, a1, a2
; RV32IM-NEXT: mulhu a3, a0, a2
-; RV32IM-NEXT: sub a1, a0, a1
-; RV32IM-NEXT: sub a1, a3, a1
+; RV32IM-NEXT: sub a3, a3, a0
+; RV32IM-NEXT: add a1, a3, a1
; RV32IM-NEXT: mul a0, a0, a2
; RV32IM-NEXT: ret
;
; RV32I-NEXT: srli a3, a0, 24
; RV32I-NEXT: slli a1, a1, 8
; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a3, a0, 12
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: sltu a4, a0, a3
-; RV32I-NEXT: add a2, a2, a4
; RV32I-NEXT: sub a1, a1, a2
-; RV32I-NEXT: sub a0, a0, a3
+; RV32I-NEXT: slli a2, a0, 12
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: sltu a3, a0, a2
+; RV32I-NEXT: sub a1, a1, a3
+; RV32I-NEXT: sub a0, a0, a2
; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli64_m3840:
; RV32IM-NEXT: slli a2, a2, 8
; RV32IM-NEXT: mul a1, a1, a2
; RV32IM-NEXT: mulhu a3, a0, a2
-; RV32IM-NEXT: sub a1, a0, a1
-; RV32IM-NEXT: sub a1, a3, a1
+; RV32IM-NEXT: sub a3, a3, a0
+; RV32IM-NEXT: add a1, a3, a1
; RV32IM-NEXT: mul a0, a0, a2
; RV32IM-NEXT: ret
;
; RV32I-NEXT: srli a2, a4, 24
; RV32I-NEXT: slli a7, a3, 8
; RV32I-NEXT: or a2, a7, a2
-; RV32I-NEXT: sltu a7, a2, a1
-; RV32I-NEXT: srli t0, a3, 20
+; RV32I-NEXT: sltu t0, a2, a1
+; RV32I-NEXT: srli a7, a3, 20
; RV32I-NEXT: slli t1, a5, 12
-; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: or a7, t1, a7
; RV32I-NEXT: srli a3, a3, 24
; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or t1, a5, a3
-; RV32I-NEXT: add t0, t0, a7
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: sub t1, a3, a7
; RV32I-NEXT: srli a3, a6, 20
; RV32I-NEXT: slli a5, a4, 12
; RV32I-NEXT: or a3, a5, a3
; RV32IM-NEXT: sltu t4, t4, s1
; RV32IM-NEXT: sltu a7, t1, a7
; RV32IM-NEXT: mulhu t1, a1, t2
+; RV32IM-NEXT: add a7, t1, a7
; RV32IM-NEXT: add a7, a7, t4
; RV32IM-NEXT: sltu t0, t5, t0
; RV32IM-NEXT: mul a2, a2, a5
-; RV32IM-NEXT: mulhu t2, a3, a5
-; RV32IM-NEXT: sub a3, a3, a2
-; RV32IM-NEXT: sub a2, t2, a3
+; RV32IM-NEXT: mulhu t1, a3, a5
+; RV32IM-NEXT: sub a3, t1, a3
+; RV32IM-NEXT: add a2, a3, a2
; RV32IM-NEXT: add a1, a4, a1
-; RV32IM-NEXT: sub a1, a1, a2
-; RV32IM-NEXT: sub a1, a1, t0
; RV32IM-NEXT: sub a1, t3, a1
+; RV32IM-NEXT: add a1, a1, a2
+; RV32IM-NEXT: add a1, a1, t0
; RV32IM-NEXT: add a1, a7, a1
; RV32IM-NEXT: add a1, a1, s0
-; RV32IM-NEXT: add a1, t1, a1
; RV32IM-NEXT: mul a2, a4, a5
; RV32IM-NEXT: sw a2, 0(a0)
; RV32IM-NEXT: sw a6, 4(a0)
; RV64I-NEXT: srli a3, a0, 56
; RV64I-NEXT: slli a1, a1, 8
; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: slli a3, a0, 12
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: sltu a4, a0, a3
-; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: sub a1, a1, a2
-; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: slli a2, a0, 12
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: sltu a3, a0, a2
+; RV64I-NEXT: sub a1, a1, a3
+; RV64I-NEXT: sub a0, a0, a2
; RV64I-NEXT: ret
;
; RV64IM-LABEL: muli128_m3840:
; RV64IM-NEXT: slli a2, a2, 8
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: mulhu a3, a0, a2
-; RV64IM-NEXT: sub a1, a0, a1
-; RV64IM-NEXT: sub a1, a3, a1
+; RV64IM-NEXT: sub a3, a3, a0
+; RV64IM-NEXT: add a1, a3, a1
; RV64IM-NEXT: mul a0, a0, a2
; RV64IM-NEXT: ret
%1 = mul i128 %a, -3840
; RV32I-NEXT: srli a7, a7, 26
; RV32I-NEXT: slli t4, a5, 6
; RV32I-NEXT: or a7, t4, a7
-; RV32I-NEXT: add a7, a7, t1
-; RV32I-NEXT: add a7, a7, t3
; RV32I-NEXT: sub a5, a5, a7
+; RV32I-NEXT: sub a5, a5, t1
+; RV32I-NEXT: sub a5, a5, t3
; RV32I-NEXT: sub a7, t2, t0
; RV32I-NEXT: sub a3, a3, a6
; RV32I-NEXT: sub a3, a3, a4
; RV32IM-NEXT: sltu t4, t4, s1
; RV32IM-NEXT: sltu a7, t1, a7
; RV32IM-NEXT: mulhu t1, a4, t2
+; RV32IM-NEXT: add a7, t1, a7
; RV32IM-NEXT: add a7, a7, t4
-; RV32IM-NEXT: slli t2, a2, 6
-; RV32IM-NEXT: sub a2, a2, t2
+; RV32IM-NEXT: slli t1, a2, 6
+; RV32IM-NEXT: sub a2, a2, t1
; RV32IM-NEXT: mulhu a5, a1, a5
-; RV32IM-NEXT: sub a1, a1, a2
; RV32IM-NEXT: sub a5, a5, a1
+; RV32IM-NEXT: add a2, a5, a2
; RV32IM-NEXT: add a4, a3, a4
-; RV32IM-NEXT: sub a4, a4, a5
-; RV32IM-NEXT: neg a1, t5
-; RV32IM-NEXT: sltu a1, a1, t0
-; RV32IM-NEXT: sub a4, a4, a1
; RV32IM-NEXT: sub a1, t3, a4
+; RV32IM-NEXT: add a1, a1, a2
+; RV32IM-NEXT: neg a2, t5
+; RV32IM-NEXT: sltu a2, a2, t0
+; RV32IM-NEXT: add a1, a1, a2
; RV32IM-NEXT: add a1, a7, a1
; RV32IM-NEXT: add a1, a1, s0
-; RV32IM-NEXT: add a1, t1, a1
; RV32IM-NEXT: slli a2, a3, 6
; RV32IM-NEXT: sub a3, a3, a2
; RV32IM-NEXT: sw a3, 0(a0)
; RV64I-NEXT: srli a4, a0, 58
; RV64I-NEXT: slli a5, a1, 6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: add a3, a4, a3
+; RV64I-NEXT: sub a1, a1, a4
; RV64I-NEXT: sub a1, a1, a3
; RV64I-NEXT: sub a0, a0, a2
; RV64I-NEXT: ret
; RV64IM-NEXT: sub a1, a1, a2
; RV64IM-NEXT: li a2, -63
; RV64IM-NEXT: mulhu a2, a0, a2
-; RV64IM-NEXT: sub a1, a0, a1
-; RV64IM-NEXT: sub a1, a2, a1
+; RV64IM-NEXT: sub a2, a2, a0
+; RV64IM-NEXT: add a1, a2, a1
; RV64IM-NEXT: slli a2, a0, 6
; RV64IM-NEXT: sub a0, a0, a2
; RV64IM-NEXT: ret
; RV32I-NEXT: sltu a3, a2, s9
; RV32I-NEXT: sltu a4, s9, s5
; RV32I-NEXT: sltu a5, s8, s7
+; RV32I-NEXT: add a5, s6, a5
; RV32I-NEXT: add a4, a5, a4
+; RV32I-NEXT: add a1, a1, s3
; RV32I-NEXT: sltu a0, s2, a0
-; RV32I-NEXT: add a0, s3, a0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: add a0, a4, a0
-; RV32I-NEXT: add a0, a0, a3
-; RV32I-NEXT: add a1, s6, a0
+; RV32I-NEXT: add a1, a0, a3
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
; RV32IM-NEXT: sltu a7, t0, a7
; RV32IM-NEXT: sltu a5, a5, a6
; RV32IM-NEXT: mulhu a3, a1, a3
-; RV32IM-NEXT: add a5, a5, a7
+; RV32IM-NEXT: add a3, a3, a5
+; RV32IM-NEXT: add a3, a3, a7
; RV32IM-NEXT: mul a1, a4, a1
; RV32IM-NEXT: mulhu a0, a4, a0
-; RV32IM-NEXT: add a1, a1, t1
; RV32IM-NEXT: add a0, a0, a1
-; RV32IM-NEXT: add a0, a5, a0
-; RV32IM-NEXT: add a0, a0, t2
-; RV32IM-NEXT: add a1, a3, a0
+; RV32IM-NEXT: add a0, a0, t1
+; RV32IM-NEXT: add a0, a3, a0
+; RV32IM-NEXT: add a1, a0, t2
; RV32IM-NEXT: mv a0, a2
; RV32IM-NEXT: ret
;
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sltu a3, a2, a0
; RV32I-NEXT: xor a1, a1, a2
-; RV32I-NEXT: add a1, a1, a3
; RV32I-NEXT: sub a1, a2, a1
+; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: sub a0, a2, a0
; RV32I-NEXT: ret
;
; RV32ZBB-NEXT: xor a0, a0, a2
; RV32ZBB-NEXT: sltu a3, a2, a0
; RV32ZBB-NEXT: xor a1, a1, a2
-; RV32ZBB-NEXT: add a1, a1, a3
; RV32ZBB-NEXT: sub a1, a2, a1
+; RV32ZBB-NEXT: sub a1, a1, a3
; RV32ZBB-NEXT: sub a0, a2, a0
; RV32ZBB-NEXT: ret
;
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sltu a3, a2, a0
; RV32I-NEXT: xor a1, a1, a2
-; RV32I-NEXT: add a1, a1, a3
; RV32I-NEXT: sub a1, a2, a1
+; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: sub a0, a2, a0
; RV32I-NEXT: ret
;
; RV32ZBB-NEXT: xor a0, a0, a2
; RV32ZBB-NEXT: sltu a3, a2, a0
; RV32ZBB-NEXT: xor a1, a1, a2
-; RV32ZBB-NEXT: add a1, a1, a3
; RV32ZBB-NEXT: sub a1, a2, a1
+; RV32ZBB-NEXT: sub a1, a1, a3
; RV32ZBB-NEXT: sub a0, a2, a0
; RV32ZBB-NEXT: ret
;
; RV32I-NEXT: bgez a1, .LBB5_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: snez a3, a0
-; RV32I-NEXT: neg a3, a3
-; RV32I-NEXT: sub a1, a3, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: neg a0, a0
; RV32I-NEXT: .LBB5_2:
; RV32I-NEXT: sw a0, 0(a2)
; RV32I-NEXT: snez a3, a0
-; RV32I-NEXT: neg a3, a3
-; RV32I-NEXT: sub a3, a3, a1
+; RV32I-NEXT: neg a4, a1
+; RV32I-NEXT: sub a3, a4, a3
; RV32I-NEXT: neg a0, a0
; RV32I-NEXT: sw a1, 4(a2)
; RV32I-NEXT: mv a1, a3
; RV32ZBB-NEXT: bgez a1, .LBB5_2
; RV32ZBB-NEXT: # %bb.1:
; RV32ZBB-NEXT: snez a3, a0
-; RV32ZBB-NEXT: neg a3, a3
-; RV32ZBB-NEXT: sub a1, a3, a1
+; RV32ZBB-NEXT: neg a1, a1
+; RV32ZBB-NEXT: sub a1, a1, a3
; RV32ZBB-NEXT: neg a0, a0
; RV32ZBB-NEXT: .LBB5_2:
; RV32ZBB-NEXT: sw a0, 0(a2)
; RV32ZBB-NEXT: snez a3, a0
-; RV32ZBB-NEXT: neg a3, a3
-; RV32ZBB-NEXT: sub a3, a3, a1
+; RV32ZBB-NEXT: neg a4, a1
+; RV32ZBB-NEXT: sub a3, a4, a3
; RV32ZBB-NEXT: neg a0, a0
; RV32ZBB-NEXT: sw a1, 4(a2)
; RV32ZBB-NEXT: mv a1, a3
; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: sltu a1, a0, a1
+; RV32I-NEXT: add a3, a5, a3
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: add a1, a5, a1
; RV32I-NEXT: ret
;
; RV64I-LABEL: rotl_64_mask_shared:
; RV32ZBB-NEXT: and a0, a0, a2
; RV32ZBB-NEXT: add a0, a1, a0
; RV32ZBB-NEXT: sltu a1, a0, a1
+; RV32ZBB-NEXT: add a3, a5, a3
; RV32ZBB-NEXT: add a1, a3, a1
-; RV32ZBB-NEXT: add a1, a5, a1
; RV32ZBB-NEXT: ret
;
; RV64ZBB-LABEL: rotl_64_mask_shared:
; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: add a0, a6, a0
; RV32I-NEXT: sltu a2, a0, a6
-; RV32I-NEXT: add a2, a3, a2
+; RV32I-NEXT: add a1, a1, a3
; RV32I-NEXT: add a1, a1, a2
; RV32I-NEXT: ret
;
; RV32ZBB-NEXT: and a0, a0, a2
; RV32ZBB-NEXT: add a0, a6, a0
; RV32ZBB-NEXT: sltu a2, a0, a6
-; RV32ZBB-NEXT: add a2, a3, a2
+; RV32ZBB-NEXT: add a1, a1, a3
; RV32ZBB-NEXT: add a1, a1, a2
; RV32ZBB-NEXT: ret
;
; RV32I-NEXT: srl t0, t0, a1
; RV32I-NEXT: sll t1, a0, a4
; RV32I-NEXT: srli a0, a6, 1
-; RV32I-NEXT: srl a6, a0, a1
+; RV32I-NEXT: srl t2, a0, a1
; RV32I-NEXT: mv a0, a3
; RV32I-NEXT: bnez a5, .LBB21_6
; RV32I-NEXT: # %bb.5:
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: .LBB21_6:
-; RV32I-NEXT: or a7, a7, t0
-; RV32I-NEXT: or a6, t1, a6
+; RV32I-NEXT: or a6, a7, t0
+; RV32I-NEXT: or a7, t1, t2
; RV32I-NEXT: sll t0, a0, a4
; RV32I-NEXT: bnez a5, .LBB21_8
; RV32I-NEXT: # %bb.7:
; RV32I-NEXT: sll a2, a2, a4
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: srl a0, a0, a1
-; RV32I-NEXT: or a2, a2, a0
-; RV32I-NEXT: add a0, a7, a3
-; RV32I-NEXT: sltu a1, a0, a7
-; RV32I-NEXT: add a1, a2, a1
-; RV32I-NEXT: add a1, a6, a1
+; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: add a1, a7, a0
+; RV32I-NEXT: add a0, a6, a3
+; RV32I-NEXT: sltu a2, a0, a6
+; RV32I-NEXT: add a1, a1, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: rotl_64_mask_multiple:
; RV32ZBB-NEXT: srl t0, t0, a1
; RV32ZBB-NEXT: sll t1, a0, a4
; RV32ZBB-NEXT: srli a0, a6, 1
-; RV32ZBB-NEXT: srl a6, a0, a1
+; RV32ZBB-NEXT: srl t2, a0, a1
; RV32ZBB-NEXT: mv a0, a3
; RV32ZBB-NEXT: bnez a5, .LBB21_6
; RV32ZBB-NEXT: # %bb.5:
; RV32ZBB-NEXT: mv a0, a2
; RV32ZBB-NEXT: .LBB21_6:
-; RV32ZBB-NEXT: or a7, a7, t0
-; RV32ZBB-NEXT: or a6, t1, a6
+; RV32ZBB-NEXT: or a6, a7, t0
+; RV32ZBB-NEXT: or a7, t1, t2
; RV32ZBB-NEXT: sll t0, a0, a4
; RV32ZBB-NEXT: bnez a5, .LBB21_8
; RV32ZBB-NEXT: # %bb.7:
; RV32ZBB-NEXT: sll a2, a2, a4
; RV32ZBB-NEXT: srli a0, a0, 1
; RV32ZBB-NEXT: srl a0, a0, a1
-; RV32ZBB-NEXT: or a2, a2, a0
-; RV32ZBB-NEXT: add a0, a7, a3
-; RV32ZBB-NEXT: sltu a1, a0, a7
-; RV32ZBB-NEXT: add a1, a2, a1
-; RV32ZBB-NEXT: add a1, a6, a1
+; RV32ZBB-NEXT: or a0, a2, a0
+; RV32ZBB-NEXT: add a1, a7, a0
+; RV32ZBB-NEXT: add a0, a6, a3
+; RV32ZBB-NEXT: sltu a2, a0, a6
+; RV32ZBB-NEXT: add a1, a1, a2
; RV32ZBB-NEXT: ret
;
; RV64ZBB-LABEL: rotl_64_mask_multiple:
; RV32I-NEXT: slli t0, a1, 1
; RV32I-NEXT: not a0, a4
; RV32I-NEXT: sll t0, t0, a0
-; RV32I-NEXT: srl a1, a1, a4
+; RV32I-NEXT: srl t1, a1, a4
; RV32I-NEXT: slli a6, a6, 1
-; RV32I-NEXT: sll t1, a6, a0
+; RV32I-NEXT: sll t2, a6, a0
; RV32I-NEXT: mv a6, a2
; RV32I-NEXT: beqz a5, .LBB23_6
; RV32I-NEXT: # %bb.5:
; RV32I-NEXT: mv a6, a3
; RV32I-NEXT: .LBB23_6:
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: or a1, t0, a7
+; RV32I-NEXT: or a7, t2, t1
; RV32I-NEXT: srl t0, a6, a4
; RV32I-NEXT: beqz a5, .LBB23_8
; RV32I-NEXT: # %bb.7:
; RV32I-NEXT: srl a3, a3, a4
; RV32I-NEXT: slli a6, a6, 1
; RV32I-NEXT: sll a0, a6, a0
-; RV32I-NEXT: or a3, a0, a3
-; RV32I-NEXT: add a0, a7, a2
-; RV32I-NEXT: sltu a2, a0, a7
-; RV32I-NEXT: add a2, a3, a2
-; RV32I-NEXT: add a1, a1, a2
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: add a7, a7, a0
+; RV32I-NEXT: add a0, a1, a2
+; RV32I-NEXT: sltu a1, a0, a1
+; RV32I-NEXT: add a1, a7, a1
; RV32I-NEXT: ret
;
; RV64I-LABEL: rotr_64_mask_multiple:
; RV32ZBB-NEXT: slli t0, a1, 1
; RV32ZBB-NEXT: not a0, a4
; RV32ZBB-NEXT: sll t0, t0, a0
-; RV32ZBB-NEXT: srl a1, a1, a4
+; RV32ZBB-NEXT: srl t1, a1, a4
; RV32ZBB-NEXT: slli a6, a6, 1
-; RV32ZBB-NEXT: sll t1, a6, a0
+; RV32ZBB-NEXT: sll t2, a6, a0
; RV32ZBB-NEXT: mv a6, a2
; RV32ZBB-NEXT: beqz a5, .LBB23_6
; RV32ZBB-NEXT: # %bb.5:
; RV32ZBB-NEXT: mv a6, a3
; RV32ZBB-NEXT: .LBB23_6:
-; RV32ZBB-NEXT: or a7, t0, a7
-; RV32ZBB-NEXT: or a1, t1, a1
+; RV32ZBB-NEXT: or a1, t0, a7
+; RV32ZBB-NEXT: or a7, t2, t1
; RV32ZBB-NEXT: srl t0, a6, a4
; RV32ZBB-NEXT: beqz a5, .LBB23_8
; RV32ZBB-NEXT: # %bb.7:
; RV32ZBB-NEXT: srl a3, a3, a4
; RV32ZBB-NEXT: slli a6, a6, 1
; RV32ZBB-NEXT: sll a0, a6, a0
-; RV32ZBB-NEXT: or a3, a0, a3
-; RV32ZBB-NEXT: add a0, a7, a2
-; RV32ZBB-NEXT: sltu a2, a0, a7
-; RV32ZBB-NEXT: add a2, a3, a2
-; RV32ZBB-NEXT: add a1, a1, a2
+; RV32ZBB-NEXT: or a0, a0, a3
+; RV32ZBB-NEXT: add a7, a7, a0
+; RV32ZBB-NEXT: add a0, a1, a2
+; RV32ZBB-NEXT: sltu a1, a0, a1
+; RV32ZBB-NEXT: add a1, a7, a1
; RV32ZBB-NEXT: ret
;
; RV64ZBB-LABEL: rotr_64_mask_multiple:
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: snez a2, a0
; CHECK-NEXT: neg a0, a0
-; CHECK-NEXT: neg a2, a2
-; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: sub a1, a1, a2
; CHECK-NEXT: .LBB19_2:
; CHECK-NEXT: ret
%abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
; RV32I-NEXT: and a2, a0, a2
; RV32I-NEXT: slli a2, a2, 8
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: ret
;
; RV32I-NEXT: and a4, a1, a3
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a2, a4, a2
+; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: or a2, a1, a2
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a3, a0, a3
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a1, a0, a1
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
; CHECK-NEXT: slli a2, a2, 32
; CHECK-NEXT: mulhu a1, a2, a1
; CHECK-NEXT: srli a1, a1, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: addw a0, a3, a0
+; CHECK-NEXT: add a0, a3, a0
+; CHECK-NEXT: addw a0, a0, a1
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: li a0, 0
; CHECK-NEXT: slli a3, a3, 32
; CHECK-NEXT: mulhu a1, a3, a1
; CHECK-NEXT: srli a1, a1, 1
-; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: subw a0, a2, a0
+; CHECK-NEXT: subw a0, a0, a1
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: li a0, 0
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 8
; RV64I-NEXT: slliw a0, a0, 24
-; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
; RV64I-NEXT: and a3, a0, a3
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a2, a3, a2
+; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: sw a0, 0(a1)
; RV64I-NEXT: ret
; RV64I-NEXT: srli a5, a0, 8
; RV64I-NEXT: srliw a5, a5, 24
; RV64I-NEXT: slli a5, a5, 24
+; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: or a1, a3, a1
-; RV64I-NEXT: or a1, a5, a1
; RV64I-NEXT: and a4, a0, a4
; RV64I-NEXT: slli a4, a4, 24
; RV64I-NEXT: srliw a3, a0, 24
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 40
; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: or a2, a2, a3
-; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
; RV32-NEXT: neg a2, a2
; RV32-NEXT: and a2, a2, a4
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, a6, a2
-; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: add a1, a1, a6
+; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: add a0, a1, a0
; RV32-NEXT: ret
;
; RV64-NEXT: negw a2, a2
; RV64-NEXT: and a2, a2, a4
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, a6, a2
-; RV64-NEXT: add a0, a2, a0
+; RV64-NEXT: add a1, a1, a6
+; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: addw a0, a1, a0
; RV64-NEXT: ret
%r = call i32 @llvm.vp.reduce.add.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
; RV32-NEXT: lw a3, 12(a0)
; RV32-NEXT: lw a4, 0(a0)
; RV32-NEXT: lw a5, 4(a0)
-; RV32-NEXT: lw a6, 0(a1)
-; RV32-NEXT: lw a7, 8(a1)
-; RV32-NEXT: lw t0, 4(a1)
+; RV32-NEXT: lw a6, 4(a1)
+; RV32-NEXT: lw a7, 0(a1)
+; RV32-NEXT: lw t0, 8(a1)
; RV32-NEXT: lw a1, 12(a1)
-; RV32-NEXT: add a6, a4, a6
-; RV32-NEXT: sltu a4, a6, a4
-; RV32-NEXT: add a4, t0, a4
+; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: add a7, a4, a7
+; RV32-NEXT: sltu a4, a7, a4
; RV32-NEXT: add a4, a5, a4
-; RV32-NEXT: add a7, a2, a7
-; RV32-NEXT: sltu a2, a7, a2
-; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: sw a7, 8(a0)
-; RV32-NEXT: sw a6, 0(a0)
+; RV32-NEXT: add t0, a2, t0
+; RV32-NEXT: sltu a2, t0, a2
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: sw t0, 8(a0)
+; RV32-NEXT: sw a7, 0(a0)
; RV32-NEXT: sw a1, 12(a0)
; RV32-NEXT: sw a4, 4(a0)
; RV32-NEXT: ret
; RV32-LABEL: add_v1i64:
; RV32: # %bb.0:
; RV32-NEXT: lw a2, 0(a0)
-; RV32-NEXT: lw a3, 0(a1)
-; RV32-NEXT: lw a1, 4(a1)
-; RV32-NEXT: lw a4, 4(a0)
-; RV32-NEXT: add a3, a2, a3
-; RV32-NEXT: sltu a2, a3, a2
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a1, a4, a1
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a1, 4(a0)
+; RV32-NEXT: lw a3, 4(a0)
+; RV32-NEXT: lw a4, 4(a1)
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: sltu a2, a1, a2
+; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: sw a1, 0(a0)
+; RV32-NEXT: sw a2, 4(a0)
; RV32-NEXT: ret
;
; RV64-LABEL: add_v1i64:
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vcpop.m a2, v11, v0.t
; CHECK-NEXT: seqz a2, a2
-; CHECK-NEXT: addi a3, a1, -128
-; CHECK-NEXT: sltu a1, a1, a3
+; CHECK-NEXT: and a0, a2, a0
+; CHECK-NEXT: addi a2, a1, -128
+; CHECK-NEXT: sltu a1, a1, a2
; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a3
+; CHECK-NEXT: and a1, a1, a2
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmnot.m v8, v8
; CHECK-NEXT: vmv1r.v v0, v10
; CHECK-NEXT: vcpop.m a1, v8, v0.t
; CHECK-NEXT: seqz a1, a1
; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: ret
%r = call i1 @llvm.vp.reduce.and.v256i1(i1 %s, <256 x i1> %v, <256 x i1> %m, i32 %evl)
ret i1 %r
; RV32-NEXT: or a3, a3, a4
; RV32-NEXT: slli a5, a5, 16
; RV32-NEXT: slli a6, a6, 24
-; RV32-NEXT: or a3, a5, a3
-; RV32-NEXT: or a3, a6, a3
+; RV32-NEXT: or a4, a6, a5
+; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: andi a2, a2, 2
; RV32-NEXT: or a2, a2, a3
; RV32-NEXT: slli a4, a4, 16
; RV32-NEXT: slli a0, a0, 24
-; RV32-NEXT: or a2, a4, a2
+; RV32-NEXT: or a0, a0, a4
; RV32-NEXT: or a0, a0, a2
; RV32-NEXT: vmv.s.x v9, a0
; RV32-NEXT: vslideup.vi v8, v9, 1
; RV64-NEXT: or a3, a3, a4
; RV64-NEXT: slli a5, a5, 16
; RV64-NEXT: slli a6, a6, 24
-; RV64-NEXT: or a3, a5, a3
-; RV64-NEXT: or a3, a6, a3
+; RV64-NEXT: or a4, a6, a5
+; RV64-NEXT: or a3, a4, a3
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV64-NEXT: vmv.v.x v8, a3
; RV64-NEXT: andi a2, a2, 2
; RV64-NEXT: or a2, a2, a3
; RV64-NEXT: slli a4, a4, 16
; RV64-NEXT: slli a0, a0, 24
-; RV64-NEXT: or a2, a4, a2
+; RV64-NEXT: or a0, a0, a4
; RV64-NEXT: or a0, a0, a2
; RV64-NEXT: vmv.s.x v9, a0
; RV64-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vcpop.m a1, v11, v0.t
; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
; CHECK-NEXT: or a0, a3, a0
-; CHECK-NEXT: or a0, a0, a1
; CHECK-NEXT: ret
%r = call i1 @llvm.vp.reduce.or.nxv128i1(i1 %s, <vscale x 128 x i1> %v, <vscale x 128 x i1> %m, i32 %evl)
ret i1 %r
; RV32I: # %bb.0:
; RV32I-NEXT: mv a4, a1
; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: add a5, a4, a3
; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: sltu a1, a0, a1
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: add a1, a4, a1
+; RV32I-NEXT: add a1, a5, a1
; RV32I-NEXT: xor a2, a4, a1
; RV32I-NEXT: xor a3, a4, a3
; RV32I-NEXT: not a3, a3
; RV32IZbb: # %bb.0:
; RV32IZbb-NEXT: mv a4, a1
; RV32IZbb-NEXT: mv a1, a0
+; RV32IZbb-NEXT: add a5, a4, a3
; RV32IZbb-NEXT: add a0, a0, a2
; RV32IZbb-NEXT: sltu a1, a0, a1
-; RV32IZbb-NEXT: add a1, a3, a1
-; RV32IZbb-NEXT: add a1, a4, a1
+; RV32IZbb-NEXT: add a1, a5, a1
; RV32IZbb-NEXT: xor a2, a4, a1
; RV32IZbb-NEXT: xor a3, a4, a3
; RV32IZbb-NEXT: andn a2, a2, a3
; RV32I: # %bb.0:
; RV32I-NEXT: mv a2, a1
; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: add a3, a2, a5
; RV32I-NEXT: add a0, a0, a4
; RV32I-NEXT: sltu a1, a0, a1
-; RV32I-NEXT: add a1, a5, a1
-; RV32I-NEXT: add a1, a2, a1
+; RV32I-NEXT: add a1, a3, a1
; RV32I-NEXT: xor a3, a2, a1
; RV32I-NEXT: xor a2, a2, a5
; RV32I-NEXT: not a2, a2
; RV32IZbb: # %bb.0:
; RV32IZbb-NEXT: mv a2, a1
; RV32IZbb-NEXT: mv a1, a0
+; RV32IZbb-NEXT: add a3, a2, a5
; RV32IZbb-NEXT: add a0, a0, a4
; RV32IZbb-NEXT: sltu a1, a0, a1
-; RV32IZbb-NEXT: add a1, a5, a1
-; RV32IZbb-NEXT: add a1, a2, a1
+; RV32IZbb-NEXT: add a1, a3, a1
; RV32IZbb-NEXT: xor a3, a2, a1
; RV32IZbb-NEXT: xor a2, a2, a5
; RV32IZbb-NEXT: andn a2, a3, a2
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: add a0, a1, a3
; RV32I-NEXT: sltu a1, a0, a1
-; RV32I-NEXT: add a1, a4, a1
+; RV32I-NEXT: add a2, a2, a4
; RV32I-NEXT: add a1, a2, a1
; RV32I-NEXT: ret
;
; RV32I-NEXT: and a2, a0, a2
; RV32I-NEXT: and a0, a0, a1
; RV32I-NEXT: sltu a1, a3, a0
-; RV32I-NEXT: add a1, a2, a1
+; RV32I-NEXT: sub a4, a4, a2
; RV32I-NEXT: sub a1, a4, a1
; RV32I-NEXT: sub a0, a3, a0
; RV32I-NEXT: ret
; RV32-NEXT: call bar@plt
; RV32-NEXT: mv s3, a0
; RV32-NEXT: call bar@plt
+; RV32-NEXT: add s0, s0, s1
; RV32-NEXT: add a0, s3, a0
-; RV32-NEXT: add a0, s1, a0
; RV32-NEXT: add a0, s0, a0
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV64-NEXT: call bar@plt
; RV64-NEXT: mv s3, a0
; RV64-NEXT: call bar@plt
+; RV64-NEXT: add s0, s0, s1
; RV64-NEXT: add a0, s3, a0
-; RV64-NEXT: add a0, s1, a0
; RV64-NEXT: addw a0, s0, a0
; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: andi a2, a2, 7
; RV32I-NEXT: srl a3, a3, a2
; RV32I-NEXT: lbu a4, 5(a1)
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: or a4, a7, a4
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
; RV32I-NEXT: xori a6, a2, 31
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a5, a5, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a5, t1, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a5, a7, a5
; RV32I-NEXT: slli a7, a5, 1
; RV32I-NEXT: not t0, a2
; RV32I-NEXT: lbu t1, 13(a1)
; RV32I-NEXT: srl a5, a5, a2
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or a1, a1, t0
; RV32I-NEXT: or a1, a1, a7
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: andi a2, a2, 7
; RV32I-NEXT: srl a3, a3, a2
; RV32I-NEXT: lbu a4, 5(a1)
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: or a4, a7, a4
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
; RV32I-NEXT: xori a6, a2, 31
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a5, a5, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a5, t1, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a5, a7, a5
; RV32I-NEXT: slli a7, a5, 1
; RV32I-NEXT: not t0, a2
; RV32I-NEXT: lbu t1, 13(a1)
; RV32I-NEXT: srl a5, a5, a2
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or a1, a1, t0
; RV32I-NEXT: or a1, a1, a7
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a1, a4, a1
; RV32I-NEXT: andi a2, a2, 7
; RV32I-NEXT: sll a4, a1, a2
; RV32I-NEXT: lbu a5, 1(a3)
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
; RV32I-NEXT: srli a6, a5, 1
; RV32I-NEXT: xori a7, a2, 31
; RV32I-NEXT: srl a6, a6, a7
; RV32I-NEXT: or a6, a6, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a6, t1, a6
-; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a6, t0, a6
; RV32I-NEXT: sll t0, a6, a2
; RV32I-NEXT: srli a1, a1, 1
; RV32I-NEXT: not t1, a2
; RV32I-NEXT: or t0, t0, t1
; RV32I-NEXT: slli t2, t2, 16
; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or t0, t2, t0
+; RV32I-NEXT: or a3, a3, t2
; RV32I-NEXT: or a3, a3, t0
; RV32I-NEXT: sll a3, a3, a2
; RV32I-NEXT: srli a6, a6, 1
; RV32-NEXT: addi a3, a3, -1366
; RV32-NEXT: mul a3, a5, a3
; RV32-NEXT: mulhu a6, a5, a4
+; RV32-NEXT: add a3, a6, a3
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a0, a1, a4
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: add a1, a6, a0
+; RV32-NEXT: mul a1, a1, a4
+; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: mul a0, a5, a4
; RV32-NEXT: ret
;
; RV64-NEXT: sub a4, a0, a3
; RV64-NEXT: mul a5, a4, a6
; RV64-NEXT: mulhu a6, a4, a2
+; RV64-NEXT: add a5, a6, a5
; RV64-NEXT: sltu a0, a0, a3
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a0, a1, a2
-; RV64-NEXT: add a0, a5, a0
-; RV64-NEXT: add a1, a6, a0
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, a5, a1
; RV64-NEXT: mul a0, a4, a2
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 3
; RV32-NEXT: addi a3, a3, -820
; RV32-NEXT: mul a3, a5, a3
; RV32-NEXT: mulhu a6, a5, a4
+; RV32-NEXT: add a3, a6, a3
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a0, a1, a4
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: add a1, a6, a0
+; RV32-NEXT: mul a1, a1, a4
+; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: mul a0, a5, a4
; RV32-NEXT: ret
;
; RV64-NEXT: sub a4, a0, a3
; RV64-NEXT: mul a5, a4, a6
; RV64-NEXT: mulhu a6, a4, a2
+; RV64-NEXT: add a5, a6, a5
; RV64-NEXT: sltu a0, a0, a3
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a0, a1, a2
-; RV64-NEXT: add a0, a5, a0
-; RV64-NEXT: add a1, a6, a0
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, a5, a1
; RV64-NEXT: mul a0, a4, a2
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 5
; RV32-NEXT: mul a5, a3, a5
; RV32-NEXT: addi a4, a4, -273
; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a0, a1, a4
-; RV32-NEXT: add a0, a5, a0
-; RV32-NEXT: add a1, a6, a0
+; RV32-NEXT: mul a1, a1, a4
+; RV32-NEXT: add a1, a5, a1
; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-NEXT: sub a3, a0, a2
; RV64-NEXT: mul a4, a3, a4
; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a0, a1, a5
-; RV64-NEXT: add a0, a4, a0
-; RV64-NEXT: add a1, a6, a0
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 15
; RV32-NEXT: addi a3, a3, 240
; RV32-NEXT: mul a3, a5, a3
; RV32-NEXT: mulhu a6, a5, a4
+; RV32-NEXT: add a3, a6, a3
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a0, a1, a4
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: add a1, a6, a0
+; RV32-NEXT: mul a1, a1, a4
+; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: mul a0, a5, a4
; RV32-NEXT: ret
;
; RV64-NEXT: sub a4, a0, a3
; RV64-NEXT: mul a5, a4, a6
; RV64-NEXT: mulhu a6, a4, a2
+; RV64-NEXT: add a5, a6, a5
; RV64-NEXT: sltu a0, a0, a3
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a0, a1, a2
-; RV64-NEXT: add a0, a5, a0
-; RV64-NEXT: add a1, a6, a0
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, a5, a1
; RV64-NEXT: mul a0, a4, a2
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 17
; RV32-NEXT: mul a5, a3, a5
; RV32-NEXT: addi a4, a4, -257
; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a0, a1, a4
-; RV32-NEXT: add a0, a5, a0
-; RV32-NEXT: add a1, a6, a0
+; RV32-NEXT: mul a1, a1, a4
+; RV32-NEXT: add a1, a5, a1
; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-NEXT: sub a3, a0, a2
; RV64-NEXT: mul a4, a3, a4
; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a0, a1, a5
-; RV64-NEXT: add a0, a4, a0
-; RV64-NEXT: add a1, a6, a0
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 255
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: mul a3, a5, a3
; RV32-NEXT: mulhu a6, a5, a4
+; RV32-NEXT: add a3, a6, a3
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a0, a1, a4
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: add a1, a6, a0
+; RV32-NEXT: mul a1, a1, a4
+; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: mul a0, a5, a4
; RV32-NEXT: ret
;
; RV64-NEXT: sub a4, a0, a3
; RV64-NEXT: mul a5, a4, a6
; RV64-NEXT: mulhu a6, a4, a2
+; RV64-NEXT: add a5, a6, a5
; RV64-NEXT: sltu a0, a0, a3
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a0, a1, a2
-; RV64-NEXT: add a0, a5, a0
-; RV64-NEXT: add a1, a6, a0
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, a5, a1
; RV64-NEXT: mul a0, a4, a2
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 257
; RV32-NEXT: mul a5, a3, a5
; RV32-NEXT: addi a4, a4, -1
; RV32-NEXT: mulhu a4, a3, a4
+; RV32-NEXT: add a4, a4, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: slli a0, a1, 16
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: sub a1, a5, a0
-; RV32-NEXT: add a1, a4, a1
+; RV32-NEXT: sub a1, a4, a0
; RV32-NEXT: slli a0, a3, 16
; RV32-NEXT: neg a2, a3
; RV32-NEXT: sub a0, a2, a0
; RV64-NEXT: mul a5, a3, a5
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: mulhu a6, a3, a4
+; RV64-NEXT: add a5, a6, a5
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a0, a1, a4
-; RV64-NEXT: add a0, a5, a0
-; RV64-NEXT: add a1, a6, a0
+; RV64-NEXT: mul a1, a1, a4
+; RV64-NEXT: add a1, a5, a1
; RV64-NEXT: mul a0, a3, a4
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 65535
; RV32-NEXT: sub a3, a0, a2
; RV32-NEXT: mulhu a4, a3, a4
; RV32-NEXT: slli a5, a3, 16
+; RV32-NEXT: sub a4, a4, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: slli a0, a1, 16
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: sub a0, a5, a1
-; RV32-NEXT: sub a1, a4, a0
+; RV32-NEXT: add a1, a4, a1
; RV32-NEXT: sub a0, a3, a5
; RV32-NEXT: ret
;
; RV64-NEXT: sub a5, a0, a2
; RV64-NEXT: mul a3, a5, a3
; RV64-NEXT: mulhu a6, a5, a4
+; RV64-NEXT: add a3, a6, a3
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a0, a1, a4
-; RV64-NEXT: add a0, a3, a0
-; RV64-NEXT: add a1, a6, a0
+; RV64-NEXT: mul a1, a1, a4
+; RV64-NEXT: add a1, a3, a1
; RV64-NEXT: mul a0, a5, a4
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 65537
; RV32-NEXT: addi a3, a3, -1366
; RV32-NEXT: mul a3, a5, a3
; RV32-NEXT: mulhu a6, a5, a4
+; RV32-NEXT: add a3, a6, a3
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a0, a1, a4
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: add a1, a6, a0
+; RV32-NEXT: mul a1, a1, a4
+; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: mul a0, a5, a4
; RV32-NEXT: ret
;
; RV64-NEXT: sub a4, a0, a3
; RV64-NEXT: mul a5, a4, a6
; RV64-NEXT: mulhu a6, a4, a2
+; RV64-NEXT: add a5, a6, a5
; RV64-NEXT: sltu a0, a0, a3
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a0, a1, a2
-; RV64-NEXT: add a0, a5, a0
-; RV64-NEXT: add a1, a6, a0
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, a5, a1
; RV64-NEXT: mul a0, a4, a2
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 12
; RV32IM-NEXT: add a1, a1, a2
; RV32IM-NEXT: li a2, 95
; RV32IM-NEXT: mul a2, a1, a2
-; RV32IM-NEXT: sub a2, a2, a1
+; RV32IM-NEXT: add a0, a0, a1
; RV32IM-NEXT: sub a0, a0, a2
; RV32IM-NEXT: ret
;
; RV64IM-NEXT: add a1, a1, a2
; RV64IM-NEXT: li a2, 95
; RV64IM-NEXT: mulw a2, a1, a2
-; RV64IM-NEXT: subw a2, a2, a1
+; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: subw a0, a0, a2
; RV64IM-NEXT: ret
%1 = srem i32 %x, 95
; RV32-NEXT: andi a1, a1, 1
; RV32-NEXT: slli a1, a1, 1
; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: sw a0, 8(s0)
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32M-NEXT: andi a1, a1, 1
; RV32M-NEXT: slli a1, a1, 1
; RV32M-NEXT: slli a0, a0, 2
-; RV32M-NEXT: or a0, a1, a0
; RV32M-NEXT: or a0, a2, a0
+; RV32M-NEXT: or a0, a0, a1
; RV32M-NEXT: sw a0, 8(s0)
; RV32M-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32M-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV64M-NEXT: srai a4, a4, 1
; RV64M-NEXT: add a4, a4, a5
; RV64M-NEXT: slli a5, a4, 3
-; RV64M-NEXT: sub a3, a3, a5
; RV64M-NEXT: add a3, a3, a4
+; RV64M-NEXT: sub a3, a3, a5
; RV64M-NEXT: addi a3, a3, -1
; RV64M-NEXT: seqz a3, a3
; RV64M-NEXT: lui a4, %hi(.LCPI3_2)
; RV32MV-NEXT: andi a2, a2, 1
; RV32MV-NEXT: slli a2, a2, 1
; RV32MV-NEXT: slli a0, a0, 2
-; RV32MV-NEXT: or a0, a2, a0
; RV32MV-NEXT: or a0, a1, a0
+; RV32MV-NEXT: or a0, a0, a2
; RV32MV-NEXT: sw a0, 8(s2)
; RV32MV-NEXT: addi sp, s0, -64
; RV32MV-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
; RV64MV-NEXT: slli a4, a3, 3
; RV64MV-NEXT: lui a5, %hi(.LCPI3_2)
; RV64MV-NEXT: ld a5, %lo(.LCPI3_2)(a5)
-; RV64MV-NEXT: sub a2, a2, a4
; RV64MV-NEXT: add a2, a2, a3
+; RV64MV-NEXT: sub a2, a2, a4
; RV64MV-NEXT: sd a2, 8(sp)
; RV64MV-NEXT: mulh a2, a1, a5
; RV64MV-NEXT: srli a3, a2, 63
; RV32IM-NEXT: srai a5, a5, 6
; RV32IM-NEXT: add a5, a5, t5
; RV32IM-NEXT: mul a7, a5, a7
-; RV32IM-NEXT: sub a5, a7, a5
-; RV32IM-NEXT: sub a2, a2, a5
-; RV32IM-NEXT: sub a5, t4, t3
-; RV32IM-NEXT: sub a3, a3, a5
-; RV32IM-NEXT: sub a5, t2, t1
-; RV32IM-NEXT: sub a1, a1, a5
-; RV32IM-NEXT: sub a5, t0, a6
-; RV32IM-NEXT: sub a4, a4, a5
+; RV32IM-NEXT: add a2, a2, a5
+; RV32IM-NEXT: sub a2, a2, a7
+; RV32IM-NEXT: add a3, a3, t3
+; RV32IM-NEXT: sub a3, a3, t4
+; RV32IM-NEXT: add a1, a1, t1
+; RV32IM-NEXT: sub a1, a1, t2
+; RV32IM-NEXT: add a4, a4, a6
+; RV32IM-NEXT: sub a4, a4, t0
; RV32IM-NEXT: sh a4, 6(a0)
; RV32IM-NEXT: sh a1, 4(a0)
; RV32IM-NEXT: sh a3, 2(a0)
; RV64IM-NEXT: srai a3, a3, 6
; RV64IM-NEXT: add a3, a3, t5
; RV64IM-NEXT: mulw a7, a3, a7
-; RV64IM-NEXT: subw a3, a7, a3
-; RV64IM-NEXT: subw a4, a4, a3
-; RV64IM-NEXT: subw a3, t4, t3
-; RV64IM-NEXT: subw a5, a5, a3
-; RV64IM-NEXT: subw a3, t2, t1
-; RV64IM-NEXT: subw a1, a1, a3
-; RV64IM-NEXT: subw a3, t0, a6
-; RV64IM-NEXT: subw a2, a2, a3
+; RV64IM-NEXT: add a3, a4, a3
+; RV64IM-NEXT: subw a3, a3, a7
+; RV64IM-NEXT: add a5, a5, t3
+; RV64IM-NEXT: subw a4, a5, t4
+; RV64IM-NEXT: add a1, a1, t1
+; RV64IM-NEXT: subw a1, a1, t2
+; RV64IM-NEXT: add a2, a2, a6
+; RV64IM-NEXT: subw a2, a2, t0
; RV64IM-NEXT: sh a2, 6(a0)
; RV64IM-NEXT: sh a1, 4(a0)
-; RV64IM-NEXT: sh a5, 2(a0)
-; RV64IM-NEXT: sh a4, 0(a0)
+; RV64IM-NEXT: sh a4, 2(a0)
+; RV64IM-NEXT: sh a3, 0(a0)
; RV64IM-NEXT: ret
%1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
%2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
; RV32: # %bb.0:
; RV32-NEXT: mv a4, a1
; RV32-NEXT: sltu a1, a0, a2
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: sub a1, a4, a1
+; RV32-NEXT: sub a5, a4, a3
+; RV32-NEXT: sub a1, a5, a1
; RV32-NEXT: xor a5, a4, a1
; RV32-NEXT: xor a3, a4, a3
; RV32-NEXT: and a3, a3, a5
; RV32: # %bb.0:
; RV32-NEXT: mv a2, a1
; RV32-NEXT: sltu a1, a0, a4
-; RV32-NEXT: add a1, a5, a1
-; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: sub a3, a2, a5
+; RV32-NEXT: sub a1, a3, a1
; RV32-NEXT: xor a3, a2, a1
; RV32-NEXT: xor a2, a2, a5
; RV32-NEXT: and a2, a2, a3
define i64 @func2(i64 %x, i64 %y) nounwind {
; RV32I-LABEL: func2:
; RV32I: # %bb.0:
+; RV32I-NEXT: add a3, a1, a3
; RV32I-NEXT: add a2, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
; RV32I-NEXT: add a3, a3, a0
-; RV32I-NEXT: add a3, a1, a3
; RV32I-NEXT: beq a3, a1, .LBB1_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: sltu a0, a3, a1
;
; RV32IZbb-LABEL: func2:
; RV32IZbb: # %bb.0:
+; RV32IZbb-NEXT: add a3, a1, a3
; RV32IZbb-NEXT: add a2, a0, a2
; RV32IZbb-NEXT: sltu a0, a2, a0
; RV32IZbb-NEXT: add a3, a3, a0
-; RV32IZbb-NEXT: add a3, a1, a3
; RV32IZbb-NEXT: beq a3, a1, .LBB1_2
; RV32IZbb-NEXT: # %bb.1:
; RV32IZbb-NEXT: sltu a0, a3, a1
define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
; RV32I-LABEL: func64:
; RV32I: # %bb.0:
+; RV32I-NEXT: add a2, a1, a5
; RV32I-NEXT: add a4, a0, a4
; RV32I-NEXT: sltu a0, a4, a0
-; RV32I-NEXT: add a2, a5, a0
-; RV32I-NEXT: add a2, a1, a2
+; RV32I-NEXT: add a2, a2, a0
; RV32I-NEXT: beq a2, a1, .LBB1_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: sltu a0, a2, a1
;
; RV32IZbb-LABEL: func64:
; RV32IZbb: # %bb.0:
+; RV32IZbb-NEXT: add a2, a1, a5
; RV32IZbb-NEXT: add a4, a0, a4
; RV32IZbb-NEXT: sltu a0, a4, a0
-; RV32IZbb-NEXT: add a2, a5, a0
-; RV32IZbb-NEXT: add a2, a1, a2
+; RV32IZbb-NEXT: add a2, a2, a0
; RV32IZbb-NEXT: beq a2, a1, .LBB1_2
; RV32IZbb-NEXT: # %bb.1:
; RV32IZbb-NEXT: sltu a0, a2, a1
; RISCV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
; RISCV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
; RISCV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
-; RISCV32-NEXT: lw a4, 12(a1)
+; RISCV32-NEXT: lw a3, 12(a1)
; RISCV32-NEXT: lw a7, 12(a2)
; RISCV32-NEXT: lw a6, 8(a1)
-; RISCV32-NEXT: lw a3, 0(a2)
+; RISCV32-NEXT: lw a4, 0(a2)
; RISCV32-NEXT: lw a5, 0(a1)
; RISCV32-NEXT: lw t3, 4(a1)
; RISCV32-NEXT: lw t0, 8(a2)
; RISCV32-NEXT: lw a2, 4(a2)
-; RISCV32-NEXT: mulhu a1, a5, a3
-; RISCV32-NEXT: mul t1, t3, a3
+; RISCV32-NEXT: mulhu a1, a5, a4
+; RISCV32-NEXT: mul t1, t3, a4
; RISCV32-NEXT: add a1, t1, a1
; RISCV32-NEXT: sltu t1, a1, t1
-; RISCV32-NEXT: mulhu t2, t3, a3
+; RISCV32-NEXT: mulhu t2, t3, a4
; RISCV32-NEXT: add t4, t2, t1
; RISCV32-NEXT: mul t1, a5, a2
; RISCV32-NEXT: add a1, t1, a1
; RISCV32-NEXT: mul t6, t3, a2
; RISCV32-NEXT: add s0, t6, t5
; RISCV32-NEXT: mul t1, t0, a5
-; RISCV32-NEXT: mul s3, a6, a3
+; RISCV32-NEXT: mul s3, a6, a4
; RISCV32-NEXT: add s4, s3, t1
; RISCV32-NEXT: add t1, s0, s4
; RISCV32-NEXT: sltu t2, t1, s0
-; RISCV32-NEXT: sltu t6, s0, t6
+; RISCV32-NEXT: sltu s0, s0, t6
; RISCV32-NEXT: sltu t4, t5, t4
-; RISCV32-NEXT: mulhu s1, t3, a2
-; RISCV32-NEXT: add t4, t4, t6
-; RISCV32-NEXT: add s1, s1, t4
+; RISCV32-NEXT: mulhu t5, t3, a2
+; RISCV32-NEXT: add t4, t5, t4
+; RISCV32-NEXT: add s0, t4, s0
; RISCV32-NEXT: mul t4, t3, t0
-; RISCV32-NEXT: mul s2, a7, a5
-; RISCV32-NEXT: mulhu s0, t0, a5
-; RISCV32-NEXT: add t4, s0, t4
-; RISCV32-NEXT: add s2, t4, s2
+; RISCV32-NEXT: mul t5, a7, a5
+; RISCV32-NEXT: add t4, t5, t4
+; RISCV32-NEXT: mulhu s1, t0, a5
+; RISCV32-NEXT: add s2, s1, t4
; RISCV32-NEXT: mul t4, a2, a6
-; RISCV32-NEXT: mul t6, a4, a3
-; RISCV32-NEXT: mulhu t5, a6, a3
+; RISCV32-NEXT: mul t5, a3, a4
; RISCV32-NEXT: add t4, t5, t4
-; RISCV32-NEXT: add t6, t4, t6
-; RISCV32-NEXT: sltu t4, s4, s3
-; RISCV32-NEXT: add t4, s2, t4
-; RISCV32-NEXT: add t4, t6, t4
+; RISCV32-NEXT: mulhu t5, a6, a4
+; RISCV32-NEXT: add t6, t5, t4
+; RISCV32-NEXT: add t4, t6, s2
+; RISCV32-NEXT: sltu s3, s4, s3
+; RISCV32-NEXT: add t4, t4, s3
+; RISCV32-NEXT: add t4, s0, t4
; RISCV32-NEXT: add t4, t4, t2
-; RISCV32-NEXT: add t4, s1, t4
-; RISCV32-NEXT: beq t4, s1, .LBB0_2
+; RISCV32-NEXT: beq t4, s0, .LBB0_2
; RISCV32-NEXT: # %bb.1: # %start
-; RISCV32-NEXT: sltu t2, t4, s1
+; RISCV32-NEXT: sltu t2, t4, s0
; RISCV32-NEXT: .LBB0_2: # %start
-; RISCV32-NEXT: sltu s0, s2, s0
+; RISCV32-NEXT: sltu s0, s2, s1
; RISCV32-NEXT: snez s1, t3
; RISCV32-NEXT: snez s2, a7
; RISCV32-NEXT: and s1, s2, s1
; RISCV32-NEXT: mulhu s2, a7, a5
; RISCV32-NEXT: snez s2, s2
+; RISCV32-NEXT: or s1, s1, s2
; RISCV32-NEXT: mulhu t3, t3, t0
; RISCV32-NEXT: snez t3, t3
-; RISCV32-NEXT: or t3, s2, t3
-; RISCV32-NEXT: or t3, t3, s0
; RISCV32-NEXT: or t3, s1, t3
+; RISCV32-NEXT: or t3, t3, s0
; RISCV32-NEXT: sltu t5, t6, t5
; RISCV32-NEXT: snez t6, a2
-; RISCV32-NEXT: snez s0, a4
+; RISCV32-NEXT: snez s0, a3
; RISCV32-NEXT: and t6, s0, t6
-; RISCV32-NEXT: mulhu s0, a4, a3
+; RISCV32-NEXT: mulhu s0, a3, a4
; RISCV32-NEXT: snez s0, s0
+; RISCV32-NEXT: or t6, t6, s0
; RISCV32-NEXT: mulhu a2, a2, a6
; RISCV32-NEXT: snez a2, a2
-; RISCV32-NEXT: or a2, s0, a2
+; RISCV32-NEXT: or a2, t6, a2
; RISCV32-NEXT: or a2, a2, t5
; RISCV32-NEXT: or a7, t0, a7
; RISCV32-NEXT: snez a7, a7
-; RISCV32-NEXT: or a4, a6, a4
-; RISCV32-NEXT: snez a4, a4
-; RISCV32-NEXT: and a4, a4, a7
-; RISCV32-NEXT: or a2, a4, a2
-; RISCV32-NEXT: or a4, t6, t3
-; RISCV32-NEXT: or a4, a4, t2
-; RISCV32-NEXT: or a2, a2, a4
-; RISCV32-NEXT: mul a3, a5, a3
+; RISCV32-NEXT: or a3, a6, a3
+; RISCV32-NEXT: snez a3, a3
+; RISCV32-NEXT: and a3, a3, a7
+; RISCV32-NEXT: or a2, a3, a2
+; RISCV32-NEXT: or a3, t3, t2
+; RISCV32-NEXT: or a2, a2, a3
+; RISCV32-NEXT: mul a3, a5, a4
; RISCV32-NEXT: andi a2, a2, 1
; RISCV32-NEXT: sw a3, 0(a0)
; RISCV32-NEXT: sw a1, 4(a0)
; NOMISALIGN-LABEL: load_i24:
; NOMISALIGN: # %bb.0:
; NOMISALIGN-NEXT: lbu a1, 1(a0)
-; NOMISALIGN-NEXT: lb a2, 2(a0)
-; NOMISALIGN-NEXT: lbu a0, 0(a0)
+; NOMISALIGN-NEXT: lbu a2, 0(a0)
+; NOMISALIGN-NEXT: lb a0, 2(a0)
; NOMISALIGN-NEXT: slli a1, a1, 8
-; NOMISALIGN-NEXT: slli a2, a2, 16
-; NOMISALIGN-NEXT: or a0, a0, a2
+; NOMISALIGN-NEXT: or a1, a1, a2
+; NOMISALIGN-NEXT: slli a0, a0, 16
; NOMISALIGN-NEXT: or a0, a1, a0
; NOMISALIGN-NEXT: ret
;
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: slli a3, a3, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: ret
;
; RV64I-NEXT: or a1, a1, a2
; RV64I-NEXT: slli a3, a3, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a1, a3, a1
+; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: slli a3, a3, 16
; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a1, a3, a1
-; RV32I-NEXT: or a2, a4, a1
+; RV32I-NEXT: or a2, a4, a3
+; RV32I-NEXT: or a2, a2, a1
; RV32I-NEXT: lbu a1, 5(a0)
; RV32I-NEXT: lbu a3, 4(a0)
; RV32I-NEXT: lbu a4, 6(a0)
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: or a0, a0, a4
; RV32I-NEXT: or a1, a0, a1
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
; RV64I-NEXT: lbu a1, 1(a0)
; RV64I-NEXT: lbu a2, 0(a0)
; RV64I-NEXT: lbu a3, 2(a0)
+; RV64I-NEXT: lbu a4, 3(a0)
; RV64I-NEXT: slli a1, a1, 8
; RV64I-NEXT: or a1, a1, a2
; RV64I-NEXT: slli a3, a3, 16
-; RV64I-NEXT: lbu a2, 5(a0)
-; RV64I-NEXT: lbu a4, 3(a0)
+; RV64I-NEXT: slli a4, a4, 24
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: or a1, a3, a1
+; RV64I-NEXT: lbu a2, 5(a0)
; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: slli a2, a2, 8
-; RV64I-NEXT: lbu a5, 6(a0)
+; RV64I-NEXT: lbu a4, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a2, a2, 8
; RV64I-NEXT: or a2, a2, a3
-; RV64I-NEXT: slli a4, a4, 24
-; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a2, a5, a2
+; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: ret
;
; MISALIGN-RV32I-LABEL: load_i64:
; RV32IM-NEXT: srli a1, a1, 6
; RV32IM-NEXT: li a2, 95
; RV32IM-NEXT: mul a2, a1, a2
-; RV32IM-NEXT: sub a2, a2, a1
+; RV32IM-NEXT: add a0, a0, a1
; RV32IM-NEXT: sub a0, a0, a2
; RV32IM-NEXT: ret
;
; RV64IM-NEXT: srli a1, a1, 6
; RV64IM-NEXT: li a2, 95
; RV64IM-NEXT: mulw a2, a1, a2
-; RV64IM-NEXT: subw a2, a2, a1
+; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: subw a0, a0, a2
; RV64IM-NEXT: ret
%1 = urem i32 %x, 95
; RV32MV-NEXT: andi a3, a3, 2047
; RV32MV-NEXT: slli a3, a3, 11
; RV32MV-NEXT: slli a1, a1, 22
-; RV32MV-NEXT: or a1, a3, a1
; RV32MV-NEXT: or a1, a2, a1
+; RV32MV-NEXT: or a1, a1, a3
; RV32MV-NEXT: sw a1, 0(a0)
; RV32MV-NEXT: addi sp, sp, 16
; RV32MV-NEXT: ret
; RV64MV-NEXT: vslidedown.vi v8, v8, 2
; RV64MV-NEXT: vmv.x.s a3, v8
; RV64MV-NEXT: slli a3, a3, 22
-; RV64MV-NEXT: or a2, a2, a3
+; RV64MV-NEXT: or a1, a1, a3
; RV64MV-NEXT: or a1, a1, a2
; RV64MV-NEXT: sw a1, 0(a0)
; RV64MV-NEXT: slli a1, a1, 31
; RV32IM-NEXT: mul t4, t3, a7
; RV32IM-NEXT: mulhu a5, a2, a5
; RV32IM-NEXT: mul a7, a5, a7
-; RV32IM-NEXT: sub a5, a7, a5
-; RV32IM-NEXT: sub a2, a2, a5
-; RV32IM-NEXT: sub a5, t4, t3
-; RV32IM-NEXT: sub a3, a3, a5
-; RV32IM-NEXT: sub a5, t2, t1
-; RV32IM-NEXT: sub a1, a1, a5
-; RV32IM-NEXT: sub a5, t0, a6
-; RV32IM-NEXT: sub a4, a4, a5
+; RV32IM-NEXT: add a2, a2, a5
+; RV32IM-NEXT: sub a2, a2, a7
+; RV32IM-NEXT: add a3, a3, t3
+; RV32IM-NEXT: sub a3, a3, t4
+; RV32IM-NEXT: add a1, a1, t1
+; RV32IM-NEXT: sub a1, a1, t2
+; RV32IM-NEXT: add a4, a4, a6
+; RV32IM-NEXT: sub a4, a4, t0
; RV32IM-NEXT: sh a4, 6(a0)
; RV32IM-NEXT: sh a1, 4(a0)
; RV32IM-NEXT: sh a3, 2(a0)
; RV64IM-NEXT: mulw t4, t3, a7
; RV64IM-NEXT: mulhu a3, a4, a3
; RV64IM-NEXT: mulw a7, a3, a7
-; RV64IM-NEXT: subw a3, a7, a3
-; RV64IM-NEXT: subw a4, a4, a3
-; RV64IM-NEXT: subw a3, t4, t3
-; RV64IM-NEXT: subw a5, a5, a3
-; RV64IM-NEXT: subw a3, t2, t1
-; RV64IM-NEXT: subw a1, a1, a3
-; RV64IM-NEXT: subw a3, t0, a6
-; RV64IM-NEXT: subw a2, a2, a3
+; RV64IM-NEXT: add a3, a4, a3
+; RV64IM-NEXT: subw a3, a3, a7
+; RV64IM-NEXT: add a5, a5, t3
+; RV64IM-NEXT: subw a4, a5, t4
+; RV64IM-NEXT: add a1, a1, t1
+; RV64IM-NEXT: subw a1, a1, t2
+; RV64IM-NEXT: add a2, a2, a6
+; RV64IM-NEXT: subw a2, a2, t0
; RV64IM-NEXT: sh a2, 6(a0)
; RV64IM-NEXT: sh a1, 4(a0)
-; RV64IM-NEXT: sh a5, 2(a0)
-; RV64IM-NEXT: sh a4, 0(a0)
+; RV64IM-NEXT: sh a4, 2(a0)
+; RV64IM-NEXT: sh a3, 0(a0)
; RV64IM-NEXT: ret
%1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
%2 = udiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
; RV32I-LABEL: func2:
; RV32I: # %bb.0:
; RV32I-NEXT: sltu a4, a0, a2
-; RV32I-NEXT: add a3, a3, a4
; RV32I-NEXT: sub a3, a1, a3
+; RV32I-NEXT: sub a3, a3, a4
; RV32I-NEXT: sub a2, a0, a2
; RV32I-NEXT: beq a3, a1, .LBB1_2
; RV32I-NEXT: # %bb.1:
; RV32IZbb-LABEL: func2:
; RV32IZbb: # %bb.0:
; RV32IZbb-NEXT: sltu a4, a0, a2
-; RV32IZbb-NEXT: add a3, a3, a4
; RV32IZbb-NEXT: sub a3, a1, a3
+; RV32IZbb-NEXT: sub a3, a3, a4
; RV32IZbb-NEXT: sub a2, a0, a2
; RV32IZbb-NEXT: beq a3, a1, .LBB1_2
; RV32IZbb-NEXT: # %bb.1:
; RV32I-LABEL: func64:
; RV32I: # %bb.0:
; RV32I-NEXT: sltu a2, a0, a4
-; RV32I-NEXT: add a2, a5, a2
-; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: sub a3, a1, a5
+; RV32I-NEXT: sub a2, a3, a2
; RV32I-NEXT: sub a3, a0, a4
; RV32I-NEXT: beq a2, a1, .LBB1_2
; RV32I-NEXT: # %bb.1:
; RV32IZbb-LABEL: func64:
; RV32IZbb: # %bb.0:
; RV32IZbb-NEXT: sltu a2, a0, a4
-; RV32IZbb-NEXT: add a2, a5, a2
-; RV32IZbb-NEXT: sub a2, a1, a2
+; RV32IZbb-NEXT: sub a3, a1, a5
+; RV32IZbb-NEXT: sub a2, a3, a2
; RV32IZbb-NEXT: sub a3, a0, a4
; RV32IZbb-NEXT: beq a2, a1, .LBB1_2
; RV32IZbb-NEXT: # %bb.1:
; ILP32-ILP32F-FPELIM-NEXT: andi a0, a0, -8
; ILP32-ILP32F-FPELIM-NEXT: addi a3, sp, 27
; ILP32-ILP32F-FPELIM-NEXT: sw a3, 4(sp)
-; ILP32-ILP32F-FPELIM-NEXT: lw a3, 0(a0)
-; ILP32-ILP32F-FPELIM-NEXT: lw a4, 4(a0)
-; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a3
+; ILP32-ILP32F-FPELIM-NEXT: lw a3, 4(a0)
+; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a0)
+; ILP32-ILP32F-FPELIM-NEXT: add a2, a2, a3
+; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a0
; ILP32-ILP32F-FPELIM-NEXT: sltu a1, a0, a1
-; ILP32-ILP32F-FPELIM-NEXT: add a1, a4, a1
; ILP32-ILP32F-FPELIM-NEXT: add a1, a2, a1
; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 32
; ILP32-ILP32F-FPELIM-NEXT: ret
; ILP32-ILP32F-WITHFP-NEXT: andi a0, a0, -8
; ILP32-ILP32F-WITHFP-NEXT: addi a3, s0, 19
; ILP32-ILP32F-WITHFP-NEXT: sw a3, -12(s0)
-; ILP32-ILP32F-WITHFP-NEXT: lw a3, 0(a0)
-; ILP32-ILP32F-WITHFP-NEXT: lw a4, 4(a0)
-; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a3
+; ILP32-ILP32F-WITHFP-NEXT: lw a3, 4(a0)
+; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a0)
+; ILP32-ILP32F-WITHFP-NEXT: add a2, a2, a3
+; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a0
; ILP32-ILP32F-WITHFP-NEXT: sltu a1, a0, a1
-; ILP32-ILP32F-WITHFP-NEXT: add a1, a4, a1
; ILP32-ILP32F-WITHFP-NEXT: add a1, a2, a1
; ILP32-ILP32F-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload
; ILP32-ILP32F-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a0, a0, -8
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a3, sp, 27
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 4(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 0(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a4, 4(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a3
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 4(a0)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a0)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a3
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sltu a1, a0, a1
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a4, a1
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a2, a1
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 32
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: ret
; ILP32-ILP32F-FPELIM-NEXT: lw a4, 4(a0)
; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a3
; ILP32-ILP32F-FPELIM-NEXT: sltu a1, a0, a1
-; ILP32-ILP32F-FPELIM-NEXT: add a1, a4, a1
+; ILP32-ILP32F-FPELIM-NEXT: add a2, a2, a4
; ILP32-ILP32F-FPELIM-NEXT: add a1, a2, a1
; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 32
; ILP32-ILP32F-FPELIM-NEXT: ret
; ILP32-ILP32F-WITHFP-NEXT: lw a4, 4(a0)
; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a3
; ILP32-ILP32F-WITHFP-NEXT: sltu a1, a0, a1
-; ILP32-ILP32F-WITHFP-NEXT: add a1, a4, a1
+; ILP32-ILP32F-WITHFP-NEXT: add a2, a2, a4
; ILP32-ILP32F-WITHFP-NEXT: add a1, a2, a1
; ILP32-ILP32F-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload
; ILP32-ILP32F-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 20(sp)
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: fld ft0, 0(a0)
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: fsd ft0, 8(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 8(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 12(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 12(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 8(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a3
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sltu a1, a0, a1
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a3, a1
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a2, a1
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 48
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: ret
; ILP32-ILP32F-FPELIM-NEXT: addi a3, a0, 4
; ILP32-ILP32F-FPELIM-NEXT: sw a3, 4(sp)
; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a0)
-; ILP32-ILP32F-FPELIM-NEXT: add a2, s0, a2
-; ILP32-ILP32F-FPELIM-NEXT: add a0, a2, a0
+; ILP32-ILP32F-FPELIM-NEXT: add a1, a1, s0
+; ILP32-ILP32F-FPELIM-NEXT: add a1, a1, a2
; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a0
; ILP32-ILP32F-FPELIM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; ILP32-ILP32F-FPELIM-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; ILP32-ILP32F-WITHFP-NEXT: addi a3, a0, 4
; ILP32-ILP32F-WITHFP-NEXT: sw a3, -16(s0)
; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a0)
-; ILP32-ILP32F-WITHFP-NEXT: add a2, s1, a2
-; ILP32-ILP32F-WITHFP-NEXT: add a0, a2, a0
+; ILP32-ILP32F-WITHFP-NEXT: add a1, a1, s1
+; ILP32-ILP32F-WITHFP-NEXT: add a1, a1, a2
; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a0
; ILP32-ILP32F-WITHFP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; ILP32-ILP32F-WITHFP-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a3, a0, 4
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 4(sp)
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, s0, a2
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a2, a0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a1, s0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a1, a2
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; LP64-LP64F-LP64D-FPELIM-NEXT: addi a3, a0, 8
; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 8(sp)
; LP64-LP64F-LP64D-FPELIM-NEXT: ld a0, 0(a0)
-; LP64-LP64F-LP64D-FPELIM-NEXT: add a2, s0, a2
-; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, a2, a0
+; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, a1, s0
+; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, a1, a2
; LP64-LP64F-LP64D-FPELIM-NEXT: addw a0, a1, a0
; LP64-LP64F-LP64D-FPELIM-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; LP64-LP64F-LP64D-FPELIM-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; LP64-LP64F-LP64D-WITHFP-NEXT: addi a3, a0, 8
; LP64-LP64F-LP64D-WITHFP-NEXT: sd a3, -32(s0)
; LP64-LP64F-LP64D-WITHFP-NEXT: ld a0, 0(a0)
-; LP64-LP64F-LP64D-WITHFP-NEXT: add a2, s1, a2
-; LP64-LP64F-LP64D-WITHFP-NEXT: add a0, a2, a0
+; LP64-LP64F-LP64D-WITHFP-NEXT: add a1, a1, s1
+; LP64-LP64F-LP64D-WITHFP-NEXT: add a1, a1, a2
; LP64-LP64F-LP64D-WITHFP-NEXT: addw a0, a1, a0
; LP64-LP64F-LP64D-WITHFP-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; LP64-LP64F-LP64D-WITHFP-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a0, a0, a5
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: srlw a0, a0, a1
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a0, a0, a5
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: slli a1, a1, 3
; RV32I-NEXT: srl a0, a0, a1
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a0, a0, a5
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: sllw a0, a0, a1
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a0, a0, a5
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: slli a1, a1, 3
; RV32I-NEXT: sll a0, a0, a1
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a0, a0, a5
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: sraw a0, a0, a1
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a0, a0, a5
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: slli a1, a1, 3
; RV32I-NEXT: sra a0, a0, a1
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
-; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: lbu a7, 6(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: lbu a3, 5(a1)
; RV64I-NEXT: lbu a4, 4(a1)
; RV64I-NEXT: lbu a5, 6(a1)
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
-; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 1(a1)
; RV64I-NEXT: lbu a5, 0(a1)
; RV64I-NEXT: lbu a6, 2(a1)
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: slli a3, a3, 35
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: lbu a4, 1(a1)
; RV32I-NEXT: lbu a5, 0(a1)
; RV32I-NEXT: lbu a6, 2(a1)
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: slli a5, a1, 3
; RV32I-NEXT: addi a4, a5, -32
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a6, t0, a6
+; RV32I-NEXT: or a0, a0, t0
; RV32I-NEXT: or a0, a0, a6
; RV32I-NEXT: srl a0, a0, a5
; RV32I-NEXT: slli a3, a3, 1
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
-; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: lbu a7, 6(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: lbu a3, 5(a1)
; RV64I-NEXT: lbu a4, 4(a1)
; RV64I-NEXT: lbu a5, 6(a1)
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
-; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 1(a1)
; RV64I-NEXT: lbu a5, 0(a1)
; RV64I-NEXT: lbu a6, 2(a1)
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: slli a3, a3, 35
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: lbu a4, 1(a1)
; RV32I-NEXT: lbu a5, 0(a1)
; RV32I-NEXT: lbu a6, 2(a1)
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: slli a5, a1, 3
; RV32I-NEXT: addi a4, a5, -32
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a6, t0, a6
+; RV32I-NEXT: or a0, a0, t0
; RV32I-NEXT: or a0, a0, a6
; RV32I-NEXT: sll a0, a0, a5
; RV32I-NEXT: srli a3, a3, 1
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
-; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: lbu a7, 6(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: lbu a3, 5(a1)
; RV64I-NEXT: lbu a4, 4(a1)
; RV64I-NEXT: lbu a5, 6(a1)
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
-; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 1(a1)
; RV64I-NEXT: lbu a5, 0(a1)
; RV64I-NEXT: lbu a6, 2(a1)
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: slli a3, a3, 35
; RV32I-NEXT: lbu a3, 5(a0)
; RV32I-NEXT: lbu a4, 4(a0)
; RV32I-NEXT: lbu a5, 6(a0)
+; RV32I-NEXT: lbu a6, 7(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: lbu a4, 1(a1)
-; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: slli a4, a6, 24
+; RV32I-NEXT: or a5, a4, a5
; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: lbu a5, 7(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a6
-; RV32I-NEXT: lbu a6, 2(a1)
+; RV32I-NEXT: lbu a5, 1(a1)
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: or a1, a1, a4
-; RV32I-NEXT: slli a4, a1, 3
-; RV32I-NEXT: addi a6, a4, -32
-; RV32I-NEXT: sra a1, a3, a4
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: slli a5, a1, 3
+; RV32I-NEXT: addi a6, a5, -32
+; RV32I-NEXT: sra a1, a3, a5
; RV32I-NEXT: bltz a6, .LBB5_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: srai a5, a5, 31
+; RV32I-NEXT: srai a4, a4, 31
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: mv a1, a5
+; RV32I-NEXT: mv a1, a4
; RV32I-NEXT: j .LBB5_3
; RV32I-NEXT: .LBB5_2:
-; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a6, 0(a0)
; RV32I-NEXT: lbu a7, 2(a0)
; RV32I-NEXT: lbu a0, 3(a0)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a0, a0, a5
-; RV32I-NEXT: srl a0, a0, a4
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: srl a0, a0, a5
; RV32I-NEXT: slli a3, a3, 1
-; RV32I-NEXT: not a4, a4
+; RV32I-NEXT: not a4, a5
; RV32I-NEXT: sll a3, a3, a4
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: .LBB5_3:
; RV64I-NEXT: lbu a3, 9(a0)
; RV64I-NEXT: lbu a4, 8(a0)
; RV64I-NEXT: lbu a5, 10(a0)
+; RV64I-NEXT: lbu a6, 11(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 13(a0)
-; RV64I-NEXT: lbu a6, 11(a0)
-; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 14(a0)
+; RV64I-NEXT: lbu a7, 15(a0)
; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a3, a3, a6
; RV64I-NEXT: lbu a4, 5(a1)
; RV64I-NEXT: lbu a5, 4(a1)
; RV64I-NEXT: lbu a6, 6(a1)
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a4, a6, a4
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: lbu a5, 1(a1)
; RV64I-NEXT: lbu a6, 0(a1)
; RV64I-NEXT: lbu a7, 2(a1)
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a1, a1, a7
; RV64I-NEXT: or a1, a1, a5
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: slli a4, a4, 35
; RV64I-NEXT: lbu a6, 1(a0)
; RV64I-NEXT: lbu a7, 0(a0)
; RV64I-NEXT: lbu t0, 2(a0)
+; RV64I-NEXT: lbu t1, 3(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 5(a0)
-; RV64I-NEXT: lbu t1, 3(a0)
-; RV64I-NEXT: or a6, t0, a6
; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: lbu t2, 6(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a7, t2, a7
+; RV64I-NEXT: or a0, a0, t1
; RV64I-NEXT: or a0, a0, a7
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a6
-; RV64I-NEXT: or a0, a0, t1
; RV64I-NEXT: srl a0, a0, a5
; RV64I-NEXT: not a5, a5
; RV64I-NEXT: slli a3, a3, 1
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
-; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: lbu a7, 6(a0)
-; RV64I-NEXT: lbu t0, 7(a0)
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a3, a3, a6
; RV64I-NEXT: lbu a4, 5(a1)
; RV64I-NEXT: lbu a5, 4(a1)
; RV64I-NEXT: lbu a6, 6(a1)
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a4, a6, a4
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: lbu a5, 1(a1)
; RV64I-NEXT: lbu a6, 0(a1)
; RV64I-NEXT: lbu a7, 2(a1)
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a1, a1, a7
; RV64I-NEXT: or a1, a1, a5
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: slli a4, a4, 35
; RV64I-NEXT: lbu a6, 9(a0)
; RV64I-NEXT: lbu a7, 8(a0)
; RV64I-NEXT: lbu t0, 10(a0)
+; RV64I-NEXT: lbu t1, 11(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 13(a0)
-; RV64I-NEXT: lbu t1, 11(a0)
-; RV64I-NEXT: or a6, t0, a6
; RV64I-NEXT: lbu t0, 12(a0)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: lbu t2, 14(a0)
+; RV64I-NEXT: lbu t1, 14(a0)
; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a7, t2, a7
+; RV64I-NEXT: or a0, a0, t1
; RV64I-NEXT: or a0, a0, a7
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a6
-; RV64I-NEXT: or a0, a0, t1
; RV64I-NEXT: sll a0, a0, a5
; RV64I-NEXT: not a5, a5
; RV64I-NEXT: srli a3, a3, 1
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 13(a0)
; RV64I-NEXT: lbu a5, 12(a0)
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: lbu a6, 14(a0)
+; RV64I-NEXT: lbu a7, 15(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a5, a4, 32
; RV64I-NEXT: or a3, a5, a3
-; RV64I-NEXT: or a3, a3, a6
; RV64I-NEXT: lbu a5, 5(a1)
; RV64I-NEXT: lbu a6, 4(a1)
; RV64I-NEXT: lbu a7, 6(a1)
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a5, t0, a5
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: lbu a6, 1(a1)
; RV64I-NEXT: lbu a7, 0(a1)
; RV64I-NEXT: lbu t0, 2(a1)
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: or a1, a1, t0
; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: slli a5, a5, 35
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a6, 0(a0)
; RV64I-NEXT: lbu a7, 2(a0)
+; RV64I-NEXT: lbu t0, 3(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a6
; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a4, a6, a4
; RV64I-NEXT: lbu a6, 5(a0)
-; RV64I-NEXT: lbu t0, 3(a0)
-; RV64I-NEXT: or a4, a7, a4
; RV64I-NEXT: lbu a7, 4(a0)
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t0, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a6, t1, a6
+; RV64I-NEXT: or a0, a0, t0
; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a4
-; RV64I-NEXT: or a0, a0, t0
; RV64I-NEXT: srl a0, a0, a5
; RV64I-NEXT: not a4, a5
; RV64I-NEXT: slli a3, a3, 1
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a0, a0, a5
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a0, a0, a5
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: srl a0, a0, a1
; RV32I-NEXT: sb a0, 0(a2)
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a0, a0, a5
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a0, a0, a5
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: sll a0, a0, a1
; RV32I-NEXT: sb a0, 0(a2)
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a0, a0, a5
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: sraw a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a0, a0, a5
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: sra a0, a0, a1
; RV32I-NEXT: sb a0, 0(a2)
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a7, 6(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: lbu a3, 1(a1)
; RV64I-NEXT: lbu a4, 0(a1)
; RV64I-NEXT: lbu a5, 2(a1)
-; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 3(a1)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a1)
-; RV64I-NEXT: lbu a6, 3(a1)
-; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 4(a1)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: lbu a7, 6(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: srl a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
; RV64I-NEXT: srli a1, a0, 48
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: lbu a4, 1(a1)
; RV32I-NEXT: lbu a5, 0(a1)
; RV32I-NEXT: lbu a6, 2(a1)
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a5, a6, a4
-; RV32I-NEXT: or a5, a1, a5
+; RV32I-NEXT: or a5, a1, a6
+; RV32I-NEXT: or a5, a5, a4
; RV32I-NEXT: addi a4, a5, -32
; RV32I-NEXT: srl a1, a3, a5
; RV32I-NEXT: bltz a4, .LBB3_2
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a6, t0, a6
+; RV32I-NEXT: or a0, a0, t0
; RV32I-NEXT: or a0, a0, a6
; RV32I-NEXT: srl a0, a0, a5
; RV32I-NEXT: not a5, a5
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a7, 6(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: lbu a3, 1(a1)
; RV64I-NEXT: lbu a4, 0(a1)
; RV64I-NEXT: lbu a5, 2(a1)
-; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 3(a1)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a1)
-; RV64I-NEXT: lbu a6, 3(a1)
-; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 4(a1)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: lbu a7, 6(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: sll a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
; RV64I-NEXT: srli a1, a0, 48
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: lbu a4, 1(a1)
; RV32I-NEXT: lbu a5, 0(a1)
; RV32I-NEXT: lbu a6, 2(a1)
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a5, a6, a4
-; RV32I-NEXT: or a5, a1, a5
+; RV32I-NEXT: or a5, a1, a6
+; RV32I-NEXT: or a5, a5, a4
; RV32I-NEXT: addi a4, a5, -32
; RV32I-NEXT: sll a1, a3, a5
; RV32I-NEXT: bltz a4, .LBB4_2
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a6, t0, a6
+; RV32I-NEXT: or a0, a0, t0
; RV32I-NEXT: or a0, a0, a6
; RV32I-NEXT: sll a0, a0, a5
; RV32I-NEXT: not a5, a5
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a7, 6(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: lbu a3, 1(a1)
; RV64I-NEXT: lbu a4, 0(a1)
; RV64I-NEXT: lbu a5, 2(a1)
-; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 3(a1)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a1)
-; RV64I-NEXT: lbu a6, 3(a1)
-; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 4(a1)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: lbu a7, 6(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: sra a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
; RV64I-NEXT: srli a1, a0, 48
; RV32I-NEXT: lbu a3, 5(a0)
; RV32I-NEXT: lbu a4, 4(a0)
; RV32I-NEXT: lbu a5, 6(a0)
+; RV32I-NEXT: lbu a6, 7(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: lbu a4, 1(a1)
-; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: slli a4, a6, 24
+; RV32I-NEXT: or a5, a4, a5
; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: lbu a5, 7(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a6
-; RV32I-NEXT: lbu a6, 2(a1)
+; RV32I-NEXT: lbu a5, 1(a1)
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: or a4, a1, a4
-; RV32I-NEXT: addi a6, a4, -32
-; RV32I-NEXT: sra a1, a3, a4
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: or a5, a1, a5
+; RV32I-NEXT: addi a6, a5, -32
+; RV32I-NEXT: sra a1, a3, a5
; RV32I-NEXT: bltz a6, .LBB5_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: srai a5, a5, 31
+; RV32I-NEXT: srai a4, a4, 31
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: mv a1, a5
+; RV32I-NEXT: mv a1, a4
; RV32I-NEXT: j .LBB5_3
; RV32I-NEXT: .LBB5_2:
-; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a6, 0(a0)
; RV32I-NEXT: lbu a7, 2(a0)
; RV32I-NEXT: lbu a0, 3(a0)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a0, a0, a5
-; RV32I-NEXT: srl a0, a0, a4
-; RV32I-NEXT: not a4, a4
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: srl a0, a0, a5
+; RV32I-NEXT: not a4, a5
; RV32I-NEXT: slli a3, a3, 1
; RV32I-NEXT: sll a3, a3, a4
; RV32I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 13(a0)
; RV64I-NEXT: lbu a5, 12(a0)
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: lbu a6, 14(a0)
+; RV64I-NEXT: lbu a7, 15(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 1(a1)
; RV64I-NEXT: lbu a5, 0(a1)
-; RV64I-NEXT: lbu a7, 2(a1)
-; RV64I-NEXT: or a3, a3, a6
+; RV64I-NEXT: lbu a6, 2(a1)
+; RV64I-NEXT: lbu a7, 3(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: lbu a5, 5(a1)
-; RV64I-NEXT: lbu a6, 3(a1)
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: lbu a7, 4(a1)
-; RV64I-NEXT: slli a5, a5, 8
-; RV64I-NEXT: lbu t0, 6(a1)
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu a7, 6(a1)
; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or a5, a5, a7
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a5, t0, a5
+; RV64I-NEXT: or a1, a1, a7
; RV64I-NEXT: or a1, a1, a5
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a1, a1, a4
-; RV64I-NEXT: or a5, a1, a6
+; RV64I-NEXT: or a5, a1, a4
; RV64I-NEXT: addi a4, a5, -64
; RV64I-NEXT: srl a1, a3, a5
; RV64I-NEXT: bltz a4, .LBB6_2
; RV64I-NEXT: lbu a6, 1(a0)
; RV64I-NEXT: lbu a7, 0(a0)
; RV64I-NEXT: lbu t0, 2(a0)
+; RV64I-NEXT: lbu t1, 3(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 5(a0)
-; RV64I-NEXT: lbu t1, 3(a0)
-; RV64I-NEXT: or a6, t0, a6
; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: lbu t2, 6(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a7, t2, a7
+; RV64I-NEXT: or a0, a0, t1
; RV64I-NEXT: or a0, a0, a7
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a6
-; RV64I-NEXT: or a0, a0, t1
; RV64I-NEXT: srl a0, a0, a5
; RV64I-NEXT: not a5, a5
; RV64I-NEXT: slli a3, a3, 1
; RV32I-NEXT: lbu a0, 15(a0)
; RV32I-NEXT: slli s1, s1, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: or a1, a1, s1
; RV32I-NEXT: or a1, a1, s0
; RV32I-NEXT: sb zero, 43(sp)
; RV32I-NEXT: sb zero, 42(sp)
; RV32I-NEXT: or a0, a0, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a0, a5, a0
-; RV32I-NEXT: or a4, a6, a0
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a4, a4, a0
; RV32I-NEXT: andi a5, a1, 7
; RV32I-NEXT: srl a0, a4, a5
; RV32I-NEXT: lbu a1, 9(a3)
; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a1, a7, a1
-; RV32I-NEXT: or a6, t0, a1
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a6, a6, a1
; RV32I-NEXT: slli a1, a6, 1
; RV32I-NEXT: not a7, a5
; RV32I-NEXT: sll a1, a1, a7
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: or a7, t2, a7
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
; RV32I-NEXT: srl a7, a7, a5
; RV32I-NEXT: slli a4, a4, 1
; RV32I-NEXT: xori t0, a5, 31
; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or t1, t3, t1
+; RV32I-NEXT: or a3, a3, t3
; RV32I-NEXT: or a3, a3, t1
; RV32I-NEXT: slli t1, a3, 1
; RV32I-NEXT: sll t0, t1, t0
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a7, 6(a0)
-; RV64I-NEXT: lbu t0, 7(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 1(a1)
; RV64I-NEXT: lbu a5, 0(a1)
-; RV64I-NEXT: lbu a7, 2(a1)
-; RV64I-NEXT: or a3, a3, a6
+; RV64I-NEXT: lbu a6, 2(a1)
+; RV64I-NEXT: lbu a7, 3(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: lbu a5, 5(a1)
-; RV64I-NEXT: lbu a6, 3(a1)
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: lbu a7, 4(a1)
-; RV64I-NEXT: slli a5, a5, 8
-; RV64I-NEXT: lbu t0, 6(a1)
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu a7, 6(a1)
; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or a5, a5, a7
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a5, t0, a5
+; RV64I-NEXT: or a1, a1, a7
; RV64I-NEXT: or a1, a1, a5
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a1, a1, a4
-; RV64I-NEXT: or a5, a1, a6
+; RV64I-NEXT: or a5, a1, a4
; RV64I-NEXT: addi a4, a5, -64
; RV64I-NEXT: sll a1, a3, a5
; RV64I-NEXT: bltz a4, .LBB7_2
; RV64I-NEXT: lbu a6, 9(a0)
; RV64I-NEXT: lbu a7, 8(a0)
; RV64I-NEXT: lbu t0, 10(a0)
+; RV64I-NEXT: lbu t1, 11(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 13(a0)
-; RV64I-NEXT: lbu t1, 11(a0)
-; RV64I-NEXT: or a6, t0, a6
; RV64I-NEXT: lbu t0, 12(a0)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: lbu t2, 14(a0)
+; RV64I-NEXT: lbu t1, 14(a0)
; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a7, t2, a7
+; RV64I-NEXT: or a0, a0, t1
; RV64I-NEXT: or a0, a0, a7
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a6
-; RV64I-NEXT: or a0, a0, t1
; RV64I-NEXT: sll a0, a0, a5
; RV64I-NEXT: not a5, a5
; RV64I-NEXT: srli a3, a3, 1
; RV32I-NEXT: lbu a0, 15(a0)
; RV32I-NEXT: slli s1, s1, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: or a1, a1, s1
; RV32I-NEXT: or a1, a1, s0
; RV32I-NEXT: sb zero, 27(sp)
; RV32I-NEXT: sb zero, 26(sp)
; RV32I-NEXT: or a0, a0, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a0, a5, a0
-; RV32I-NEXT: or a4, a6, a0
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a4, a4, a0
; RV32I-NEXT: andi a5, a1, 7
; RV32I-NEXT: sll a0, a4, a5
; RV32I-NEXT: lbu a1, 1(a3)
; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a1, a7, a1
-; RV32I-NEXT: or a6, t0, a1
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a6, a6, a1
; RV32I-NEXT: srli a1, a6, 1
; RV32I-NEXT: xori a7, a5, 31
; RV32I-NEXT: srl a1, a1, a7
; RV32I-NEXT: or t0, t0, t1
; RV32I-NEXT: slli t2, t2, 16
; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t0, t2, t0
-; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
; RV32I-NEXT: sll t0, t0, a5
; RV32I-NEXT: lbu t1, 9(a3)
; RV32I-NEXT: lbu t2, 8(a3)
; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or t1, t3, t1
+; RV32I-NEXT: or a3, a3, t3
; RV32I-NEXT: or a3, a3, t1
; RV32I-NEXT: srli t1, a3, 1
; RV32I-NEXT: srl a7, t1, a7
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 13(a0)
; RV64I-NEXT: lbu a5, 12(a0)
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: lbu a6, 14(a0)
+; RV64I-NEXT: lbu a7, 15(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a5, a4, 32
; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 1(a1)
-; RV64I-NEXT: lbu a7, 0(a1)
-; RV64I-NEXT: lbu t0, 2(a1)
-; RV64I-NEXT: or a3, a3, a6
+; RV64I-NEXT: lbu a6, 0(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu t0, 3(a1)
; RV64I-NEXT: slli a5, a5, 8
-; RV64I-NEXT: or a5, a5, a7
-; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: lbu a6, 5(a1)
-; RV64I-NEXT: lbu a7, 3(a1)
-; RV64I-NEXT: or a5, t0, a5
-; RV64I-NEXT: lbu t0, 4(a1)
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a7, 4(a1)
+; RV64I-NEXT: lbu t0, 6(a1)
; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or a6, a6, t0
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a6, t1, a6
+; RV64I-NEXT: or a1, a1, t0
; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a1, a1, a5
-; RV64I-NEXT: or a5, a1, a7
+; RV64I-NEXT: or a5, a1, a5
; RV64I-NEXT: addi a6, a5, -64
; RV64I-NEXT: sra a1, a3, a5
; RV64I-NEXT: bltz a6, .LBB8_2
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a6, 0(a0)
; RV64I-NEXT: lbu a7, 2(a0)
+; RV64I-NEXT: lbu t0, 3(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a6
; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a4, a6, a4
; RV64I-NEXT: lbu a6, 5(a0)
-; RV64I-NEXT: lbu t0, 3(a0)
-; RV64I-NEXT: or a4, a7, a4
; RV64I-NEXT: lbu a7, 4(a0)
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t0, 6(a0)
; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a6, t1, a6
+; RV64I-NEXT: or a0, a0, t0
; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a4
-; RV64I-NEXT: or a0, a0, t0
; RV64I-NEXT: srl a0, a0, a5
; RV64I-NEXT: not a4, a5
; RV64I-NEXT: slli a3, a3, 1
; RV32I-NEXT: lbu a0, 14(a0)
; RV32I-NEXT: slli s2, s2, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s2
; RV32I-NEXT: or a1, a1, s1
; RV32I-NEXT: sb a3, 23(sp)
; RV32I-NEXT: sb a0, 22(sp)
; RV32I-NEXT: or a0, a0, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a0, a5, a0
-; RV32I-NEXT: or a4, a6, a0
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a4, a4, a0
; RV32I-NEXT: andi a5, a1, 7
; RV32I-NEXT: srl a0, a4, a5
; RV32I-NEXT: lbu a1, 9(a3)
; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a1, a7, a1
-; RV32I-NEXT: or a6, t0, a1
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a6, a6, a1
; RV32I-NEXT: slli a1, a6, 1
; RV32I-NEXT: not a7, a5
; RV32I-NEXT: sll a1, a1, a7
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: or a7, t2, a7
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
; RV32I-NEXT: srl a7, a7, a5
; RV32I-NEXT: slli a4, a4, 1
; RV32I-NEXT: xori t0, a5, 31
; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or t1, t3, t1
+; RV32I-NEXT: or a3, a3, t3
; RV32I-NEXT: or a3, a3, t1
; RV32I-NEXT: slli t1, a3, 1
; RV32I-NEXT: sll t0, t1, t0
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: lbu s8, 20(a0)
; RV64I-NEXT: lbu s9, 1(a1)
; RV64I-NEXT: lbu s10, 0(a1)
; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu ra, 3(a1)
; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or s9, s9, s10
; RV64I-NEXT: slli s11, s11, 16
+; RV64I-NEXT: slli ra, ra, 24
; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: lbu ra, 4(a1)
+; RV64I-NEXT: or s11, ra, s11
; RV64I-NEXT: or s9, s11, s9
-; RV64I-NEXT: lbu s11, 6(a1)
+; RV64I-NEXT: lbu s11, 4(a1)
; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: or s10, s10, ra
-; RV64I-NEXT: lbu ra, 7(a1)
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: or s10, s11, s10
+; RV64I-NEXT: lbu ra, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: or s10, s10, s11
; RV64I-NEXT: lbu s11, 21(a0)
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: or s10, ra, s10
+; RV64I-NEXT: slli ra, ra, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, ra
; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: lbu a1, 3(a1)
-; RV64I-NEXT: slli s10, s10, 32
-; RV64I-NEXT: or s9, s10, s9
+; RV64I-NEXT: or a1, a1, s10
; RV64I-NEXT: lbu s10, 23(a0)
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or t0, s9, a1
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t0, a1, s9
; RV64I-NEXT: lbu s9, 24(a0)
; RV64I-NEXT: lbu a7, 25(a0)
; RV64I-NEXT: lbu a6, 26(a0)
; RV64I-NEXT: sb s0, 68(sp)
; RV64I-NEXT: sb t6, 67(sp)
; RV64I-NEXT: sb t5, 66(sp)
+; RV64I-NEXT: sb t4, 65(sp)
; RV64I-NEXT: sb zero, 119(sp)
; RV64I-NEXT: sb zero, 118(sp)
; RV64I-NEXT: sb zero, 117(sp)
; RV64I-NEXT: sb zero, 90(sp)
; RV64I-NEXT: sb zero, 89(sp)
; RV64I-NEXT: sb zero, 88(sp)
-; RV64I-NEXT: sb t4, 65(sp)
; RV64I-NEXT: sb t3, 64(sp)
; RV64I-NEXT: sb t2, 63(sp)
; RV64I-NEXT: sb t1, 62(sp)
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: slli a5, a5, 24
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: lbu a1, 13(a3)
; RV64I-NEXT: lbu a4, 12(a3)
-; RV64I-NEXT: lbu a6, 14(a3)
-; RV64I-NEXT: lbu a7, 15(a3)
+; RV64I-NEXT: lbu a5, 14(a3)
+; RV64I-NEXT: lbu a6, 15(a3)
; RV64I-NEXT: slli a1, a1, 8
; RV64I-NEXT: or a1, a1, a4
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a1, a6, a1
-; RV64I-NEXT: or a1, a7, a1
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a1, a4, a1
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a0, a1, a0
-; RV64I-NEXT: or a4, a0, a5
+; RV64I-NEXT: or a4, a1, a0
; RV64I-NEXT: andi a1, t0, 7
; RV64I-NEXT: lbu a0, 17(a3)
; RV64I-NEXT: lbu a5, 16(a3)
; RV64I-NEXT: or a0, a0, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a0, a5, a0
; RV64I-NEXT: lbu a5, 21(a3)
; RV64I-NEXT: lbu a6, 20(a3)
-; RV64I-NEXT: lbu t0, 22(a3)
-; RV64I-NEXT: lbu t1, 23(a3)
+; RV64I-NEXT: lbu a7, 22(a3)
+; RV64I-NEXT: lbu t0, 23(a3)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a5, t0, a5
-; RV64I-NEXT: or a5, t1, a5
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: or a5, a0, a7
+; RV64I-NEXT: or a5, a5, a0
; RV64I-NEXT: slli a0, a5, 1
; RV64I-NEXT: not a6, a1
; RV64I-NEXT: sll a0, a0, a6
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 5(a3)
; RV64I-NEXT: lbu t0, 4(a3)
-; RV64I-NEXT: lbu t2, 6(a3)
-; RV64I-NEXT: lbu t3, 7(a3)
+; RV64I-NEXT: lbu t1, 6(a3)
+; RV64I-NEXT: lbu t2, 7(a3)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: or a7, t2, a7
-; RV64I-NEXT: or a7, t3, a7
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or a7, t0, a7
; RV64I-NEXT: slli a7, a7, 32
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 25(a3)
; RV64I-NEXT: lbu t0, 24(a3)
-; RV64I-NEXT: lbu t2, 26(a3)
-; RV64I-NEXT: or a6, a6, t1
+; RV64I-NEXT: lbu t1, 26(a3)
+; RV64I-NEXT: lbu t2, 27(a3)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or a7, t0, a7
; RV64I-NEXT: lbu t0, 29(a3)
-; RV64I-NEXT: or a7, t2, a7
; RV64I-NEXT: lbu t1, 28(a3)
; RV64I-NEXT: lbu t2, 30(a3)
+; RV64I-NEXT: lbu a3, 31(a3)
; RV64I-NEXT: slli t0, t0, 8
-; RV64I-NEXT: lbu t3, 31(a3)
; RV64I-NEXT: or t0, t0, t1
; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: or t0, t2, t0
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: or t0, t3, t0
+; RV64I-NEXT: slli a3, a3, 24
+; RV64I-NEXT: or a3, a3, t2
; RV64I-NEXT: slli t1, a4, 1
-; RV64I-NEXT: lbu a3, 27(a3)
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or a3, a3, t0
; RV64I-NEXT: xori t0, a1, 63
; RV64I-NEXT: sll t1, t1, t0
-; RV64I-NEXT: slli a3, a3, 24
-; RV64I-NEXT: or a3, a7, a3
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: or a3, a3, a7
; RV64I-NEXT: slli a7, a3, 1
; RV64I-NEXT: sll a7, a7, t0
; RV64I-NEXT: srl a4, a4, a1
; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: lbu s8, 1(a1)
; RV32I-NEXT: lbu s9, 20(a0)
-; RV32I-NEXT: lbu s10, 0(a1)
-; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu s10, 21(a0)
+; RV32I-NEXT: lbu s11, 0(a1)
; RV32I-NEXT: slli s8, s8, 8
; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: or s8, s8, s10
-; RV32I-NEXT: lbu s10, 22(a0)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: or s8, s8, s11
+; RV32I-NEXT: lbu s11, 22(a0)
; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: or s8, ra, s8
-; RV32I-NEXT: lbu ra, 23(a0)
; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: lbu ra, 23(a0)
; RV32I-NEXT: or t0, a1, s8
; RV32I-NEXT: lbu s8, 24(a0)
; RV32I-NEXT: lbu a7, 25(a0)
; RV32I-NEXT: sb a7, 53(sp)
; RV32I-NEXT: sb s8, 52(sp)
; RV32I-NEXT: sb ra, 51(sp)
-; RV32I-NEXT: sb s10, 50(sp)
-; RV32I-NEXT: sb s11, 49(sp)
+; RV32I-NEXT: sb s11, 50(sp)
+; RV32I-NEXT: sb s10, 49(sp)
; RV32I-NEXT: sb s9, 48(sp)
; RV32I-NEXT: sb s7, 47(sp)
; RV32I-NEXT: sb s6, 46(sp)
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: slli a3, a3, 16
; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: or t4, a5, a0
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: or t4, a3, a0
; RV32I-NEXT: andi a3, t0, 7
; RV32I-NEXT: lbu a0, 9(a4)
; RV32I-NEXT: lbu a1, 8(a4)
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a0, a5, a0
-; RV32I-NEXT: or a6, a6, a0
+; RV32I-NEXT: or a1, a6, a5
+; RV32I-NEXT: or a6, a1, a0
; RV32I-NEXT: slli a0, a6, 1
; RV32I-NEXT: not t0, a3
; RV32I-NEXT: sll a0, a0, t0
; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a1, a7, a1
-; RV32I-NEXT: or t1, t1, a1
+; RV32I-NEXT: or a5, t1, a7
+; RV32I-NEXT: or t1, a5, a1
; RV32I-NEXT: slli a1, t4, 1
; RV32I-NEXT: xori t2, a3, 31
; RV32I-NEXT: sll a1, a1, t2
; RV32I-NEXT: or a5, a5, a7
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli t5, t5, 24
-; RV32I-NEXT: or a5, t3, a5
-; RV32I-NEXT: or t3, t5, a5
+; RV32I-NEXT: or a7, t5, t3
+; RV32I-NEXT: or t3, a7, a5
; RV32I-NEXT: lbu a5, 17(a4)
; RV32I-NEXT: lbu a7, 16(a4)
; RV32I-NEXT: lbu t5, 18(a4)
; RV32I-NEXT: or a5, a5, a7
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a5, t5, a5
-; RV32I-NEXT: or a5, t6, a5
+; RV32I-NEXT: or a7, t6, t5
+; RV32I-NEXT: or a5, a7, a5
; RV32I-NEXT: slli a7, a5, 1
; RV32I-NEXT: sll a7, a7, t0
; RV32I-NEXT: lbu t5, 21(a4)
; RV32I-NEXT: or t5, t5, t6
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
; RV32I-NEXT: or t5, s0, t5
-; RV32I-NEXT: or t5, s1, t5
; RV32I-NEXT: lbu t6, 25(a4)
; RV32I-NEXT: lbu s0, 24(a4)
; RV32I-NEXT: lbu s1, 26(a4)
; RV32I-NEXT: or t6, t6, s0
; RV32I-NEXT: slli s1, s1, 16
; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s1, t6
-; RV32I-NEXT: or t6, s2, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: or t6, s0, t6
; RV32I-NEXT: lbu s0, 29(a4)
-; RV32I-NEXT: slli s1, t6, 1
-; RV32I-NEXT: lbu s2, 28(a4)
-; RV32I-NEXT: sll t0, s1, t0
+; RV32I-NEXT: lbu s1, 28(a4)
+; RV32I-NEXT: slli s2, t6, 1
+; RV32I-NEXT: sll t0, s2, t0
; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: or s0, s0, s1
; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: or s0, s0, s2
+; RV32I-NEXT: lbu a4, 31(a4)
; RV32I-NEXT: slli s2, t3, 1
; RV32I-NEXT: sll s2, s2, t2
; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: lbu a4, 31(a4)
-; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: slli a4, a4, 24
+; RV32I-NEXT: or a4, a4, s1
; RV32I-NEXT: slli s1, t5, 1
; RV32I-NEXT: sll s1, s1, t2
-; RV32I-NEXT: slli a4, a4, 24
; RV32I-NEXT: or a4, a4, s0
; RV32I-NEXT: slli s0, a4, 1
; RV32I-NEXT: sll t2, s0, t2
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: lbu s8, 20(a0)
; RV64I-NEXT: lbu s9, 1(a1)
; RV64I-NEXT: lbu s10, 0(a1)
; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu ra, 3(a1)
; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or s9, s9, s10
; RV64I-NEXT: slli s11, s11, 16
+; RV64I-NEXT: slli ra, ra, 24
; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: lbu ra, 4(a1)
+; RV64I-NEXT: or s11, ra, s11
; RV64I-NEXT: or s9, s11, s9
-; RV64I-NEXT: lbu s11, 6(a1)
+; RV64I-NEXT: lbu s11, 4(a1)
; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: or s10, s10, ra
-; RV64I-NEXT: lbu ra, 7(a1)
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: or s10, s11, s10
+; RV64I-NEXT: lbu ra, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: or s10, s10, s11
; RV64I-NEXT: lbu s11, 21(a0)
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: or s10, ra, s10
+; RV64I-NEXT: slli ra, ra, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, ra
; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: lbu a1, 3(a1)
-; RV64I-NEXT: slli s10, s10, 32
-; RV64I-NEXT: or s9, s10, s9
+; RV64I-NEXT: or a1, a1, s10
; RV64I-NEXT: lbu s10, 23(a0)
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or t0, s9, a1
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t0, a1, s9
; RV64I-NEXT: lbu s9, 24(a0)
; RV64I-NEXT: lbu a7, 25(a0)
; RV64I-NEXT: lbu a6, 26(a0)
; RV64I-NEXT: sb t6, 99(sp)
; RV64I-NEXT: sb t5, 98(sp)
; RV64I-NEXT: sb t4, 97(sp)
+; RV64I-NEXT: sb t3, 96(sp)
; RV64I-NEXT: sb zero, 87(sp)
; RV64I-NEXT: sb zero, 86(sp)
; RV64I-NEXT: sb zero, 85(sp)
; RV64I-NEXT: sb zero, 58(sp)
; RV64I-NEXT: sb zero, 57(sp)
; RV64I-NEXT: sb zero, 56(sp)
-; RV64I-NEXT: sb t3, 96(sp)
; RV64I-NEXT: sb t2, 95(sp)
; RV64I-NEXT: sb t1, 94(sp)
; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: or a1, a1, a3
; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: slli a5, a5, 24
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: or a1, a4, a1
; RV64I-NEXT: lbu a3, 13(a0)
; RV64I-NEXT: lbu a4, 12(a0)
-; RV64I-NEXT: lbu a6, 14(a0)
-; RV64I-NEXT: lbu a7, 15(a0)
+; RV64I-NEXT: lbu a5, 14(a0)
+; RV64I-NEXT: lbu a6, 15(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a3, a7, a3
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a1, a3, a1
-; RV64I-NEXT: or a3, a1, a5
-; RV64I-NEXT: lbu a1, 1(a0)
-; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
-; RV64I-NEXT: slli a1, a1, 8
-; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a1, a5, a1
-; RV64I-NEXT: lbu a4, 5(a0)
-; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a7, 6(a0)
-; RV64I-NEXT: lbu t1, 7(a0)
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: or a3, a3, a1
+; RV64I-NEXT: andi a1, t0, 7
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 0(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
+; RV64I-NEXT: lbu a7, 3(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 5(a0)
+; RV64I-NEXT: lbu a6, 4(a0)
+; RV64I-NEXT: lbu a7, 6(a0)
+; RV64I-NEXT: lbu t0, 7(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: or a4, t1, a4
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: lbu a4, 25(a0)
-; RV64I-NEXT: lbu a5, 24(a0)
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 25(a0)
+; RV64I-NEXT: lbu a6, 24(a0)
; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: or a6, a1, a6
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: lbu t0, 27(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: lbu a1, 29(a0)
-; RV64I-NEXT: lbu a5, 27(a0)
-; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 29(a0)
; RV64I-NEXT: lbu a7, 28(a0)
-; RV64I-NEXT: slli a1, a1, 8
-; RV64I-NEXT: lbu t1, 30(a0)
-; RV64I-NEXT: lbu t2, 31(a0)
-; RV64I-NEXT: or a1, a1, a7
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or a1, t1, a1
-; RV64I-NEXT: or a1, t2, a1
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a1, a1, a4
-; RV64I-NEXT: lbu a4, 17(a0)
+; RV64I-NEXT: lbu t0, 30(a0)
+; RV64I-NEXT: lbu t1, 31(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 17(a0)
; RV64I-NEXT: lbu a7, 16(a0)
-; RV64I-NEXT: lbu t1, 18(a0)
-; RV64I-NEXT: or a5, a1, a5
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: or a1, a4, a7
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: or a1, t1, a1
-; RV64I-NEXT: lbu a4, 21(a0)
-; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 18(a0)
+; RV64I-NEXT: lbu t1, 19(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: lbu a7, 21(a0)
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: lbu t0, 20(a0)
+; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: lbu t1, 22(a0)
-; RV64I-NEXT: andi t0, t0, 7
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: or a4, a4, a7
+; RV64I-NEXT: lbu a0, 23(a0)
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: srli t0, a4, 1
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: lbu a7, 23(a0)
-; RV64I-NEXT: or a4, t1, a4
-; RV64I-NEXT: srli t1, a6, 1
-; RV64I-NEXT: lbu t2, 19(a0)
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a4, a7, a4
-; RV64I-NEXT: xori a7, t0, 63
-; RV64I-NEXT: srl a0, t1, a7
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: or a4, a1, t2
-; RV64I-NEXT: srli a1, a4, 1
-; RV64I-NEXT: srl a7, a1, a7
-; RV64I-NEXT: srli a1, a3, 1
-; RV64I-NEXT: not t1, t0
-; RV64I-NEXT: srl t1, a1, t1
-; RV64I-NEXT: sll a1, a3, t0
-; RV64I-NEXT: sll a3, a5, t0
-; RV64I-NEXT: sll a4, a4, t0
-; RV64I-NEXT: sll a5, a6, t0
-; RV64I-NEXT: srli a6, a4, 56
-; RV64I-NEXT: sb a6, 23(a2)
-; RV64I-NEXT: srli a6, a4, 48
-; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a4, 40
-; RV64I-NEXT: sb a6, 21(a2)
-; RV64I-NEXT: srli a6, a4, 32
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: srli a6, a4, 24
-; RV64I-NEXT: sb a6, 19(a2)
-; RV64I-NEXT: srli a6, a4, 16
-; RV64I-NEXT: sb a6, 18(a2)
-; RV64I-NEXT: or a6, a4, t1
-; RV64I-NEXT: srli a4, a4, 8
-; RV64I-NEXT: sb a4, 17(a2)
-; RV64I-NEXT: srli a4, a3, 56
-; RV64I-NEXT: sb a4, 31(a2)
-; RV64I-NEXT: srli a4, a3, 48
-; RV64I-NEXT: sb a4, 30(a2)
-; RV64I-NEXT: srli a4, a3, 40
-; RV64I-NEXT: sb a4, 29(a2)
-; RV64I-NEXT: srli a4, a3, 32
-; RV64I-NEXT: sb a4, 28(a2)
-; RV64I-NEXT: srli a4, a3, 24
-; RV64I-NEXT: sb a4, 27(a2)
-; RV64I-NEXT: srli a4, a3, 16
-; RV64I-NEXT: sb a4, 26(a2)
-; RV64I-NEXT: or a4, a3, a7
-; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 25(a2)
-; RV64I-NEXT: srli a3, a5, 56
-; RV64I-NEXT: sb a3, 7(a2)
-; RV64I-NEXT: srli a3, a5, 48
-; RV64I-NEXT: sb a3, 6(a2)
-; RV64I-NEXT: srli a3, a5, 40
-; RV64I-NEXT: sb a3, 5(a2)
-; RV64I-NEXT: srli a3, a5, 32
-; RV64I-NEXT: sb a3, 4(a2)
-; RV64I-NEXT: srli a3, a5, 24
-; RV64I-NEXT: sb a3, 3(a2)
-; RV64I-NEXT: srli a3, a5, 16
-; RV64I-NEXT: sb a3, 2(a2)
-; RV64I-NEXT: sb a5, 0(a2)
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or t1, a0, t1
+; RV64I-NEXT: xori t2, a1, 63
+; RV64I-NEXT: srl a0, t0, t2
+; RV64I-NEXT: or a7, t1, a7
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: srli a7, a6, 1
+; RV64I-NEXT: srl a7, a7, t2
+; RV64I-NEXT: srli t0, a3, 1
+; RV64I-NEXT: not t1, a1
+; RV64I-NEXT: srl t0, t0, t1
+; RV64I-NEXT: sll a3, a3, a1
+; RV64I-NEXT: sll a5, a5, a1
+; RV64I-NEXT: sll a6, a6, a1
+; RV64I-NEXT: sll a1, a4, a1
+; RV64I-NEXT: srli a4, a6, 56
+; RV64I-NEXT: sb a4, 23(a2)
+; RV64I-NEXT: srli a4, a6, 48
+; RV64I-NEXT: sb a4, 22(a2)
+; RV64I-NEXT: srli a4, a6, 40
+; RV64I-NEXT: sb a4, 21(a2)
+; RV64I-NEXT: srli a4, a6, 32
+; RV64I-NEXT: sb a4, 20(a2)
+; RV64I-NEXT: srli a4, a6, 24
+; RV64I-NEXT: sb a4, 19(a2)
+; RV64I-NEXT: srli a4, a6, 16
+; RV64I-NEXT: sb a4, 18(a2)
+; RV64I-NEXT: or a4, a6, t0
+; RV64I-NEXT: srli a6, a6, 8
+; RV64I-NEXT: sb a6, 17(a2)
+; RV64I-NEXT: srli a6, a5, 56
+; RV64I-NEXT: sb a6, 31(a2)
+; RV64I-NEXT: srli a6, a5, 48
+; RV64I-NEXT: sb a6, 30(a2)
+; RV64I-NEXT: srli a6, a5, 40
+; RV64I-NEXT: sb a6, 29(a2)
+; RV64I-NEXT: srli a6, a5, 32
+; RV64I-NEXT: sb a6, 28(a2)
+; RV64I-NEXT: srli a6, a5, 24
+; RV64I-NEXT: sb a6, 27(a2)
+; RV64I-NEXT: srli a6, a5, 16
+; RV64I-NEXT: sb a6, 26(a2)
+; RV64I-NEXT: or a6, a5, a7
; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 1(a2)
-; RV64I-NEXT: srli a3, a1, 56
-; RV64I-NEXT: sb a3, 15(a2)
-; RV64I-NEXT: srli a3, a1, 48
-; RV64I-NEXT: sb a3, 14(a2)
-; RV64I-NEXT: srli a3, a1, 40
-; RV64I-NEXT: sb a3, 13(a2)
-; RV64I-NEXT: srli a3, a1, 32
-; RV64I-NEXT: sb a3, 12(a2)
-; RV64I-NEXT: srli a3, a1, 24
-; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: srli a3, a1, 16
-; RV64I-NEXT: sb a3, 10(a2)
-; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: sb a5, 25(a2)
+; RV64I-NEXT: srli a5, a1, 56
+; RV64I-NEXT: sb a5, 7(a2)
+; RV64I-NEXT: srli a5, a1, 48
+; RV64I-NEXT: sb a5, 6(a2)
+; RV64I-NEXT: srli a5, a1, 40
+; RV64I-NEXT: sb a5, 5(a2)
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: srli a5, a1, 24
+; RV64I-NEXT: sb a5, 3(a2)
+; RV64I-NEXT: srli a5, a1, 16
+; RV64I-NEXT: sb a5, 2(a2)
+; RV64I-NEXT: sb a1, 0(a2)
; RV64I-NEXT: srli a1, a1, 8
-; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a6, 16(a2)
-; RV64I-NEXT: sb a4, 24(a2)
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a3, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a3, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a3, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: or a0, a3, a0
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 9(a2)
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb a6, 24(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: lbu s8, 1(a1)
; RV32I-NEXT: lbu s9, 20(a0)
-; RV32I-NEXT: lbu s10, 0(a1)
-; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu s10, 21(a0)
+; RV32I-NEXT: lbu s11, 0(a1)
; RV32I-NEXT: slli s8, s8, 8
; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: or s8, s8, s10
-; RV32I-NEXT: lbu s10, 22(a0)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: or s8, s8, s11
+; RV32I-NEXT: lbu s11, 22(a0)
; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: or s8, ra, s8
-; RV32I-NEXT: lbu ra, 23(a0)
; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: lbu ra, 23(a0)
; RV32I-NEXT: or t0, a1, s8
; RV32I-NEXT: lbu s8, 24(a0)
; RV32I-NEXT: lbu a7, 25(a0)
; RV32I-NEXT: sb a7, 85(sp)
; RV32I-NEXT: sb s8, 84(sp)
; RV32I-NEXT: sb ra, 83(sp)
-; RV32I-NEXT: sb s10, 82(sp)
-; RV32I-NEXT: sb s11, 81(sp)
+; RV32I-NEXT: sb s11, 82(sp)
+; RV32I-NEXT: sb s10, 81(sp)
; RV32I-NEXT: sb s9, 80(sp)
; RV32I-NEXT: sb s7, 79(sp)
; RV32I-NEXT: sb s6, 78(sp)
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: slli a3, a3, 16
; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: or t4, a4, a0
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or t3, a3, a0
; RV32I-NEXT: andi a1, t0, 7
; RV32I-NEXT: lbu a0, 1(a5)
; RV32I-NEXT: lbu a3, 0(a5)
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: or a6, a6, a0
+; RV32I-NEXT: or a3, a6, a4
+; RV32I-NEXT: or a6, a3, a0
; RV32I-NEXT: srli a0, a6, 1
-; RV32I-NEXT: xori t0, a1, 31
-; RV32I-NEXT: srl a0, a0, t0
+; RV32I-NEXT: xori a7, a1, 31
+; RV32I-NEXT: srl a0, a0, a7
; RV32I-NEXT: lbu a3, 13(a5)
; RV32I-NEXT: lbu a4, 12(a5)
-; RV32I-NEXT: lbu a7, 14(a5)
+; RV32I-NEXT: lbu t0, 14(a5)
; RV32I-NEXT: lbu t1, 15(a5)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a3, a7, a3
-; RV32I-NEXT: or t1, t1, a3
+; RV32I-NEXT: or a4, t1, t0
+; RV32I-NEXT: or t0, a4, a3
; RV32I-NEXT: lbu a3, 9(a5)
; RV32I-NEXT: lbu a4, 8(a5)
-; RV32I-NEXT: lbu a7, 10(a5)
+; RV32I-NEXT: lbu t1, 10(a5)
; RV32I-NEXT: lbu t2, 11(a5)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a7, a3
-; RV32I-NEXT: or t2, t2, a3
-; RV32I-NEXT: srli a3, t2, 1
-; RV32I-NEXT: srl a3, a3, t0
-; RV32I-NEXT: srli a4, t4, 1
-; RV32I-NEXT: not t3, a1
-; RV32I-NEXT: srl a7, a4, t3
-; RV32I-NEXT: lbu a4, 21(a5)
+; RV32I-NEXT: or a4, t2, t1
+; RV32I-NEXT: or t1, a4, a3
+; RV32I-NEXT: srli a3, t1, 1
+; RV32I-NEXT: srl a3, a3, a7
+; RV32I-NEXT: srli a4, t3, 1
+; RV32I-NEXT: not t2, a1
+; RV32I-NEXT: lbu t4, 21(a5)
; RV32I-NEXT: lbu t5, 20(a5)
; RV32I-NEXT: lbu t6, 22(a5)
; RV32I-NEXT: lbu s0, 23(a5)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, t5
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: or t4, t4, t5
; RV32I-NEXT: slli t6, t6, 16
; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or a4, t6, a4
-; RV32I-NEXT: or a4, s0, a4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or t4, t5, t4
; RV32I-NEXT: lbu t5, 17(a5)
; RV32I-NEXT: lbu t6, 16(a5)
; RV32I-NEXT: lbu s0, 18(a5)
; RV32I-NEXT: or t5, t5, t6
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
; RV32I-NEXT: or t5, s0, t5
-; RV32I-NEXT: or t5, s1, t5
; RV32I-NEXT: lbu t6, 29(a5)
; RV32I-NEXT: lbu s0, 28(a5)
; RV32I-NEXT: lbu s1, 30(a5)
; RV32I-NEXT: or t6, t6, s0
; RV32I-NEXT: slli s1, s1, 16
; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: lbu s0, 25(a5)
-; RV32I-NEXT: or t6, s1, t6
-; RV32I-NEXT: lbu s1, 24(a5)
-; RV32I-NEXT: or t6, s2, t6
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: lbu s2, 26(a5)
-; RV32I-NEXT: or s0, s0, s1
-; RV32I-NEXT: srli s1, t5, 1
-; RV32I-NEXT: srl s1, s1, t0
-; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 25(a5)
+; RV32I-NEXT: lbu s2, 24(a5)
+; RV32I-NEXT: srl a4, a4, t2
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or s0, s1, s2
+; RV32I-NEXT: lbu s1, 26(a5)
; RV32I-NEXT: lbu a5, 27(a5)
-; RV32I-NEXT: or s0, s2, s0
-; RV32I-NEXT: srli s2, t1, 1
-; RV32I-NEXT: srl s2, s2, t3
+; RV32I-NEXT: srli s2, t5, 1
+; RV32I-NEXT: srl s2, s2, a7
+; RV32I-NEXT: slli s1, s1, 16
; RV32I-NEXT: slli a5, a5, 24
+; RV32I-NEXT: or a5, a5, s1
+; RV32I-NEXT: srli s1, t0, 1
+; RV32I-NEXT: srl s1, s1, t2
; RV32I-NEXT: or a5, a5, s0
; RV32I-NEXT: srli s0, a5, 1
-; RV32I-NEXT: srl t0, s0, t0
-; RV32I-NEXT: srli s0, a4, 1
-; RV32I-NEXT: srl t3, s0, t3
-; RV32I-NEXT: sll t4, t4, a1
+; RV32I-NEXT: srl a7, s0, a7
+; RV32I-NEXT: srli s0, t4, 1
+; RV32I-NEXT: srl t2, s0, t2
+; RV32I-NEXT: sll t3, t3, a1
+; RV32I-NEXT: sll t0, t0, a1
; RV32I-NEXT: sll t1, t1, a1
-; RV32I-NEXT: sll t2, t2, a1
-; RV32I-NEXT: sll a4, a4, a1
+; RV32I-NEXT: sll t4, t4, a1
; RV32I-NEXT: sll t5, t5, a1
; RV32I-NEXT: sll t6, t6, a1
; RV32I-NEXT: sll a5, a5, a1
; RV32I-NEXT: sb a6, 27(a2)
; RV32I-NEXT: srli a6, a5, 16
; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: or a6, a5, t3
+; RV32I-NEXT: or a6, a5, t2
; RV32I-NEXT: srli a5, a5, 8
; RV32I-NEXT: sb a5, 25(a2)
; RV32I-NEXT: srli a5, t6, 24
; RV32I-NEXT: sb a5, 31(a2)
; RV32I-NEXT: srli a5, t6, 16
; RV32I-NEXT: sb a5, 30(a2)
-; RV32I-NEXT: or a5, t6, t0
-; RV32I-NEXT: srli t0, t6, 8
-; RV32I-NEXT: sb t0, 29(a2)
-; RV32I-NEXT: srli t0, t5, 24
-; RV32I-NEXT: sb t0, 19(a2)
-; RV32I-NEXT: srli t0, t5, 16
-; RV32I-NEXT: sb t0, 18(a2)
-; RV32I-NEXT: or t0, t5, s2
-; RV32I-NEXT: srli t3, t5, 8
-; RV32I-NEXT: sb t3, 17(a2)
-; RV32I-NEXT: srli t3, a4, 24
-; RV32I-NEXT: sb t3, 23(a2)
-; RV32I-NEXT: srli t3, a4, 16
-; RV32I-NEXT: sb t3, 22(a2)
-; RV32I-NEXT: or s1, a4, s1
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 21(a2)
-; RV32I-NEXT: srli a4, t2, 24
-; RV32I-NEXT: sb a4, 11(a2)
-; RV32I-NEXT: srli a4, t2, 16
-; RV32I-NEXT: sb a4, 10(a2)
-; RV32I-NEXT: or a4, t2, a7
-; RV32I-NEXT: srli a7, t2, 8
-; RV32I-NEXT: sb a7, 9(a2)
-; RV32I-NEXT: srli a7, t1, 24
-; RV32I-NEXT: sb a7, 15(a2)
-; RV32I-NEXT: srli a7, t1, 16
-; RV32I-NEXT: sb a7, 14(a2)
-; RV32I-NEXT: or a3, t1, a3
-; RV32I-NEXT: srli a7, t1, 8
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: srli a7, a1, 24
-; RV32I-NEXT: sb a7, 3(a2)
-; RV32I-NEXT: srli a7, a1, 16
-; RV32I-NEXT: sb a7, 2(a2)
+; RV32I-NEXT: or a5, t6, a7
+; RV32I-NEXT: srli a7, t6, 8
+; RV32I-NEXT: sb a7, 29(a2)
+; RV32I-NEXT: srli a7, t5, 24
+; RV32I-NEXT: sb a7, 19(a2)
+; RV32I-NEXT: srli a7, t5, 16
+; RV32I-NEXT: sb a7, 18(a2)
+; RV32I-NEXT: or a7, t5, s1
+; RV32I-NEXT: srli t2, t5, 8
+; RV32I-NEXT: sb t2, 17(a2)
+; RV32I-NEXT: srli t2, t4, 24
+; RV32I-NEXT: sb t2, 23(a2)
+; RV32I-NEXT: srli t2, t4, 16
+; RV32I-NEXT: sb t2, 22(a2)
+; RV32I-NEXT: or t2, t4, s2
+; RV32I-NEXT: srli t4, t4, 8
+; RV32I-NEXT: sb t4, 21(a2)
+; RV32I-NEXT: srli t4, t1, 24
+; RV32I-NEXT: sb t4, 11(a2)
+; RV32I-NEXT: srli t4, t1, 16
+; RV32I-NEXT: sb t4, 10(a2)
+; RV32I-NEXT: or a4, t1, a4
+; RV32I-NEXT: srli t1, t1, 8
+; RV32I-NEXT: sb t1, 9(a2)
+; RV32I-NEXT: srli t1, t0, 24
+; RV32I-NEXT: sb t1, 15(a2)
+; RV32I-NEXT: srli t1, t0, 16
+; RV32I-NEXT: sb t1, 14(a2)
+; RV32I-NEXT: or a3, t0, a3
+; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: sb t0, 13(a2)
+; RV32I-NEXT: srli t0, a1, 24
+; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: srli t0, a1, 16
+; RV32I-NEXT: sb t0, 2(a2)
; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: srli a1, a1, 8
; RV32I-NEXT: sb a1, 1(a2)
-; RV32I-NEXT: srli a1, t4, 24
+; RV32I-NEXT: srli a1, t3, 24
; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: srli a1, t4, 16
+; RV32I-NEXT: srli a1, t3, 16
; RV32I-NEXT: sb a1, 6(a2)
-; RV32I-NEXT: or a0, t4, a0
-; RV32I-NEXT: srli a1, t4, 8
+; RV32I-NEXT: or a0, t3, a0
+; RV32I-NEXT: srli a1, t3, 8
; RV32I-NEXT: sb a1, 5(a2)
; RV32I-NEXT: sb a6, 24(a2)
; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb t0, 16(a2)
-; RV32I-NEXT: sb s1, 20(a2)
+; RV32I-NEXT: sb a7, 16(a2)
+; RV32I-NEXT: sb t2, 20(a2)
; RV32I-NEXT: sb a4, 8(a2)
; RV32I-NEXT: sb a3, 12(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV64I-NEXT: lbu s5, 16(a0)
; RV64I-NEXT: lbu s6, 17(a0)
; RV64I-NEXT: lbu s7, 18(a0)
-; RV64I-NEXT: lbu s8, 1(a1)
-; RV64I-NEXT: lbu s9, 0(a1)
-; RV64I-NEXT: lbu s10, 2(a1)
-; RV64I-NEXT: lbu s11, 19(a0)
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: or s8, s8, s9
-; RV64I-NEXT: slli s10, s10, 16
-; RV64I-NEXT: lbu s9, 5(a1)
-; RV64I-NEXT: lbu ra, 4(a1)
-; RV64I-NEXT: or s8, s10, s8
-; RV64I-NEXT: lbu s10, 6(a1)
+; RV64I-NEXT: lbu s8, 19(a0)
+; RV64I-NEXT: lbu s9, 1(a1)
+; RV64I-NEXT: lbu s10, 0(a1)
+; RV64I-NEXT: lbu s11, 2(a1)
+; RV64I-NEXT: lbu ra, 3(a1)
; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s9, s9, ra
-; RV64I-NEXT: lbu ra, 7(a1)
-; RV64I-NEXT: slli s10, s10, 16
-; RV64I-NEXT: or s9, s10, s9
-; RV64I-NEXT: lbu s10, 20(a0)
+; RV64I-NEXT: or s9, s9, s10
+; RV64I-NEXT: slli s11, s11, 16
; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: or s9, ra, s9
-; RV64I-NEXT: lbu ra, 21(a0)
-; RV64I-NEXT: lbu a1, 3(a1)
-; RV64I-NEXT: slli s9, s9, 32
-; RV64I-NEXT: or s8, s9, s8
-; RV64I-NEXT: lbu s9, 22(a0)
+; RV64I-NEXT: lbu s10, 5(a1)
+; RV64I-NEXT: or s11, ra, s11
+; RV64I-NEXT: or s9, s11, s9
+; RV64I-NEXT: lbu s11, 4(a1)
+; RV64I-NEXT: slli s10, s10, 8
+; RV64I-NEXT: lbu ra, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: or s10, s10, s11
+; RV64I-NEXT: lbu s11, 20(a0)
+; RV64I-NEXT: slli ra, ra, 16
; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or t1, s8, a1
-; RV64I-NEXT: lbu s8, 23(a0)
+; RV64I-NEXT: or a1, a1, ra
+; RV64I-NEXT: lbu ra, 21(a0)
+; RV64I-NEXT: or a1, a1, s10
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t1, a1, s9
+; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: lbu a7, 24(a0)
; RV64I-NEXT: lbu a6, 25(a0)
; RV64I-NEXT: lbu a5, 26(a0)
; RV64I-NEXT: sb a5, 82(sp)
; RV64I-NEXT: sb a6, 81(sp)
; RV64I-NEXT: sb a7, 80(sp)
-; RV64I-NEXT: sb s8, 79(sp)
-; RV64I-NEXT: sb s9, 78(sp)
+; RV64I-NEXT: sb s9, 79(sp)
+; RV64I-NEXT: sb s10, 78(sp)
; RV64I-NEXT: sb ra, 77(sp)
-; RV64I-NEXT: sb s10, 76(sp)
-; RV64I-NEXT: sb s11, 75(sp)
+; RV64I-NEXT: sb s11, 76(sp)
+; RV64I-NEXT: sb s8, 75(sp)
; RV64I-NEXT: sb s7, 74(sp)
; RV64I-NEXT: sb s6, 73(sp)
; RV64I-NEXT: sb s5, 72(sp)
; RV64I-NEXT: sb s0, 67(sp)
; RV64I-NEXT: sb t6, 66(sp)
; RV64I-NEXT: sb t5, 65(sp)
+; RV64I-NEXT: sb t4, 64(sp)
; RV64I-NEXT: sb t0, 87(sp)
; RV64I-NEXT: slli t0, t0, 56
-; RV64I-NEXT: sb t4, 64(sp)
; RV64I-NEXT: sb t3, 63(sp)
; RV64I-NEXT: sb t2, 62(sp)
; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: slli a5, a5, 24
+; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: lbu a1, 13(a3)
; RV64I-NEXT: lbu a4, 12(a3)
-; RV64I-NEXT: lbu a6, 14(a3)
-; RV64I-NEXT: lbu a7, 15(a3)
+; RV64I-NEXT: lbu a5, 14(a3)
+; RV64I-NEXT: lbu a6, 15(a3)
; RV64I-NEXT: slli a1, a1, 8
; RV64I-NEXT: or a1, a1, a4
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a1, a6, a1
-; RV64I-NEXT: or a1, a7, a1
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a1, a4, a1
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a0, a1, a0
-; RV64I-NEXT: or a4, a0, a5
+; RV64I-NEXT: or a4, a1, a0
; RV64I-NEXT: andi a1, t1, 7
; RV64I-NEXT: lbu a0, 17(a3)
; RV64I-NEXT: lbu a5, 16(a3)
; RV64I-NEXT: or a0, a0, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a0, a5, a0
; RV64I-NEXT: lbu a5, 21(a3)
; RV64I-NEXT: lbu a6, 20(a3)
-; RV64I-NEXT: lbu t0, 22(a3)
-; RV64I-NEXT: lbu t1, 23(a3)
+; RV64I-NEXT: lbu a7, 22(a3)
+; RV64I-NEXT: lbu t0, 23(a3)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a5, t0, a5
-; RV64I-NEXT: or a5, t1, a5
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: or a5, a0, a7
+; RV64I-NEXT: or a5, a5, a0
; RV64I-NEXT: slli a0, a5, 1
; RV64I-NEXT: not a6, a1
; RV64I-NEXT: sll a0, a0, a6
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 5(a3)
; RV64I-NEXT: lbu t0, 4(a3)
-; RV64I-NEXT: lbu t2, 6(a3)
-; RV64I-NEXT: lbu t3, 7(a3)
+; RV64I-NEXT: lbu t1, 6(a3)
+; RV64I-NEXT: lbu t2, 7(a3)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: or a7, t2, a7
-; RV64I-NEXT: or a7, t3, a7
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or a7, t0, a7
; RV64I-NEXT: slli a7, a7, 32
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 25(a3)
; RV64I-NEXT: lbu t0, 24(a3)
-; RV64I-NEXT: lbu t2, 26(a3)
-; RV64I-NEXT: or a6, a6, t1
+; RV64I-NEXT: lbu t1, 26(a3)
+; RV64I-NEXT: lbu t2, 27(a3)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or a7, t0, a7
; RV64I-NEXT: lbu t0, 29(a3)
-; RV64I-NEXT: or a7, t2, a7
; RV64I-NEXT: lbu t1, 28(a3)
; RV64I-NEXT: lbu t2, 30(a3)
+; RV64I-NEXT: lbu a3, 31(a3)
; RV64I-NEXT: slli t0, t0, 8
-; RV64I-NEXT: lbu t3, 31(a3)
; RV64I-NEXT: or t0, t0, t1
; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: or t0, t2, t0
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: or t0, t3, t0
+; RV64I-NEXT: slli a3, a3, 24
+; RV64I-NEXT: or a3, a3, t2
; RV64I-NEXT: slli t1, a4, 1
-; RV64I-NEXT: lbu a3, 27(a3)
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or a3, a3, t0
; RV64I-NEXT: xori t0, a1, 63
; RV64I-NEXT: sll t1, t1, t0
-; RV64I-NEXT: slli a3, a3, 24
-; RV64I-NEXT: or a3, a7, a3
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: or a3, a3, a7
; RV64I-NEXT: slli a7, a3, 1
; RV64I-NEXT: sll a7, a7, t0
; RV64I-NEXT: srl a4, a4, a1
; RV32I-NEXT: lbu s8, 18(a0)
; RV32I-NEXT: lbu a4, 1(a1)
; RV32I-NEXT: lbu s9, 19(a0)
-; RV32I-NEXT: lbu s10, 0(a1)
-; RV32I-NEXT: lbu s11, 20(a0)
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 0(a1)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: or a4, a4, s10
-; RV32I-NEXT: lbu s10, 21(a0)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: or a4, a4, s11
+; RV32I-NEXT: lbu s11, 21(a0)
; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: or a4, ra, a4
-; RV32I-NEXT: lbu ra, 22(a0)
; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: lbu ra, 22(a0)
; RV32I-NEXT: or t1, a1, a4
; RV32I-NEXT: lbu t0, 23(a0)
; RV32I-NEXT: lbu a7, 24(a0)
; RV32I-NEXT: sb a7, 52(sp)
; RV32I-NEXT: sb t0, 51(sp)
; RV32I-NEXT: sb ra, 50(sp)
-; RV32I-NEXT: sb s10, 49(sp)
-; RV32I-NEXT: sb s11, 48(sp)
+; RV32I-NEXT: sb s11, 49(sp)
+; RV32I-NEXT: sb s10, 48(sp)
; RV32I-NEXT: sb s9, 47(sp)
; RV32I-NEXT: sb s8, 46(sp)
; RV32I-NEXT: sb s7, 45(sp)
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: or t4, a5, a0
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: or t4, a4, a0
; RV32I-NEXT: andi a4, t1, 7
; RV32I-NEXT: lbu a0, 9(a3)
; RV32I-NEXT: lbu a1, 8(a3)
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a0, a5, a0
-; RV32I-NEXT: or a6, a6, a0
+; RV32I-NEXT: or a1, a6, a5
+; RV32I-NEXT: or a6, a1, a0
; RV32I-NEXT: slli a0, a6, 1
; RV32I-NEXT: not t0, a4
; RV32I-NEXT: sll a0, a0, t0
; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a1, a7, a1
-; RV32I-NEXT: or t1, t1, a1
+; RV32I-NEXT: or a5, t1, a7
+; RV32I-NEXT: or t1, a5, a1
; RV32I-NEXT: slli a1, t4, 1
; RV32I-NEXT: xori t2, a4, 31
; RV32I-NEXT: sll a1, a1, t2
; RV32I-NEXT: or a5, a5, a7
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli t5, t5, 24
-; RV32I-NEXT: or a5, t3, a5
-; RV32I-NEXT: or t3, t5, a5
+; RV32I-NEXT: or a7, t5, t3
+; RV32I-NEXT: or t3, a7, a5
; RV32I-NEXT: lbu a5, 17(a3)
; RV32I-NEXT: lbu a7, 16(a3)
; RV32I-NEXT: lbu t5, 18(a3)
; RV32I-NEXT: or a5, a5, a7
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a5, t5, a5
-; RV32I-NEXT: or a5, t6, a5
+; RV32I-NEXT: or a7, t6, t5
+; RV32I-NEXT: or a5, a7, a5
; RV32I-NEXT: slli a7, a5, 1
; RV32I-NEXT: sll a7, a7, t0
; RV32I-NEXT: lbu t5, 21(a3)
; RV32I-NEXT: or t5, t5, t6
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
; RV32I-NEXT: or t5, s0, t5
-; RV32I-NEXT: or t5, s1, t5
; RV32I-NEXT: lbu t6, 25(a3)
; RV32I-NEXT: lbu s0, 24(a3)
; RV32I-NEXT: lbu s1, 26(a3)
; RV32I-NEXT: or t6, t6, s0
; RV32I-NEXT: slli s1, s1, 16
; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s1, t6
-; RV32I-NEXT: or t6, s2, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: or t6, s0, t6
; RV32I-NEXT: lbu s0, 29(a3)
-; RV32I-NEXT: slli s1, t6, 1
-; RV32I-NEXT: lbu s2, 28(a3)
-; RV32I-NEXT: sll t0, s1, t0
+; RV32I-NEXT: lbu s1, 28(a3)
+; RV32I-NEXT: slli s2, t6, 1
+; RV32I-NEXT: sll t0, s2, t0
; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: or s0, s0, s1
; RV32I-NEXT: lbu s1, 30(a3)
-; RV32I-NEXT: or s0, s0, s2
+; RV32I-NEXT: lbu a3, 31(a3)
; RV32I-NEXT: slli s2, t3, 1
; RV32I-NEXT: sll s2, s2, t2
; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: lbu a3, 31(a3)
-; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a3, a3, s1
; RV32I-NEXT: slli s1, t5, 1
; RV32I-NEXT: sll s1, s1, t2
-; RV32I-NEXT: slli a3, a3, 24
; RV32I-NEXT: or a3, a3, s0
; RV32I-NEXT: slli s0, a3, 1
; RV32I-NEXT: sll t2, s0, t2
define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32-LABEL: saddo1.i64:
; RV32: # %bb.0: # %entry
+; RV32-NEXT: add a5, a1, a3
; RV32-NEXT: add a2, a0, a2
; RV32-NEXT: sltu a0, a2, a0
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: add a5, a1, a0
+; RV32-NEXT: add a5, a5, a0
; RV32-NEXT: xor a0, a1, a5
; RV32-NEXT: xor a1, a1, a3
; RV32-NEXT: not a1, a1
;
; RV32ZBA-LABEL: saddo1.i64:
; RV32ZBA: # %bb.0: # %entry
+; RV32ZBA-NEXT: add a5, a1, a3
; RV32ZBA-NEXT: add a2, a0, a2
; RV32ZBA-NEXT: sltu a0, a2, a0
-; RV32ZBA-NEXT: add a0, a3, a0
-; RV32ZBA-NEXT: add a5, a1, a0
+; RV32ZBA-NEXT: add a5, a5, a0
; RV32ZBA-NEXT: xor a0, a1, a5
; RV32ZBA-NEXT: xor a1, a1, a3
; RV32ZBA-NEXT: not a1, a1
define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32-LABEL: uaddo.i64:
; RV32: # %bb.0: # %entry
+; RV32-NEXT: add a3, a1, a3
; RV32-NEXT: add a2, a0, a2
; RV32-NEXT: sltu a0, a2, a0
; RV32-NEXT: add a3, a3, a0
-; RV32-NEXT: add a3, a1, a3
; RV32-NEXT: beq a3, a1, .LBB10_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: sltu a0, a3, a1
;
; RV32ZBA-LABEL: uaddo.i64:
; RV32ZBA: # %bb.0: # %entry
+; RV32ZBA-NEXT: add a3, a1, a3
; RV32ZBA-NEXT: add a2, a0, a2
; RV32ZBA-NEXT: sltu a0, a2, a0
; RV32ZBA-NEXT: add a3, a3, a0
-; RV32ZBA-NEXT: add a3, a1, a3
; RV32ZBA-NEXT: beq a3, a1, .LBB10_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: sltu a0, a3, a1
; RV32-LABEL: ssubo.i64:
; RV32: # %bb.0: # %entry
; RV32-NEXT: sltu a5, a0, a2
-; RV32-NEXT: add a5, a3, a5
-; RV32-NEXT: sub a5, a1, a5
+; RV32-NEXT: sub a6, a1, a3
+; RV32-NEXT: sub a5, a6, a5
; RV32-NEXT: xor a6, a1, a5
; RV32-NEXT: xor a1, a1, a3
; RV32-NEXT: and a1, a1, a6
; RV32ZBA-LABEL: ssubo.i64:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: sltu a5, a0, a2
-; RV32ZBA-NEXT: add a5, a3, a5
-; RV32ZBA-NEXT: sub a5, a1, a5
+; RV32ZBA-NEXT: sub a6, a1, a3
+; RV32ZBA-NEXT: sub a5, a6, a5
; RV32ZBA-NEXT: xor a6, a1, a5
; RV32ZBA-NEXT: xor a1, a1, a3
; RV32ZBA-NEXT: and a1, a1, a6
; RV32-LABEL: usubo.i64:
; RV32: # %bb.0: # %entry
; RV32-NEXT: sltu a5, a0, a2
-; RV32-NEXT: add a3, a3, a5
; RV32-NEXT: sub a3, a1, a3
+; RV32-NEXT: sub a3, a3, a5
; RV32-NEXT: sub a2, a0, a2
; RV32-NEXT: beq a3, a1, .LBB18_2
; RV32-NEXT: # %bb.1: # %entry
; RV32ZBA-LABEL: usubo.i64:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: sltu a5, a0, a2
-; RV32ZBA-NEXT: add a3, a3, a5
; RV32ZBA-NEXT: sub a3, a1, a3
+; RV32ZBA-NEXT: sub a3, a3, a5
; RV32ZBA-NEXT: sub a2, a0, a2
; RV32ZBA-NEXT: beq a3, a1, .LBB18_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32-NEXT: sltu t0, t1, t0
; RV32-NEXT: sltu a6, a7, a6
; RV32-NEXT: mulhu a7, a1, a3
+; RV32-NEXT: add a6, a7, a6
; RV32-NEXT: add a6, a6, t0
-; RV32-NEXT: mulhu t0, a2, t2
+; RV32-NEXT: mulhu a7, a2, t2
+; RV32-NEXT: add a7, a7, t3
; RV32-NEXT: mul a3, a3, t2
-; RV32-NEXT: add a3, t3, a3
-; RV32-NEXT: add a3, t0, a3
+; RV32-NEXT: add a3, a7, a3
; RV32-NEXT: mul a1, t4, a1
-; RV32-NEXT: mulhu t0, t4, a0
+; RV32-NEXT: mulhu a7, t4, a0
+; RV32-NEXT: add a1, a7, a1
; RV32-NEXT: add a1, a1, t5
; RV32-NEXT: add a1, a1, a3
; RV32-NEXT: sltu a3, t6, t5
; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: add a1, t0, a1
; RV32-NEXT: add a1, a6, a1
; RV32-NEXT: add a1, a1, s1
-; RV32-NEXT: add a1, a7, a1
; RV32-NEXT: srai a3, a5, 31
; RV32-NEXT: xor a1, a1, a3
; RV32-NEXT: xor a3, s0, a3
; RV32ZBA-NEXT: sltu t0, t1, t0
; RV32ZBA-NEXT: sltu a6, a7, a6
; RV32ZBA-NEXT: mulhu a7, a1, a3
+; RV32ZBA-NEXT: add a6, a7, a6
; RV32ZBA-NEXT: add a6, a6, t0
-; RV32ZBA-NEXT: mulhu t0, a2, t2
+; RV32ZBA-NEXT: mulhu a7, a2, t2
+; RV32ZBA-NEXT: add a7, a7, t3
; RV32ZBA-NEXT: mul a3, a3, t2
-; RV32ZBA-NEXT: add a3, t3, a3
-; RV32ZBA-NEXT: add a3, t0, a3
+; RV32ZBA-NEXT: add a3, a7, a3
; RV32ZBA-NEXT: mul a1, t4, a1
-; RV32ZBA-NEXT: mulhu t0, t4, a0
+; RV32ZBA-NEXT: mulhu a7, t4, a0
+; RV32ZBA-NEXT: add a1, a7, a1
; RV32ZBA-NEXT: add a1, a1, t5
; RV32ZBA-NEXT: add a1, a1, a3
; RV32ZBA-NEXT: sltu a3, t6, t5
; RV32ZBA-NEXT: add a1, a1, a3
-; RV32ZBA-NEXT: add a1, t0, a1
; RV32ZBA-NEXT: add a1, a6, a1
; RV32ZBA-NEXT: add a1, a1, s1
-; RV32ZBA-NEXT: add a1, a7, a1
; RV32ZBA-NEXT: srai a3, a5, 31
; RV32ZBA-NEXT: xor a1, a1, a3
; RV32ZBA-NEXT: xor a3, s0, a3
; RV32: # %bb.0: # %entry
; RV32-NEXT: mul a5, a3, a0
; RV32-NEXT: mul a6, a1, a2
-; RV32-NEXT: mulhu a7, a0, a2
-; RV32-NEXT: add a5, a7, a5
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sltu a6, a5, a7
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: mulhu a6, a0, a2
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: sltu a6, a5, a6
; RV32-NEXT: snez a7, a3
; RV32-NEXT: snez t0, a1
; RV32-NEXT: and a7, t0, a7
; RV32-NEXT: mulhu a1, a1, a2
; RV32-NEXT: snez a1, a1
+; RV32-NEXT: or a1, a7, a1
; RV32-NEXT: mulhu a3, a3, a0
; RV32-NEXT: snez a3, a3
; RV32-NEXT: or a1, a1, a3
; RV32-NEXT: or a1, a1, a6
-; RV32-NEXT: or a1, a7, a1
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: sw a0, 0(a4)
; RV32-NEXT: sw a5, 4(a4)
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: mul a5, a3, a0
; RV32ZBA-NEXT: mul a6, a1, a2
-; RV32ZBA-NEXT: mulhu a7, a0, a2
-; RV32ZBA-NEXT: add a5, a7, a5
-; RV32ZBA-NEXT: add a5, a5, a6
-; RV32ZBA-NEXT: sltu a6, a5, a7
+; RV32ZBA-NEXT: add a5, a6, a5
+; RV32ZBA-NEXT: mulhu a6, a0, a2
+; RV32ZBA-NEXT: add a5, a6, a5
+; RV32ZBA-NEXT: sltu a6, a5, a6
; RV32ZBA-NEXT: snez a7, a3
; RV32ZBA-NEXT: snez t0, a1
; RV32ZBA-NEXT: and a7, t0, a7
; RV32ZBA-NEXT: mulhu a1, a1, a2
; RV32ZBA-NEXT: snez a1, a1
+; RV32ZBA-NEXT: or a1, a7, a1
; RV32ZBA-NEXT: mulhu a3, a3, a0
; RV32ZBA-NEXT: snez a3, a3
; RV32ZBA-NEXT: or a1, a1, a3
; RV32ZBA-NEXT: or a1, a1, a6
-; RV32ZBA-NEXT: or a1, a7, a1
; RV32ZBA-NEXT: mul a0, a0, a2
; RV32ZBA-NEXT: sw a0, 0(a4)
; RV32ZBA-NEXT: sw a5, 4(a4)
define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
; RV32-LABEL: saddo.select.i64:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: add a4, a0, a2
-; RV32-NEXT: sltu a4, a4, a0
-; RV32-NEXT: add a4, a3, a4
-; RV32-NEXT: add a4, a1, a4
+; RV32-NEXT: add a4, a1, a3
+; RV32-NEXT: add a5, a0, a2
+; RV32-NEXT: sltu a5, a5, a0
+; RV32-NEXT: add a4, a4, a5
; RV32-NEXT: xor a4, a1, a4
; RV32-NEXT: xor a5, a1, a3
; RV32-NEXT: not a5, a5
;
; RV32ZBA-LABEL: saddo.select.i64:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: add a4, a0, a2
-; RV32ZBA-NEXT: sltu a4, a4, a0
-; RV32ZBA-NEXT: add a4, a3, a4
-; RV32ZBA-NEXT: add a4, a1, a4
+; RV32ZBA-NEXT: add a4, a1, a3
+; RV32ZBA-NEXT: add a5, a0, a2
+; RV32ZBA-NEXT: sltu a5, a5, a0
+; RV32ZBA-NEXT: add a4, a4, a5
; RV32ZBA-NEXT: xor a4, a1, a4
; RV32ZBA-NEXT: xor a5, a1, a3
; RV32ZBA-NEXT: not a5, a5
define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
; RV32-LABEL: saddo.not.i64:
; RV32: # %bb.0: # %entry
+; RV32-NEXT: add a4, a1, a3
; RV32-NEXT: add a2, a0, a2
; RV32-NEXT: sltu a0, a2, a0
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, a4, a0
; RV32-NEXT: xor a0, a1, a0
; RV32-NEXT: xor a1, a1, a3
; RV32-NEXT: not a1, a1
;
; RV32ZBA-LABEL: saddo.not.i64:
; RV32ZBA: # %bb.0: # %entry
+; RV32ZBA-NEXT: add a4, a1, a3
; RV32ZBA-NEXT: add a2, a0, a2
; RV32ZBA-NEXT: sltu a0, a2, a0
-; RV32ZBA-NEXT: add a0, a3, a0
-; RV32ZBA-NEXT: add a0, a1, a0
+; RV32ZBA-NEXT: add a0, a4, a0
; RV32ZBA-NEXT: xor a0, a1, a0
; RV32ZBA-NEXT: xor a1, a1, a3
; RV32ZBA-NEXT: not a1, a1
define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
; RV32-LABEL: uaddo.select.i64:
; RV32: # %bb.0: # %entry
+; RV32-NEXT: add a5, a1, a3
; RV32-NEXT: add a4, a0, a2
; RV32-NEXT: sltu a4, a4, a0
-; RV32-NEXT: add a5, a3, a4
-; RV32-NEXT: add a5, a1, a5
+; RV32-NEXT: add a5, a5, a4
; RV32-NEXT: bne a5, a1, .LBB34_3
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: beqz a4, .LBB34_4
;
; RV32ZBA-LABEL: uaddo.select.i64:
; RV32ZBA: # %bb.0: # %entry
+; RV32ZBA-NEXT: add a5, a1, a3
; RV32ZBA-NEXT: add a4, a0, a2
; RV32ZBA-NEXT: sltu a4, a4, a0
-; RV32ZBA-NEXT: add a5, a3, a4
-; RV32ZBA-NEXT: add a5, a1, a5
+; RV32ZBA-NEXT: add a5, a5, a4
; RV32ZBA-NEXT: bne a5, a1, .LBB34_3
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: beqz a4, .LBB34_4
define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
; RV32-LABEL: uaddo.not.i64:
; RV32: # %bb.0: # %entry
+; RV32-NEXT: add a3, a1, a3
; RV32-NEXT: add a2, a0, a2
; RV32-NEXT: sltu a0, a2, a0
; RV32-NEXT: add a2, a3, a0
-; RV32-NEXT: add a2, a1, a2
; RV32-NEXT: beq a2, a1, .LBB35_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: sltu a0, a2, a1
;
; RV32ZBA-LABEL: uaddo.not.i64:
; RV32ZBA: # %bb.0: # %entry
+; RV32ZBA-NEXT: add a3, a1, a3
; RV32ZBA-NEXT: add a2, a0, a2
; RV32ZBA-NEXT: sltu a0, a2, a0
; RV32ZBA-NEXT: add a2, a3, a0
-; RV32ZBA-NEXT: add a2, a1, a2
; RV32ZBA-NEXT: beq a2, a1, .LBB35_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: sltu a0, a2, a1
; RV32-LABEL: ssubo.select.i64:
; RV32: # %bb.0: # %entry
; RV32-NEXT: sltu a4, a0, a2
-; RV32-NEXT: add a4, a3, a4
-; RV32-NEXT: sub a4, a1, a4
-; RV32-NEXT: xor a4, a1, a4
-; RV32-NEXT: xor a5, a1, a3
-; RV32-NEXT: and a4, a5, a4
+; RV32-NEXT: sub a5, a1, a3
+; RV32-NEXT: sub a5, a5, a4
+; RV32-NEXT: xor a5, a1, a5
+; RV32-NEXT: xor a4, a1, a3
+; RV32-NEXT: and a4, a4, a5
; RV32-NEXT: bltz a4, .LBB38_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: mv a0, a2
; RV32ZBA-LABEL: ssubo.select.i64:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: sltu a4, a0, a2
-; RV32ZBA-NEXT: add a4, a3, a4
-; RV32ZBA-NEXT: sub a4, a1, a4
-; RV32ZBA-NEXT: xor a4, a1, a4
-; RV32ZBA-NEXT: xor a5, a1, a3
-; RV32ZBA-NEXT: and a4, a5, a4
+; RV32ZBA-NEXT: sub a5, a1, a3
+; RV32ZBA-NEXT: sub a5, a5, a4
+; RV32ZBA-NEXT: xor a5, a1, a5
+; RV32ZBA-NEXT: xor a4, a1, a3
+; RV32ZBA-NEXT: and a4, a4, a5
; RV32ZBA-NEXT: bltz a4, .LBB38_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: mv a0, a2
; RV32-LABEL: ssub.not.i64:
; RV32: # %bb.0: # %entry
; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: sub a0, a1, a0
-; RV32-NEXT: xor a0, a1, a0
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sub a2, a2, a0
+; RV32-NEXT: xor a2, a1, a2
; RV32-NEXT: xor a1, a1, a3
-; RV32-NEXT: and a0, a1, a0
-; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: slti a0, a1, 0
; RV32-NEXT: xori a0, a0, 1
; RV32-NEXT: ret
;
; RV32ZBA-LABEL: ssub.not.i64:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: sltu a0, a0, a2
-; RV32ZBA-NEXT: add a0, a3, a0
-; RV32ZBA-NEXT: sub a0, a1, a0
-; RV32ZBA-NEXT: xor a0, a1, a0
+; RV32ZBA-NEXT: sub a2, a1, a3
+; RV32ZBA-NEXT: sub a2, a2, a0
+; RV32ZBA-NEXT: xor a2, a1, a2
; RV32ZBA-NEXT: xor a1, a1, a3
-; RV32ZBA-NEXT: and a0, a1, a0
-; RV32ZBA-NEXT: slti a0, a0, 0
+; RV32ZBA-NEXT: and a1, a1, a2
+; RV32ZBA-NEXT: slti a0, a1, 0
; RV32ZBA-NEXT: xori a0, a0, 1
; RV32ZBA-NEXT: ret
;
; RV32-LABEL: usubo.select.i64:
; RV32: # %bb.0: # %entry
; RV32-NEXT: sltu a4, a0, a2
-; RV32-NEXT: add a4, a3, a4
-; RV32-NEXT: sub a4, a1, a4
+; RV32-NEXT: sub a5, a1, a3
+; RV32-NEXT: sub a4, a5, a4
; RV32-NEXT: beq a4, a1, .LBB42_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: sltu a4, a1, a4
; RV32ZBA-LABEL: usubo.select.i64:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: sltu a4, a0, a2
-; RV32ZBA-NEXT: add a4, a3, a4
-; RV32ZBA-NEXT: sub a4, a1, a4
+; RV32ZBA-NEXT: sub a5, a1, a3
+; RV32ZBA-NEXT: sub a4, a5, a4
; RV32ZBA-NEXT: beq a4, a1, .LBB42_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: sltu a4, a1, a4
; RV32-LABEL: usubo.not.i64:
; RV32: # %bb.0: # %entry
; RV32-NEXT: sltu a4, a0, a2
-; RV32-NEXT: add a3, a3, a4
; RV32-NEXT: sub a3, a1, a3
+; RV32-NEXT: sub a3, a3, a4
; RV32-NEXT: beq a3, a1, .LBB43_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: sltu a0, a1, a3
; RV32ZBA-LABEL: usubo.not.i64:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: sltu a4, a0, a2
-; RV32ZBA-NEXT: add a3, a3, a4
; RV32ZBA-NEXT: sub a3, a1, a3
+; RV32ZBA-NEXT: sub a3, a3, a4
; RV32ZBA-NEXT: beq a3, a1, .LBB43_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: sltu a0, a1, a3
; RV32-NEXT: sltu a7, t0, a7
; RV32-NEXT: sltu a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a3
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: add a5, a5, a7
-; RV32-NEXT: mulhu a7, a2, t1
-; RV32-NEXT: mul t0, a3, t1
-; RV32-NEXT: add t0, t2, t0
-; RV32-NEXT: add a7, a7, t0
-; RV32-NEXT: mul t0, t3, a1
-; RV32-NEXT: mulhu t1, t3, a0
-; RV32-NEXT: add t0, t0, t4
+; RV32-NEXT: mulhu a6, a2, t1
+; RV32-NEXT: add a6, a6, t2
+; RV32-NEXT: mul a7, a3, t1
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: mul a7, t3, a1
+; RV32-NEXT: mulhu t0, t3, a0
; RV32-NEXT: add a7, t0, a7
-; RV32-NEXT: sltu t0, t5, t4
-; RV32-NEXT: add a7, a7, t0
-; RV32-NEXT: add a7, t1, a7
-; RV32-NEXT: add a5, a5, a7
+; RV32-NEXT: add a7, a7, t4
+; RV32-NEXT: add a6, a7, a6
+; RV32-NEXT: sltu a7, t5, t4
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: add a5, a5, a6
; RV32-NEXT: add a5, a5, s0
-; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: srai a4, a4, 31
; RV32-NEXT: xor a5, a5, a4
; RV32-NEXT: xor a4, t6, a4
; RV32ZBA-NEXT: sltu a7, t0, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a3
+; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: add a5, a5, a7
-; RV32ZBA-NEXT: mulhu a7, a2, t1
-; RV32ZBA-NEXT: mul t0, a3, t1
-; RV32ZBA-NEXT: add t0, t2, t0
-; RV32ZBA-NEXT: add a7, a7, t0
-; RV32ZBA-NEXT: mul t0, t3, a1
-; RV32ZBA-NEXT: mulhu t1, t3, a0
-; RV32ZBA-NEXT: add t0, t0, t4
+; RV32ZBA-NEXT: mulhu a6, a2, t1
+; RV32ZBA-NEXT: add a6, a6, t2
+; RV32ZBA-NEXT: mul a7, a3, t1
+; RV32ZBA-NEXT: add a6, a6, a7
+; RV32ZBA-NEXT: mul a7, t3, a1
+; RV32ZBA-NEXT: mulhu t0, t3, a0
; RV32ZBA-NEXT: add a7, t0, a7
-; RV32ZBA-NEXT: sltu t0, t5, t4
-; RV32ZBA-NEXT: add a7, a7, t0
-; RV32ZBA-NEXT: add a7, t1, a7
-; RV32ZBA-NEXT: add a5, a5, a7
+; RV32ZBA-NEXT: add a7, a7, t4
+; RV32ZBA-NEXT: add a6, a7, a6
+; RV32ZBA-NEXT: sltu a7, t5, t4
+; RV32ZBA-NEXT: add a6, a6, a7
+; RV32ZBA-NEXT: add a5, a5, a6
; RV32ZBA-NEXT: add a5, a5, s0
-; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: srai a4, a4, 31
; RV32ZBA-NEXT: xor a5, a5, a4
; RV32ZBA-NEXT: xor a4, t6, a4
; RV32-NEXT: sltu a7, t0, a7
; RV32-NEXT: sltu a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a3
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: add a5, a5, a7
; RV32-NEXT: mulhu a2, a2, t1
+; RV32-NEXT: add a2, a2, t2
; RV32-NEXT: mul a3, a3, t1
-; RV32-NEXT: add a3, t2, a3
; RV32-NEXT: add a2, a2, a3
; RV32-NEXT: mul a1, t3, a1
; RV32-NEXT: mulhu a0, t3, a0
-; RV32-NEXT: add a1, a1, t4
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: sltu a2, t5, t4
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, a0, t4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: sltu a1, t5, t4
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add a0, a5, a0
; RV32-NEXT: add a0, a0, s0
-; RV32-NEXT: add a0, a6, a0
; RV32-NEXT: srai a4, a4, 31
; RV32-NEXT: xor a0, a0, a4
; RV32-NEXT: xor a1, t6, a4
; RV32ZBA-NEXT: sltu a7, t0, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a3
+; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: add a5, a5, a7
; RV32ZBA-NEXT: mulhu a2, a2, t1
+; RV32ZBA-NEXT: add a2, a2, t2
; RV32ZBA-NEXT: mul a3, a3, t1
-; RV32ZBA-NEXT: add a3, t2, a3
; RV32ZBA-NEXT: add a2, a2, a3
; RV32ZBA-NEXT: mul a1, t3, a1
; RV32ZBA-NEXT: mulhu a0, t3, a0
-; RV32ZBA-NEXT: add a1, a1, t4
-; RV32ZBA-NEXT: add a1, a1, a2
-; RV32ZBA-NEXT: sltu a2, t5, t4
-; RV32ZBA-NEXT: add a1, a1, a2
+; RV32ZBA-NEXT: add a0, a0, a1
+; RV32ZBA-NEXT: add a0, a0, t4
+; RV32ZBA-NEXT: add a0, a0, a2
+; RV32ZBA-NEXT: sltu a1, t5, t4
; RV32ZBA-NEXT: add a0, a0, a1
; RV32ZBA-NEXT: add a0, a5, a0
; RV32ZBA-NEXT: add a0, a0, s0
-; RV32ZBA-NEXT: add a0, a6, a0
; RV32ZBA-NEXT: srai a4, a4, 31
; RV32ZBA-NEXT: xor a0, a0, a4
; RV32ZBA-NEXT: xor a1, t6, a4
; RV32: # %bb.0: # %entry
; RV32-NEXT: mul a4, a3, a0
; RV32-NEXT: mul a5, a1, a2
-; RV32-NEXT: mulhu a6, a0, a2
-; RV32-NEXT: add a4, a6, a4
-; RV32-NEXT: add a4, a4, a5
-; RV32-NEXT: sltu a4, a4, a6
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: mulhu a5, a0, a2
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: sltu a4, a4, a5
; RV32-NEXT: snez a5, a3
; RV32-NEXT: snez a6, a1
; RV32-NEXT: and a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a2
; RV32-NEXT: snez a6, a6
-; RV32-NEXT: mulhu a7, a3, a0
-; RV32-NEXT: snez a7, a7
-; RV32-NEXT: or a6, a6, a7
-; RV32-NEXT: or a4, a6, a4
+; RV32-NEXT: or a5, a5, a6
+; RV32-NEXT: mulhu a6, a3, a0
+; RV32-NEXT: snez a6, a6
+; RV32-NEXT: or a5, a5, a6
; RV32-NEXT: or a4, a5, a4
; RV32-NEXT: bnez a4, .LBB50_2
; RV32-NEXT: # %bb.1: # %entry
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: mul a4, a3, a0
; RV32ZBA-NEXT: mul a5, a1, a2
-; RV32ZBA-NEXT: mulhu a6, a0, a2
-; RV32ZBA-NEXT: add a4, a6, a4
-; RV32ZBA-NEXT: add a4, a4, a5
-; RV32ZBA-NEXT: sltu a4, a4, a6
+; RV32ZBA-NEXT: add a4, a5, a4
+; RV32ZBA-NEXT: mulhu a5, a0, a2
+; RV32ZBA-NEXT: add a4, a5, a4
+; RV32ZBA-NEXT: sltu a4, a4, a5
; RV32ZBA-NEXT: snez a5, a3
; RV32ZBA-NEXT: snez a6, a1
; RV32ZBA-NEXT: and a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a2
; RV32ZBA-NEXT: snez a6, a6
-; RV32ZBA-NEXT: mulhu a7, a3, a0
-; RV32ZBA-NEXT: snez a7, a7
-; RV32ZBA-NEXT: or a6, a6, a7
-; RV32ZBA-NEXT: or a4, a6, a4
+; RV32ZBA-NEXT: or a5, a5, a6
+; RV32ZBA-NEXT: mulhu a6, a3, a0
+; RV32ZBA-NEXT: snez a6, a6
+; RV32ZBA-NEXT: or a5, a5, a6
; RV32ZBA-NEXT: or a4, a5, a4
; RV32ZBA-NEXT: bnez a4, .LBB50_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32: # %bb.0: # %entry
; RV32-NEXT: mul a4, a3, a0
; RV32-NEXT: mul a5, a1, a2
-; RV32-NEXT: mulhu a6, a0, a2
-; RV32-NEXT: add a4, a6, a4
-; RV32-NEXT: add a4, a4, a5
-; RV32-NEXT: sltu a4, a4, a6
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: mulhu a5, a0, a2
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: sltu a4, a4, a5
; RV32-NEXT: snez a5, a3
; RV32-NEXT: snez a6, a1
; RV32-NEXT: and a5, a6, a5
; RV32-NEXT: mulhu a1, a1, a2
; RV32-NEXT: snez a1, a1
+; RV32-NEXT: or a1, a5, a1
; RV32-NEXT: mulhu a0, a3, a0
; RV32-NEXT: snez a0, a0
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: or a0, a0, a4
-; RV32-NEXT: or a0, a5, a0
; RV32-NEXT: xori a0, a0, 1
; RV32-NEXT: ret
;
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: mul a4, a3, a0
; RV32ZBA-NEXT: mul a5, a1, a2
-; RV32ZBA-NEXT: mulhu a6, a0, a2
-; RV32ZBA-NEXT: add a4, a6, a4
-; RV32ZBA-NEXT: add a4, a4, a5
-; RV32ZBA-NEXT: sltu a4, a4, a6
+; RV32ZBA-NEXT: add a4, a5, a4
+; RV32ZBA-NEXT: mulhu a5, a0, a2
+; RV32ZBA-NEXT: add a4, a5, a4
+; RV32ZBA-NEXT: sltu a4, a4, a5
; RV32ZBA-NEXT: snez a5, a3
; RV32ZBA-NEXT: snez a6, a1
; RV32ZBA-NEXT: and a5, a6, a5
; RV32ZBA-NEXT: mulhu a1, a1, a2
; RV32ZBA-NEXT: snez a1, a1
+; RV32ZBA-NEXT: or a1, a5, a1
; RV32ZBA-NEXT: mulhu a0, a3, a0
; RV32ZBA-NEXT: snez a0, a0
; RV32ZBA-NEXT: or a0, a1, a0
; RV32ZBA-NEXT: or a0, a0, a4
-; RV32ZBA-NEXT: or a0, a5, a0
; RV32ZBA-NEXT: xori a0, a0, 1
; RV32ZBA-NEXT: ret
;
define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
; RV32-LABEL: saddo.br.i64:
; RV32: # %bb.0: # %entry
+; RV32-NEXT: add a4, a1, a3
; RV32-NEXT: add a2, a0, a2
; RV32-NEXT: sltu a0, a2, a0
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, a4, a0
; RV32-NEXT: xor a0, a1, a0
; RV32-NEXT: xor a1, a1, a3
; RV32-NEXT: not a1, a1
;
; RV32ZBA-LABEL: saddo.br.i64:
; RV32ZBA: # %bb.0: # %entry
+; RV32ZBA-NEXT: add a4, a1, a3
; RV32ZBA-NEXT: add a2, a0, a2
; RV32ZBA-NEXT: sltu a0, a2, a0
-; RV32ZBA-NEXT: add a0, a3, a0
-; RV32ZBA-NEXT: add a0, a1, a0
+; RV32ZBA-NEXT: add a0, a4, a0
; RV32ZBA-NEXT: xor a0, a1, a0
; RV32ZBA-NEXT: xor a1, a1, a3
; RV32ZBA-NEXT: not a1, a1
define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
; RV32-LABEL: uaddo.br.i64:
; RV32: # %bb.0: # %entry
+; RV32-NEXT: add a3, a1, a3
; RV32-NEXT: add a2, a0, a2
; RV32-NEXT: sltu a0, a2, a0
; RV32-NEXT: add a2, a3, a0
-; RV32-NEXT: add a2, a1, a2
; RV32-NEXT: beq a2, a1, .LBB55_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: sltu a0, a2, a1
;
; RV32ZBA-LABEL: uaddo.br.i64:
; RV32ZBA: # %bb.0: # %entry
+; RV32ZBA-NEXT: add a3, a1, a3
; RV32ZBA-NEXT: add a2, a0, a2
; RV32ZBA-NEXT: sltu a0, a2, a0
; RV32ZBA-NEXT: add a2, a3, a0
-; RV32ZBA-NEXT: add a2, a1, a2
; RV32ZBA-NEXT: beq a2, a1, .LBB55_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: sltu a0, a2, a1
; RV32-LABEL: ssubo.br.i64:
; RV32: # %bb.0: # %entry
; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: sub a0, a1, a0
-; RV32-NEXT: xor a0, a1, a0
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sub a2, a2, a0
+; RV32-NEXT: xor a2, a1, a2
; RV32-NEXT: xor a1, a1, a3
-; RV32-NEXT: and a0, a1, a0
-; RV32-NEXT: bgez a0, .LBB57_2
+; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: bgez a1, .LBB57_2
; RV32-NEXT: # %bb.1: # %overflow
; RV32-NEXT: li a0, 0
; RV32-NEXT: ret
; RV32ZBA-LABEL: ssubo.br.i64:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: sltu a0, a0, a2
-; RV32ZBA-NEXT: add a0, a3, a0
-; RV32ZBA-NEXT: sub a0, a1, a0
-; RV32ZBA-NEXT: xor a0, a1, a0
+; RV32ZBA-NEXT: sub a2, a1, a3
+; RV32ZBA-NEXT: sub a2, a2, a0
+; RV32ZBA-NEXT: xor a2, a1, a2
; RV32ZBA-NEXT: xor a1, a1, a3
-; RV32ZBA-NEXT: and a0, a1, a0
-; RV32ZBA-NEXT: bgez a0, .LBB57_2
+; RV32ZBA-NEXT: and a1, a1, a2
+; RV32ZBA-NEXT: bgez a1, .LBB57_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
; RV32ZBA-NEXT: li a0, 0
; RV32ZBA-NEXT: ret
; RV32-LABEL: usubo.br.i64:
; RV32: # %bb.0: # %entry
; RV32-NEXT: sltu a4, a0, a2
-; RV32-NEXT: add a3, a3, a4
; RV32-NEXT: sub a3, a1, a3
+; RV32-NEXT: sub a3, a3, a4
; RV32-NEXT: beq a3, a1, .LBB59_3
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: sltu a0, a1, a3
; RV32ZBA-LABEL: usubo.br.i64:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: sltu a4, a0, a2
-; RV32ZBA-NEXT: add a3, a3, a4
; RV32ZBA-NEXT: sub a3, a1, a3
+; RV32ZBA-NEXT: sub a3, a3, a4
; RV32ZBA-NEXT: beq a3, a1, .LBB59_3
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: sltu a0, a1, a3
; RV32-NEXT: sltu a7, t0, a7
; RV32-NEXT: sltu a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a3
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: add a5, a5, a7
; RV32-NEXT: mulhu a2, a2, t1
+; RV32-NEXT: add a2, a2, t2
; RV32-NEXT: mul a3, a3, t1
-; RV32-NEXT: add a3, t2, a3
; RV32-NEXT: add a2, a2, a3
; RV32-NEXT: mul a1, t3, a1
; RV32-NEXT: mulhu a0, t3, a0
-; RV32-NEXT: add a1, a1, t4
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: sltu a2, t5, t4
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, a0, t4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: sltu a1, t5, t4
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add a0, a5, a0
; RV32-NEXT: add a0, a0, s0
-; RV32-NEXT: add a0, a6, a0
; RV32-NEXT: srai a4, a4, 31
; RV32-NEXT: xor a0, a0, a4
; RV32-NEXT: xor a1, t6, a4
; RV32ZBA-NEXT: sltu a7, t0, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a3
+; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: add a5, a5, a7
; RV32ZBA-NEXT: mulhu a2, a2, t1
+; RV32ZBA-NEXT: add a2, a2, t2
; RV32ZBA-NEXT: mul a3, a3, t1
-; RV32ZBA-NEXT: add a3, t2, a3
; RV32ZBA-NEXT: add a2, a2, a3
; RV32ZBA-NEXT: mul a1, t3, a1
; RV32ZBA-NEXT: mulhu a0, t3, a0
-; RV32ZBA-NEXT: add a1, a1, t4
-; RV32ZBA-NEXT: add a1, a1, a2
-; RV32ZBA-NEXT: sltu a2, t5, t4
-; RV32ZBA-NEXT: add a1, a1, a2
+; RV32ZBA-NEXT: add a0, a0, a1
+; RV32ZBA-NEXT: add a0, a0, t4
+; RV32ZBA-NEXT: add a0, a0, a2
+; RV32ZBA-NEXT: sltu a1, t5, t4
; RV32ZBA-NEXT: add a0, a0, a1
; RV32ZBA-NEXT: add a0, a5, a0
; RV32ZBA-NEXT: add a0, a0, s0
-; RV32ZBA-NEXT: add a0, a6, a0
; RV32ZBA-NEXT: srai a4, a4, 31
; RV32ZBA-NEXT: xor a0, a0, a4
; RV32ZBA-NEXT: xor a1, t6, a4
; RV32-NEXT: sltu t1, t1, t6
; RV32-NEXT: sltu a4, a6, a4
; RV32-NEXT: mulhu a6, a1, a7
+; RV32-NEXT: add a4, a6, a4
; RV32-NEXT: add a4, a4, t1
; RV32-NEXT: sltu a5, t3, a5
; RV32-NEXT: mulh a2, t2, a2
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: sub a0, a0, a2
-; RV32-NEXT: sub a0, a0, a5
; RV32-NEXT: sub a0, t0, a0
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, a0, a5
; RV32-NEXT: add a0, a4, a0
; RV32-NEXT: add a0, a0, t5
-; RV32-NEXT: add a0, a6, a0
; RV32-NEXT: srai a3, a3, 31
; RV32-NEXT: xor a0, a0, a3
; RV32-NEXT: xor a1, t4, a3
; RV32ZBA-NEXT: sltu t1, t1, t6
; RV32ZBA-NEXT: sltu a4, a6, a4
; RV32ZBA-NEXT: mulhu a6, a1, a7
+; RV32ZBA-NEXT: add a4, a6, a4
; RV32ZBA-NEXT: add a4, a4, t1
; RV32ZBA-NEXT: sltu a5, t3, a5
; RV32ZBA-NEXT: mulh a2, t2, a2
; RV32ZBA-NEXT: add a0, a0, a1
-; RV32ZBA-NEXT: sub a0, a0, a2
-; RV32ZBA-NEXT: sub a0, a0, a5
; RV32ZBA-NEXT: sub a0, t0, a0
+; RV32ZBA-NEXT: add a0, a0, a2
+; RV32ZBA-NEXT: add a0, a0, a5
; RV32ZBA-NEXT: add a0, a4, a0
; RV32ZBA-NEXT: add a0, a0, t5
-; RV32ZBA-NEXT: add a0, a6, a0
; RV32ZBA-NEXT: srai a3, a3, 31
; RV32ZBA-NEXT: xor a0, a0, a3
; RV32ZBA-NEXT: xor a1, t4, a3
; RV32: # %bb.0: # %entry
; RV32-NEXT: mul a4, a3, a0
; RV32-NEXT: mul a5, a1, a2
-; RV32-NEXT: mulhu a6, a0, a2
-; RV32-NEXT: add a4, a6, a4
-; RV32-NEXT: add a4, a4, a5
-; RV32-NEXT: sltu a4, a4, a6
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: mulhu a5, a0, a2
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: sltu a4, a4, a5
; RV32-NEXT: snez a5, a3
; RV32-NEXT: snez a6, a1
; RV32-NEXT: and a5, a6, a5
; RV32-NEXT: mulhu a1, a1, a2
; RV32-NEXT: snez a1, a1
+; RV32-NEXT: or a1, a5, a1
; RV32-NEXT: mulhu a0, a3, a0
; RV32-NEXT: snez a0, a0
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: or a0, a0, a4
-; RV32-NEXT: or a0, a5, a0
; RV32-NEXT: beqz a0, .LBB64_2
; RV32-NEXT: # %bb.1: # %overflow
; RV32-NEXT: li a0, 0
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: mul a4, a3, a0
; RV32ZBA-NEXT: mul a5, a1, a2
-; RV32ZBA-NEXT: mulhu a6, a0, a2
-; RV32ZBA-NEXT: add a4, a6, a4
-; RV32ZBA-NEXT: add a4, a4, a5
-; RV32ZBA-NEXT: sltu a4, a4, a6
+; RV32ZBA-NEXT: add a4, a5, a4
+; RV32ZBA-NEXT: mulhu a5, a0, a2
+; RV32ZBA-NEXT: add a4, a5, a4
+; RV32ZBA-NEXT: sltu a4, a4, a5
; RV32ZBA-NEXT: snez a5, a3
; RV32ZBA-NEXT: snez a6, a1
; RV32ZBA-NEXT: and a5, a6, a5
; RV32ZBA-NEXT: mulhu a1, a1, a2
; RV32ZBA-NEXT: snez a1, a1
+; RV32ZBA-NEXT: or a1, a5, a1
; RV32ZBA-NEXT: mulhu a0, a3, a0
; RV32ZBA-NEXT: snez a0, a0
; RV32ZBA-NEXT: or a0, a1, a0
; RV32ZBA-NEXT: or a0, a0, a4
-; RV32ZBA-NEXT: or a0, a5, a0
; RV32ZBA-NEXT: beqz a0, .LBB64_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
; RV32ZBA-NEXT: li a0, 0
; RV32: # %bb.0: # %entry
; RV32-NEXT: add a2, a0, a0
; RV32-NEXT: sltu a0, a2, a0
-; RV32-NEXT: add a2, a1, a0
-; RV32-NEXT: add a2, a1, a2
+; RV32-NEXT: add a2, a1, a1
+; RV32-NEXT: add a2, a2, a0
; RV32-NEXT: beq a2, a1, .LBB65_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: sltu a0, a2, a1
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: add a2, a0, a0
; RV32ZBA-NEXT: sltu a0, a2, a0
-; RV32ZBA-NEXT: add a2, a1, a0
-; RV32ZBA-NEXT: add a2, a1, a2
+; RV32ZBA-NEXT: add a2, a1, a1
+; RV32ZBA-NEXT: add a2, a2, a0
; RV32ZBA-NEXT: beq a2, a1, .LBB65_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: sltu a0, a2, a1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubusw %xmm1, %xmm2
; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm2
; SSE2-NEXT: paddw %xmm1, %xmm2
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v8i16:
;
; X64-LABEL: test_i32_add_add_idx0:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edx killed $edx def $rdx
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal (%rdi,%rsi), %eax
; X64-NEXT: andl $1, %edx
-; X64-NEXT: leal (%rdx,%rdi), %eax
-; X64-NEXT: addl %esi, %eax
+; X64-NEXT: addl %edx, %eax
; X64-NEXT: retq
%add = add i32 %y, %x
%mask = and i32 %z, 1
define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $ecx killed $ecx def $rcx
; CHECK-NEXT: # kill: def $edx killed $edx def $rdx
; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: leal (%rsi,%rdx), %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: addl %edi, %esi
+; CHECK-NEXT: leal (%rdx,%rcx), %eax
+; CHECK-NEXT: addl %esi, %eax
; CHECK-NEXT: retq
entry:
%a0 = alloca i32
define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
; CHECK-LABEL: DAGCombineB:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%t1 = add <8 x i32> %v1, %v2
%t2 = add <8 x i32> %t1, %v1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm1
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB9_1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm1
-; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB10_1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm0
; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm1
-; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB11_1
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: addq %rcx, %rax
; CHECK-NEXT: retq
define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %rcx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtsd2si %xmm0, %rax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: addq %rcx, %rax
; CHECK-NEXT: retq
define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %rcx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtss2usi %xmm0, %rax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: addq %rcx, %rax
; CHECK-NEXT: retq
define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si64:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vcvtss2si %xmm0, %rcx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtss2si %xmm0, %rax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: addq %rcx, %rax
; CHECK-NEXT: retq
define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2si %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2usi %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vcvtss2si %xmm0, %ecx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2si %xmm0, %eax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqb %zmm0, %xmm2
; X64-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqb %zmm0, %xmm2
; X86-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsqb %zmm0, %xmm2
; X64-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovsqb %zmm0, %xmm2
; X86-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusqb %zmm0, %xmm2
; X64-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovusqb %zmm0, %xmm2
; X86-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqw %zmm0, %xmm2
; X64-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqw %zmm0, %xmm2
; X86-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsqw %zmm0, %xmm2
; X64-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovsqw %zmm0, %xmm2
; X86-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusqw %zmm0, %xmm2
; X64-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovusqw %zmm0, %xmm2
; X86-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdb %zmm0, %xmm2
; X64-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovdb %zmm0, %xmm2
; X86-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsdb %zmm0, %xmm2
; X64-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovsdb %zmm0, %xmm2
; X86-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusdb %zmm0, %xmm2
; X64-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovusdb %zmm0, %xmm2
; X86-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdw %zmm0, %ymm2
; X64-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
+; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z}
; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovdw %zmm0, %ymm2
; X86-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
+; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z}
; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0
; X86-NEXT: retl
%res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsdw %zmm0, %ymm2
; X64-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
+; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z}
; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovsdw %zmm0, %ymm2
; X86-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
+; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z}
; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0
; X86-NEXT: retl
%res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusdw %zmm0, %ymm2
; X64-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
+; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z}
; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovusdw %zmm0, %ymm2
; X86-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
+; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z}
; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0
; X86-NEXT: retl
%res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
; X64-NEXT: kmovw %k0, %esi
; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: orl %ecx, %edx
; X64-NEXT: orl %esi, %eax
; X64-NEXT: orl %edx, %eax
-; X64-NEXT: orl %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X86-NEXT: kmovw %k0, %esi
; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
+; X86-NEXT: orl %ecx, %edx
; X86-NEXT: orl %esi, %eax
; X86-NEXT: orl %edx, %eax
-; X86-NEXT: orl %ecx, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
; X64-NEXT: kmovw %k0, %esi
; X64-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: andl %ecx, %edx
; X64-NEXT: andl %esi, %eax
; X64-NEXT: andl %edx, %eax
-; X64-NEXT: andl %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X86-NEXT: kmovw %k0, %esi
; X86-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
+; X86-NEXT: andl %ecx, %edx
; X86-NEXT: andl %esi, %eax
; X86-NEXT: andl %edx, %eax
-; X86-NEXT: andl %ecx, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: kshiftlq $6, %k1, %k1
; X86-NEXT: kshiftlq $59, %k0, %k0
; X86-NEXT: kshiftrq $59, %k0, %k0
+; X86-NEXT: korq %k1, %k0, %k0
; X86-NEXT: movb $1, %al
-; X86-NEXT: kmovd %eax, %k2
-; X86-NEXT: kshiftlq $63, %k2, %k2
-; X86-NEXT: kshiftrq $58, %k2, %k2
-; X86-NEXT: korq %k1, %k2, %k1
+; X86-NEXT: kmovd %eax, %k1
+; X86-NEXT: kshiftlq $63, %k1, %k1
+; X86-NEXT: kshiftrq $58, %k1, %k1
; X86-NEXT: korq %k0, %k1, %k0
; X86-NEXT: vpmovm2b %k0, %zmm0
; X86-NEXT: retl
; X86-NEXT: kshiftlq $6, %k1, %k1
; X86-NEXT: kshiftlq $59, %k0, %k0
; X86-NEXT: kshiftrq $59, %k0, %k0
-; X86-NEXT: kmovd %eax, %k2
-; X86-NEXT: kshiftlq $63, %k2, %k2
-; X86-NEXT: kshiftrq $58, %k2, %k2
-; X86-NEXT: korq %k1, %k2, %k1
+; X86-NEXT: korq %k1, %k0, %k0
+; X86-NEXT: kmovd %eax, %k1
+; X86-NEXT: kshiftlq $63, %k1, %k1
+; X86-NEXT: kshiftrq $58, %k1, %k1
; X86-NEXT: korq %k0, %k1, %k0
; X86-NEXT: vpmovm2b %k0, %zmm0
; X86-NEXT: retl
;
; WIN64-LABEL: test_argv64i1:
; WIN64: # %bb.0:
-; WIN64-NEXT: addq %rdx, %rcx
-; WIN64-NEXT: addq %rdi, %rcx
-; WIN64-NEXT: addq %rsi, %rcx
-; WIN64-NEXT: addq %r8, %rcx
+; WIN64-NEXT: addq %rcx, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: addq %rdi, %rax
+; WIN64-NEXT: leaq (%rsi,%r8), %rcx
; WIN64-NEXT: addq %r9, %rcx
-; WIN64-NEXT: addq %r10, %rcx
-; WIN64-NEXT: addq %r11, %rcx
+; WIN64-NEXT: addq %rcx, %rax
+; WIN64-NEXT: leaq (%r10,%r11), %rcx
; WIN64-NEXT: addq %r12, %rcx
; WIN64-NEXT: addq %r14, %rcx
-; WIN64-NEXT: addq %r15, %rcx
; WIN64-NEXT: addq %rcx, %rax
+; WIN64-NEXT: addq %r15, %rax
; WIN64-NEXT: addq {{[0-9]+}}(%rsp), %rax
; WIN64-NEXT: retq
;
; LINUXOSX64-LABEL: test_argv64i1:
; LINUXOSX64: # %bb.0:
-; LINUXOSX64-NEXT: addq %rdx, %rcx
-; LINUXOSX64-NEXT: addq %rdi, %rcx
-; LINUXOSX64-NEXT: addq %rsi, %rcx
-; LINUXOSX64-NEXT: addq %r8, %rcx
+; LINUXOSX64-NEXT: addq %rcx, %rax
+; LINUXOSX64-NEXT: addq %rdx, %rax
+; LINUXOSX64-NEXT: addq %rdi, %rax
+; LINUXOSX64-NEXT: leaq (%rsi,%r8), %rcx
; LINUXOSX64-NEXT: addq %r9, %rcx
-; LINUXOSX64-NEXT: addq %r12, %rcx
-; LINUXOSX64-NEXT: addq %r13, %rcx
+; LINUXOSX64-NEXT: addq %rcx, %rax
+; LINUXOSX64-NEXT: leaq (%r12,%r13), %rcx
; LINUXOSX64-NEXT: addq %r14, %rcx
; LINUXOSX64-NEXT: addq %r15, %rcx
; LINUXOSX64-NEXT: addq %rcx, %rax
; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
-; X32-NEXT: subl $16, %esp
+; X32-NEXT: subl $12, %esp
; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
; X32-NEXT: movl %edi, %esi
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: subl %esi, %ebx
; X32-NEXT: movl %edi, %eax
; X32-NEXT: subl %ecx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: movl %ebp, %ecx
; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: imull %eax, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, %edx
-; X32-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: imull %ebx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: imull %ebx, %eax
+; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X32-NEXT: subl %ebp, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: imull %ebx, %ecx
-; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl %eax, %ecx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: imull %edx, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: imull %ebp, %edi
; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: imull %ebp, %eax
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: addl %eax, %edi
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: imull %eax, %edx
+; X32-NEXT: addl %edx, %edi
; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, %eax
-; X32-NEXT: addl $16, %esp
+; X32-NEXT: addl $12, %esp
; X32-NEXT: popl %ebx
; X32-NEXT: popl %ebp
; X32-NEXT: retl
; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11
; WIN64-NEXT: subl %r12d, %r11d
; WIN64-NEXT: imull %edx, %r11d
+; WIN64-NEXT: addl %r9d, %r11d
; WIN64-NEXT: leal (%r14,%r15), %edx
-; WIN64-NEXT: # kill: def $r14d killed $r14d killed $r14
-; WIN64-NEXT: subl %r15d, %r14d
-; WIN64-NEXT: imull %esi, %r14d
-; WIN64-NEXT: addl %r11d, %r14d
+; WIN64-NEXT: movl %r14d, %r9d
+; WIN64-NEXT: subl %r15d, %r9d
+; WIN64-NEXT: imull %esi, %r9d
+; WIN64-NEXT: addl %r11d, %r9d
; WIN64-NEXT: addl %ecx, %eax
; WIN64-NEXT: imull %r8d, %eax
; WIN64-NEXT: imull %ebx, %r10d
+; WIN64-NEXT: addl %r10d, %eax
; WIN64-NEXT: imull %edi, %edx
-; WIN64-NEXT: addl %r10d, %edx
; WIN64-NEXT: addl %edx, %eax
-; WIN64-NEXT: addl %r14d, %eax
; WIN64-NEXT: addl %r9d, %eax
; WIN64-NEXT: popq %rbx
; WIN64-NEXT: retq
; LINUXOSX64-NEXT: leal (%r13,%r14), %r11d
; LINUXOSX64-NEXT: movl %r13d, %r12d
; LINUXOSX64-NEXT: subl %r14d, %r12d
-; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r14d
; LINUXOSX64-NEXT: imull %edx, %r12d
-; LINUXOSX64-NEXT: movl %r15d, %edx
-; LINUXOSX64-NEXT: subl %r14d, %edx
-; LINUXOSX64-NEXT: imull %esi, %edx
-; LINUXOSX64-NEXT: addl %r12d, %edx
+; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; LINUXOSX64-NEXT: addl %r9d, %r12d
+; LINUXOSX64-NEXT: movl %r15d, %r9d
+; LINUXOSX64-NEXT: subl %edx, %r9d
+; LINUXOSX64-NEXT: imull %esi, %r9d
+; LINUXOSX64-NEXT: addl %r12d, %r9d
; LINUXOSX64-NEXT: addl %ecx, %eax
; LINUXOSX64-NEXT: imull %r8d, %eax
; LINUXOSX64-NEXT: imull %r10d, %r11d
-; LINUXOSX64-NEXT: addl %r15d, %r14d
-; LINUXOSX64-NEXT: imull %edi, %r14d
-; LINUXOSX64-NEXT: addl %r11d, %r14d
-; LINUXOSX64-NEXT: addl %r14d, %eax
+; LINUXOSX64-NEXT: addl %r11d, %eax
+; LINUXOSX64-NEXT: addl %r15d, %edx
+; LINUXOSX64-NEXT: imull %edi, %edx
; LINUXOSX64-NEXT: addl %edx, %eax
; LINUXOSX64-NEXT: addl %r9d, %eax
; LINUXOSX64-NEXT: retq
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0]
; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02]
-; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
-; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT: addq %rdx, %rcx # encoding: [0x48,0x01,0xd1]
-; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
-; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT: leaq -1(%rax,%rdx), %rax # encoding: [0x48,0x8d,0x44,0x10,0xff]
+; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
+; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
; X64: # %bb.0:
; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
; X64-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
-; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0]
+; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02]
-; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
-; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
+; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
+; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
+; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01]
; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02]
-; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
-; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT: addq %rdx, %rcx # encoding: [0x48,0x01,0xd1]
-; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
-; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
+; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT: leaq -1(%rax,%rdx), %rax # encoding: [0x48,0x8d,0x44,0x10,0xff]
+; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
+; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
+; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
; X64: # %bb.0:
; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
; X64-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
-; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01]
+; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02]
-; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
-; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
+; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
+; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
+; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0]
; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X86-NEXT: vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
-; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
-; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT: addl %edx, %ecx # encoding: [0x01,0xd1]
-; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
-; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT: leal -1(%eax,%edx), %eax # encoding: [0x8d,0x44,0x10,0xff]
+; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT: leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0]
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
-; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
-; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT: addl %edx, %ecx # encoding: [0x01,0xd1]
-; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
-; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT: leal -1(%rax,%rdx), %eax # encoding: [0x8d,0x44,0x10,0xff]
+; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
+; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
+; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X86-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
-; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6]
-; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0]
-; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6]
+; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0]
-; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
+; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
-; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
-; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
+; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8]
; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01]
; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X86-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
-; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
-; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT: addl %edx, %ecx # encoding: [0x01,0xd1]
-; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
-; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT: leal -1(%eax,%edx), %eax # encoding: [0x8d,0x44,0x10,0xff]
+; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT: leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01]
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
-; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
-; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT: addl %edx, %ecx # encoding: [0x01,0xd1]
-; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
-; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT: leal -1(%rax,%rdx), %eax # encoding: [0x8d,0x44,0x10,0xff]
+; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
+; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
+; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X86-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
-; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6]
-; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0]
-; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6]
+; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0]
-; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
+; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
-; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
-; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
+; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8]
; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
; CHECK-NEXT: vmulph %zmm2, %zmm0, %zmm0
; CHECK-NEXT: vfmadd213ph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm0
; CHECK-NEXT: vmulph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm2
+; CHECK-NEXT: vmulph %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vmulph %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vmulph %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
%2 = fdiv fast <32 x half> %a1, %1
; CHECK-NEXT: kmovd %k0, %esi
; CHECK-NEXT: vcmpnltsh {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: andb %cl, %dl
; CHECK-NEXT: andb %sil, %al
; CHECK-NEXT: andb %dl, %al
-; CHECK-NEXT: andb %cl, %al
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%res1 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 2, i8 -1, i32 4)
define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind {
; X64-LABEL: pr52561:
; X64: # %bb.0:
-; X64-NEXT: vpbroadcastd {{.*#+}} ymm4 = [112,112,112,112,112,112,112,112]
-; X64-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; X64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; X64-NEXT: vpaddd %ymm4, %ymm3, %ymm2
; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-32, %esp
; X86-NEXT: subl $32, %esp
+; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X86-NEXT: vpaddd 8(%ebp), %ymm1, %ymm1
-; X86-NEXT: vpbroadcastd {{.*#+}} ymm3 = [112,112,112,112,112,112,112,112]
-; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm2
+; X86-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT: vmovsh %xmm0, %xmm2, %xmm0
; CHECK-NEXT: vpbroadcastq %rdi, %zmm3
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vpmovsxdq %ymm2, %zmm2
-; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm4
-; CHECK-NEXT: vpaddq %zmm2, %zmm4, %zmm2
+; CHECK-NEXT: vpaddq %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm3
+; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: vmovsh %xmm1, (%rax)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc2]
; X86-NEXT: vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmov_qb_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc2]
; X64-NEXT: vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc2]
; X86-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovsqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovs_qb_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc2]
; X64-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovsqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc2]
; X86-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovusqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovus_qb_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc2]
; X64-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovusqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc2]
; X86-NEXT: vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc2]
; X64-NEXT: vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc2]
; X86-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovsqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc2]
; X64-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovsqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc2]
; X86-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovusqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc2]
; X64-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovusqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc2]
; X86-NEXT: vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmov_qw_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc2]
; X64-NEXT: vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc2]
; X86-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovsqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovs_qw_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc2]
; X64-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovsqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc2]
; X86-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovusqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovus_qw_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc2]
; X64-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovusqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc2]
; X86-NEXT: vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc2]
; X64-NEXT: vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc2]
; X86-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovsqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc2]
; X64-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovsqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc2]
; X86-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovusqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc2]
; X64-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovusqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc2]
; X86-NEXT: vpmovqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
+; X86-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
; X86-NEXT: vpmovqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc0]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc2]
; X64-NEXT: vpmovqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
+; X64-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
; X64-NEXT: vpmovqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc0]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc2]
; X86-NEXT: vpmovsqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
+; X86-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
; X86-NEXT: vpmovsqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc0]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc2]
; X64-NEXT: vpmovsqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
+; X64-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
; X64-NEXT: vpmovsqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc0]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc2]
; X86-NEXT: vpmovusqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
+; X86-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
; X86-NEXT: vpmovusqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc0]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc2]
; X64-NEXT: vpmovusqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
+; X64-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
; X64-NEXT: vpmovusqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc0]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc2]
; X86-NEXT: vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmov_db_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc2]
; X64-NEXT: vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc2]
; X86-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovsdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovs_db_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc2]
; X64-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovsdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc2]
; X86-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovusdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovus_db_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc2]
; X64-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovusdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc2]
; X86-NEXT: vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc2]
; X64-NEXT: vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc2]
; X86-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovsdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc2]
; X64-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovsdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc2]
; X86-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X86-NEXT: vpmovusdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc0]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc2]
; X64-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
+; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
; X64-NEXT: vpmovusdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc0]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc2]
; X86-NEXT: vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmov_dw_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc2]
; X64-NEXT: vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc2]
; X86-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovsdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovs_dw_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc2]
; X64-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovsdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc2]
; X86-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovusdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pmovus_dw_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc2]
; X64-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovusdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc2]
; X86-NEXT: vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc2]
; X64-NEXT: vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc2]
; X86-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovsdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc2]
; X64-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovsdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc2]
; X86-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
+; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X86-NEXT: vpmovusdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc0]
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc2]
; X64-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
+; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
; X64-NEXT: vpmovusdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc0]
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
; X86-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
; X86-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
-; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3]
-; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
; X64-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
; X64-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
-; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3]
-; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask)
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
; X86-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
; X86-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
-; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3]
-; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
; X64-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
; X64-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
-; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3]
-; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
; X64-LABEL: blsmask_through2:
; X64: # %bb.0: # %entry
; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: xorl %edx, %edi
-; X64-NEXT: xorl %esi, %edi
; X64-NEXT: leal -1(%rsi), %eax
+; X64-NEXT: xorl %edx, %edi
; X64-NEXT: xorl %edi, %eax
+; X64-NEXT: xorl %esi, %eax
; X64-NEXT: retq
entry:
%sub = add nsw i32 %b, -1
;
; X64-LABEL: blsmask_through3:
; X64: # %bb.0: # %entry
-; X64-NEXT: xorq %rdx, %rdi
-; X64-NEXT: xorq %rcx, %rdi
-; X64-NEXT: xorq %rsi, %rdi
; X64-NEXT: leaq -1(%rsi), %rax
+; X64-NEXT: xorq %rdx, %rdi
; X64-NEXT: xorq %rdi, %rax
+; X64-NEXT: xorq %rsi, %rcx
+; X64-NEXT: xorq %rcx, %rax
; X64-NEXT: retq
entry:
%sub = add nsw i64 %b, -1
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: addl $-1, %edi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: xorl %ebx, %ebp
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: adcl $-1, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: xorl %ebp, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %edi, %eax
-; X86-NEXT: xorl %ebp, %esi
+; X86-NEXT: xorl %ebx, %esi
; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: imull %eax, %ebp
; X86-NEXT: mull %edi
-; X86-NEXT: imull %edi, %ebp
-; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: orl %esi, %edx
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: popl %esi
; X64-LABEL: blsi_through2:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl %edx, %edi
-; X64-NEXT: andl %esi, %edi
; X64-NEXT: negl %eax
+; X64-NEXT: andl %edx, %edi
; X64-NEXT: andl %edi, %eax
+; X64-NEXT: andl %esi, %eax
; X64-NEXT: retq
entry:
%sub = sub i32 0, %b
; X64-LABEL: blsi_through3:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: andq %rdx, %rdi
-; X64-NEXT: andq %rsi, %rdi
; X64-NEXT: negq %rax
+; X64-NEXT: andq %rdx, %rdi
; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: andq %rsi, %rax
; X64-NEXT: retq
entry:
%sub = sub i64 0, %b
; X86-NEXT: andl %eax, %ecx
; X86-NEXT: imull %edx, %ebx
; X86-NEXT: imull %eax, %edi
-; X86-NEXT: addl %ebx, %edi
; X86-NEXT: mull %edx
; X86-NEXT: addl %edi, %edx
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: orl %esi, %edx
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: popl %esi
; X64-LABEL: blsr_through2:
; X64: # %bb.0: # %entry
; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: andl %edx, %edi
-; X64-NEXT: andl %esi, %edi
; X64-NEXT: leal -1(%rsi), %eax
+; X64-NEXT: andl %edx, %edi
; X64-NEXT: andl %edi, %eax
+; X64-NEXT: andl %esi, %eax
; X64-NEXT: retq
entry:
%sub = add nsw i32 %b, -1
;
; X64-LABEL: blsr_through3:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: andq %rdx, %rdi
-; X64-NEXT: andq %rcx, %rdi
-; X64-NEXT: andq %rsi, %rdi
-; X64-NEXT: negq %rax
-; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: andq %rsi, %rcx
+; X64-NEXT: negq %rsi
+; X64-NEXT: andq %rdx, %rax
+; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: andq %rcx, %rax
; X64-NEXT: retq
entry:
%sub = sub nsw i64 0, %b
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: addl $-1, %edi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: andl %ebx, %ebp
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: adcl $-1, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: andl %ebp, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl %edi, %eax
-; X86-NEXT: andl %ebp, %esi
+; X86-NEXT: andl %ebx, %esi
; X86-NEXT: andl %eax, %ecx
-; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: imull %eax, %ebp
; X86-NEXT: mull %edi
-; X86-NEXT: imull %edi, %ebp
-; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: orl %esi, %edx
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: popl %esi
; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
; AVX1-NEXT: vmovdqu %xmm3, 16(%rdi)
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: pushl %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: calll __divdi3
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %ecx, 4(%edx)
; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: mull %ebp
-; X86-NEXT: imull %ebp, %ecx
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: imull %eax, %ebp
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: imull %ebx, %ecx
; X86-NEXT: addl %edx, %ecx
; X86-NEXT: subl %eax, %esi
; X86-NEXT: sbbl %ecx, %edi
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $152, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: sarl $31, %ebp
; X86-NEXT: movl %eax, %edx
-; X86-NEXT: xorl %esi, %edx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl %eax, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: xorl %ebp, %edi
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: xorl %ebx, %esi
+; X86-NEXT: movl %ebp, %edx
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ebp, %ebx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: sbbl %ecx, %ebx
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: subl %ebp, %edi
+; X86-NEXT: sbbl %ebp, %ebx
+; X86-NEXT: sbbl %ebp, %edx
+; X86-NEXT: sbbl %ebp, %esi
+; X86-NEXT: xorl %eax, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: orl %ebp, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: sete %cl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: bsrl %edi, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: bsrl %ebp, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: sete %al
+; X86-NEXT: orb %cl, %al
+; X86-NEXT: movb %al, (%esp) # 1-byte Spill
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
-; X86-NEXT: testl %edi, %edi
-; X86-NEXT: cmovnel %eax, %edx
-; X86-NEXT: bsrl %ebx, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: bsrl %ebp, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: addl $32, %edi
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: cmovnel %eax, %ecx
-; X86-NEXT: addl $64, %ecx
+; X86-NEXT: cmovnel %edx, %edi
+; X86-NEXT: addl $64, %edi
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %ebp
-; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ebp
+; X86-NEXT: cmovnel %ecx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: bsrl %ebp, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: addl $32, %edx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: bsrl %ebp, %edi
-; X86-NEXT: xorl $31, %edi
-; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: xorl $31, %esi
-; X86-NEXT: addl $32, %esi
-; X86-NEXT: testl %ebp, %ebp
-; X86-NEXT: cmovnel %edi, %esi
-; X86-NEXT: addl $64, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: orl %ebx, %edi
-; X86-NEXT: cmovnel %edx, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: subl %esi, %ecx
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %ebp, %ebp
+; X86-NEXT: addl $64, %edx
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: subl %edx, %edi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %eax
; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: movl $0, %esi
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: movl $127, %edx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ecx, %edx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: sbbl %ebp, %edx
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %edx
-; X86-NEXT: setb %dl
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT: cmovnel %edi, %ebx
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: cmovnel %edi, %esi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: setb %cl
+; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovnel %edi, %eax
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: jne .LBB4_8
-; X86-NEXT: # %bb.1: # %_udiv-special-cases
+; X86-NEXT: cmovnel %esi, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: cmovnel %esi, %ebp
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovnel %esi, %eax
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: jne .LBB4_1
+; X86-NEXT: # %bb.8: # %_udiv-special-cases
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: xorl $127, %edx
-; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: je .LBB4_8
-; X86-NEXT: # %bb.2: # %udiv-bb1
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: je .LBB4_9
+; X86-NEXT: # %bb.5: # %udiv-bb1
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: xorb $127, %al
; X86-NEXT: movb %al, %ch
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 144(%esp,%edi), %edx
-; X86-NEXT: movl 148(%esp,%edi), %ebx
+; X86-NEXT: movsbl %al, %esi
+; X86-NEXT: movl 144(%esp,%esi), %edx
+; X86-NEXT: movl 148(%esp,%esi), %ebx
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shldl %cl, %edx, %ebx
; X86-NEXT: shll %cl, %edx
; X86-NEXT: notb %cl
-; X86-NEXT: movl 140(%esp,%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%esp,%edi), %edx
+; X86-NEXT: movl 140(%esp,%esi), %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shrl %edi
+; X86-NEXT: shrl %cl, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 136(%esp,%esi), %edx
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: shll %cl, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl $1, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: jae .LBB4_3
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.6:
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: jmp .LBB4_7
-; X86-NEXT: .LBB4_3: # %udiv-preheader
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: jmp .LBB4_9
+; X86-NEXT: .LBB4_2: # %udiv-preheader
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movb %dl, %ch
; X86-NEXT: andb $7, %ch
; X86-NEXT: movb %dl, %cl
; X86-NEXT: andb $15, %cl
; X86-NEXT: movzbl %cl, %edx
; X86-NEXT: movl 100(%esp,%edx), %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%edx), %ebp
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %ebp, %edx
+; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%edx), %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shrdl %cl, %esi, %edx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: movl 88(%esp,%eax), %ebx
-; X86-NEXT: movl 92(%esp,%eax), %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esp,%ebx), %ebp
+; X86-NEXT: movl 92(%esp,%ebx), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: shrl %cl, %eax
; X86-NEXT: notb %cl
-; X86-NEXT: addl %ebp, %ebp
-; X86-NEXT: shll %cl, %ebp
-; X86-NEXT: orl %eax, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shrl %cl, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: shrdl %cl, %eax, %ebx
+; X86-NEXT: shrdl %cl, %ebx, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: .p2align 4, 0x90
-; X86-NEXT: .LBB4_4: # %udiv-do-while
+; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ebp
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: shldl $1, %ebx, %edx
-; X86-NEXT: shldl $1, %edi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl $1, %ebp, %edx
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
; X86-NEXT: orl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: shldl $1, %esi, %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: addl %esi, %esi
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl %edi, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: subl %ecx, %ebp
; X86-NEXT: sbbl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: sbbl %esi, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $-1, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: jne .LBB4_4
-; X86-NEXT: # %bb.5:
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: jne .LBB4_3
+; X86-NEXT: # %bb.4:
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %ebx
-; X86-NEXT: orl %ecx, %ebx
-; X86-NEXT: shldl $1, %eax, %esi
-; X86-NEXT: orl %ecx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: .LBB4_8: # %udiv-end
+; X86-NEXT: addl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: .LBB4_9: # %udiv-end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: xorl %ecx, %ebx
-; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: xorl %ecx, %edi
; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: subl %ecx, %esi
; X86-NEXT: sbbl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: sbbl %ecx, %edi
; X86-NEXT: sbbl %ecx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: movl %esi, (%ecx)
; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: movl %esi, 8(%ecx)
+; X86-NEXT: movl %edi, 8(%ecx)
; X86-NEXT: movl %ebx, 12(%ecx)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %esi, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: setb %bl
+; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: setb (%esp) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull %eax, %ecx
; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT: addl %ecx, %edi
; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: imull %edx, %ebp
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: mull %edx
-; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: imull %ebp, %edi
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: adcl %ecx, %edi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl %ebp, %edi
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: sbbl %edi, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %esi, (%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
; X86-NEXT: addl $152, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X64-NEXT: movq %rax, (%rbx)
; X64-NEXT: imulq %rax, %r14
; X64-NEXT: mulq %r15
+; X64-NEXT: addq %r14, %rdx
; X64-NEXT: imulq %r15, %rcx
-; X64-NEXT: addq %r14, %rcx
; X64-NEXT: addq %rdx, %rcx
; X64-NEXT: subq %rax, %r13
; X64-NEXT: sbbq %rcx, %r12
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: pushl %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: calll __udivdi3
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %ecx, 4(%edx)
; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: mull %ebp
-; X86-NEXT: imull %ebp, %ecx
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: imull %eax, %ebp
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: imull %ebx, %ecx
; X86-NEXT: addl %edx, %ecx
; X86-NEXT: subl %eax, %esi
; X86-NEXT: sbbl %ecx, %edi
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $132, %esp
+; X86-NEXT: subl $136, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: orl %ebp, %eax
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sete (%esp) # 1-byte Folded Spill
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sete %cl
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: orl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: bsrl %ebp, %esi
-; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: sete %al
+; X86-NEXT: orb %cl, %al
+; X86-NEXT: movb %al, (%esp) # 1-byte Spill
+; X86-NEXT: bsrl %ebp, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: addl $32, %ecx
; X86-NEXT: testl %ebp, %ebp
-; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: bsrl %edi, %esi
-; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: addl $32, %edi
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: cmovnel %edx, %edi
+; X86-NEXT: addl $64, %edi
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: orl %ebp, %edx
+; X86-NEXT: cmovnel %ecx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: bsrl %ebp, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: movl %esi, %ebx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %edi, %edi
-; X86-NEXT: cmovnel %esi, %ecx
-; X86-NEXT: addl $64, %ecx
-; X86-NEXT: orl %ebp, %ebx
+; X86-NEXT: testl %ebp, %ebp
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: bsrl %eax, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: bsrl %edx, %edx
+; X86-NEXT: bsrl {{[0-9]+}}(%esp), %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: addl $32, %edx
; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: bsrl %ebp, %edi
-; X86-NEXT: xorl $31, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: bsrl %eax, %esi
-; X86-NEXT: xorl $31, %esi
-; X86-NEXT: addl $32, %esi
-; X86-NEXT: testl %ebp, %ebp
-; X86-NEXT: cmovnel %edi, %esi
-; X86-NEXT: addl $64, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: orl %ebx, %edi
-; X86-NEXT: cmovnel %edx, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: subl %esi, %ecx
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl $64, %edx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: orl %ebp, %esi
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: subl %edx, %edi
+; X86-NEXT: movl %ebp, %edx
; X86-NEXT: movl $0, %ebp
; X86-NEXT: sbbl %ebp, %ebp
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
; X86-NEXT: movl $0, %eax
; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: movl $127, %edx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ecx, %edx
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: sbbl %ebp, %edx
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: sbbl %esi, %edx
-; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: setb %dl
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT: orb (%esp), %dl # 1-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmovnel %edi, %eax
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: cmovnel %edi, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmovnel %edi, %ebx
-; X86-NEXT: cmovel {{[0-9]+}}(%esp), %edi
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: setb %cl
+; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload
+; X86-NEXT: cmovnel %ebx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: cmovnel %ebx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmovnel %ebx, %edi
+; X86-NEXT: cmovel {{[0-9]+}}(%esp), %ebx
; X86-NEXT: jne .LBB4_1
; X86-NEXT: # %bb.8: # %_udiv-special-cases
-; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: xorl $127, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: je .LBB4_9
; X86-NEXT: # %bb.5: # %udiv-bb1
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: xorb $127, %al
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb %al, %ch
; X86-NEXT: andb $7, %ch
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 124(%esp,%eax), %edx
-; X86-NEXT: movl 128(%esp,%eax), %esi
+; X86-NEXT: movl 128(%esp,%eax), %edx
+; X86-NEXT: movl 132(%esp,%eax), %esi
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shldl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shll %cl, %edx
; X86-NEXT: notb %cl
-; X86-NEXT: movl 120(%esp,%eax), %ebp
-; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: movl 124(%esp,%eax), %edi
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: shrl %esi
; X86-NEXT: shrl %cl, %esi
; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl 116(%esp,%eax), %edi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 120(%esp,%eax), %ebx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edi, %ebp
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: addl $1, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: addl $1, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %edx, %edx
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: jmp .LBB4_7
; X86-NEXT: .LBB4_1:
-; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: jmp .LBB4_9
; X86-NEXT: .LBB4_2: # %udiv-preheader
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 80(%esp,%eax), %edx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl 84(%esp,%eax), %esi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 76(%esp,%eax), %edi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%eax), %edi
; X86-NEXT: movl %edi, %ebx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %edx, %ebx
-; X86-NEXT: movl 68(%esp,%eax), %ebp
-; X86-NEXT: movl 72(%esp,%eax), %edx
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl 72(%esp,%eax), %ebp
+; X86-NEXT: movl 76(%esp,%eax), %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shrl %cl, %eax
; X86-NEXT: notb %cl
; X86-NEXT: orl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shrdl %cl, %edx, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: shldl $1, %ebx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: shldl $1, %edx, %ebx
-; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: shldl $1, %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: orl %ebp, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %esi
-; X86-NEXT: orl %ebp, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl $1, %eax, %ecx
-; X86-NEXT: orl %ebp, %ecx
+; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: andl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: sbbl %eax, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %ebp, %ebx
; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: sbbl %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $-1, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: adcl $-1, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebp, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebp, %ecx
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: jne .LBB4_3
; X86-NEXT: # %bb.4:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %esi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: shldl $1, %ebp, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: shldl $1, %edi, %ebp
-; X86-NEXT: orl %ecx, %ebp
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: orl %ecx, %ebx
; X86-NEXT: .LBB4_9: # %udiv-end
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %edi, (%ecx)
-; X86-NEXT: movl %ebp, 4(%ecx)
-; X86-NEXT: movl %esi, 8(%ecx)
-; X86-NEXT: movl %eax, 12(%ecx)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: imull %eax, %esi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: imull %ebp, %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: imull %edi, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: imull %ebx, %ebp
; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: imull %esi, %ecx
+; X86-NEXT: addl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: imull %ebx, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: imull %ebx, %eax
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: imull %edi, %esi
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: addl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebp, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: adcl %ecx, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: subl (%esp), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl %ebp, %esi
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: subl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: sbbl %edx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %ebp, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: addl $132, %esp
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: addl $136, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X64-NEXT: movq %rax, (%rbx)
; X64-NEXT: imulq %rax, %r14
; X64-NEXT: mulq %r15
+; X64-NEXT: addq %r14, %rdx
; X64-NEXT: imulq %r15, %rcx
-; X64-NEXT: addq %r14, %rcx
; X64-NEXT: addq %rdx, %rcx
; X64-NEXT: subq %rax, %r13
; X64-NEXT: sbbq %rcx, %r12
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X32-NEXT: imull $-1431655765, %edi, %esi # imm = 0xAAAAAAAB
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: addl %esi, %edx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: imull $-858993460, %ecx, %ecx # imm = 0xCCCCCCCC
-; X32-NEXT: imull $-858993459, %edi, %esi # imm = 0xCCCCCCCD
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: addl %esi, %edx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %edx
; X32-NEXT: imull $-286331154, %ecx, %ecx # imm = 0xEEEEEEEE
-; X32-NEXT: imull $-286331153, %edi, %esi # imm = 0xEEEEEEEF
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: addl %esi, %edx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull $-286331153, %edi, %ecx # imm = 0xEEEEEEEF
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: retl
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: imull $-252645136, %ecx, %ecx # imm = 0xF0F0F0F0
-; X32-NEXT: imull $-252645135, %edi, %esi # imm = 0xF0F0F0F1
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: addl %esi, %edx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull $-252645135, %edi, %ecx # imm = 0xF0F0F0F1
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %edx
; X32-NEXT: imull $-16843010, %ecx, %ecx # imm = 0xFEFEFEFE
-; X32-NEXT: imull $-16843009, %esi, %esi # imm = 0xFEFEFEFF
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: addl %esi, %edx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull $-16843009, %esi, %ecx # imm = 0xFEFEFEFF
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: imull $-16711936, %ecx, %ecx # imm = 0xFF00FF00
-; X32-NEXT: imull $-16711935, %edi, %esi # imm = 0xFF00FF01
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: addl %esi, %edx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull $-16711935, %edi, %ecx # imm = 0xFF00FF01
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X32-NEXT: imull $-1431655765, %edi, %esi # imm = 0xAAAAAAAB
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: addl %esi, %edx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X86-64-NEXT: imulq %rdi, %rcx
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %rcx, %r8
; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-NEXT: imulq %rcx, %r9
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r9, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
entry:
; X86-64-NEXT: imulq %rdi, %rcx
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %rcx, %r8
; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-NEXT: imulq %rcx, %r9
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r9, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
entry:
; X86-64-NEXT: movabsq $-1229782938247303441, %r8 # imm = 0xEEEEEEEEEEEEEEEF
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %rcx, %r8
; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-NEXT: movabsq $-1229782938247303441, %r10 # imm = 0xEEEEEEEEEEEEEEEF
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r9, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
entry:
; X86-64-NEXT: imulq %rdi, %rcx
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %rcx, %r8
; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-NEXT: imulq %rcx, %r9
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r9, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
entry:
; X86-64-NEXT: movabsq $-72340172838076673, %r8 # imm = 0xFEFEFEFEFEFEFEFF
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %rcx, %r8
; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-NEXT: movabsq $-72340172838076673, %r10 # imm = 0xFEFEFEFEFEFEFEFF
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r9, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
entry:
; X86-64-NEXT: imulq %rdi, %rcx
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %rcx, %r8
; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-NEXT: imulq %rcx, %r9
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r9, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
entry:
; X86-64-NEXT: movabsq $-281479271743489, %r8 # imm = 0xFFFEFFFEFFFEFFFF
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %rcx, %r8
; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-NEXT: movabsq $-281479271743489, %r10 # imm = 0xFFFEFFFEFFFEFFFF
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r9, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
entry:
; X86-64-NEXT: imulq %rdi, %rcx
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %rcx, %r8
; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-NEXT: imulq %rcx, %r9
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r9, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
entry:
; X86-64-NEXT: imulq %rdi, %rcx
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %rcx, %r8
; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-NEXT: imulq %rcx, %r9
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r9, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
entry:
;
; MPIC-LABEL: neg_0x80000001:
; MPIC: # %bb.0: # %entry
-; MPIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx
-; MPIC-NEXT: movabsq $foo@GOTOFF, %rdx
+; MPIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax
+; MPIC-NEXT: movabsq $foo@GOTOFF, %rcx
+; MPIC-NEXT: addq %rax, %rcx
; MPIC-NEXT: movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF
-; MPIC-NEXT: addq %rdx, %rax
; MPIC-NEXT: addq %rcx, %rax
; MPIC-NEXT: retq
entry:
; NOBMI-LABEL: not_a_masked_merge2:
; NOBMI: # %bb.0:
; NOBMI-NEXT: movl %edi, %eax
+; NOBMI-NEXT: orl %edi, %esi
; NOBMI-NEXT: notl %eax
; NOBMI-NEXT: andl %edx, %eax
; NOBMI-NEXT: orl %esi, %eax
-; NOBMI-NEXT: orl %edi, %eax
; NOBMI-NEXT: retq
;
; BMI-LABEL: not_a_masked_merge2:
; BMI: # %bb.0:
+; BMI-NEXT: orl %edi, %esi
; BMI-NEXT: andnl %edx, %edi, %eax
; BMI-NEXT: orl %esi, %eax
-; BMI-NEXT: orl %edi, %eax
; BMI-NEXT: retq
%not_an_and0 = or i32 %a0, %a1
%not = xor i32 %a0, -1
; CHECK-NEXT: .cfi_offset %esi, -20
; CHECK-NEXT: .cfi_offset %edi, -16
; CHECK-NEXT: .cfi_offset %ebx, -12
-; CHECK-NEXT: movl $-1028477379, %edi # imm = 0xC2B2AE3D
-; CHECK-NEXT: movl $668265295, %ebx # imm = 0x27D4EB4F
-; CHECK-NEXT: movl a, %eax
-; CHECK-NEXT: cmpl $0, (%eax)
+; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D
+; CHECK-NEXT: movl $668265295, %ecx # imm = 0x27D4EB4F
+; CHECK-NEXT: movl a, %edi
+; CHECK-NEXT: cmpl $0, (%edi)
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1: # %if.then
-; CHECK-NEXT: movl 8(%eax), %edi
-; CHECK-NEXT: movl 12(%eax), %esi
-; CHECK-NEXT: movl %esi, %edx
-; CHECK-NEXT: shldl $1, %edi, %edx
-; CHECK-NEXT: orl %esi, %edx
-; CHECK-NEXT: leal (%edi,%edi), %ecx
-; CHECK-NEXT: orl %edi, %ecx
-; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 16(%eax), %ecx
-; CHECK-NEXT: movl 20(%eax), %esi
-; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shldl $2, %ecx, %edi
-; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shldl $31, %ecx, %edi
-; CHECK-NEXT: shll $2, %ecx
-; CHECK-NEXT: orl %edi, %ecx
+; CHECK-NEXT: movl 8(%edi), %esi
+; CHECK-NEXT: movl 12(%edi), %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shldl $1, %esi, %edx
+; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: leal (%esi,%esi), %eax
+; CHECK-NEXT: orl %esi, %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 16(%edi), %ebx
+; CHECK-NEXT: movl 20(%edi), %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: shldl $2, %ebx, %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: shldl $31, %eax, %ebx
+; CHECK-NEXT: shll $2, %eax
+; CHECK-NEXT: orl %ebx, %eax
; CHECK-NEXT: shrl %esi
; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: adcl %edx, %esi
-; CHECK-NEXT: movl 28(%eax), %ecx
-; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 24(%eax), %eax
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 24(%edi), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D
-; CHECK-NEXT: imull %eax, %ecx
-; CHECK-NEXT: mull %ebx
-; CHECK-NEXT: movl %eax, %edi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D
; CHECK-NEXT: imull %eax, %ebx
-; CHECK-NEXT: addl %ecx, %ebx
-; CHECK-NEXT: addl %edx, %ebx
-; CHECK-NEXT: imull $1336530590, %eax, %ecx # imm = 0x4FA9D69E
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: imull $-2056954758, %edx, %eax # imm = 0x85655C7A
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: mull %ecx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: addl %ebx, %edx
+; CHECK-NEXT: movl 28(%edi), %edi
+; CHECK-NEXT: imull %edi, %ecx
+; CHECK-NEXT: addl %edx, %ecx
; CHECK-NEXT: movl $1336530590, %edx # imm = 0x4FA9D69E
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; CHECK-NEXT: movl %ebx, %eax
; CHECK-NEXT: mull %edx
-; CHECK-NEXT: addl %edx, %ecx
-; CHECK-NEXT: shrdl $3, %ebx, %edi
-; CHECK-NEXT: sarl $3, %ebx
-; CHECK-NEXT: orl %ecx, %ebx
-; CHECK-NEXT: orl %eax, %edi
-; CHECK-NEXT: imull $326129324, %edi, %eax # imm = 0x137056AC
-; CHECK-NEXT: imull $-66860409, %ebx, %ecx # imm = 0xFC03CA87
-; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: imull $-2056954758, %ebx, %ebx # imm = 0x85655C7A
+; CHECK-NEXT: addl %edx, %ebx
+; CHECK-NEXT: imull $1336530590, %edi, %edx # imm = 0x4FA9D69E
+; CHECK-NEXT: addl %ebx, %edx
+; CHECK-NEXT: shrdl $3, %ecx, %esi
+; CHECK-NEXT: sarl $3, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: orl %eax, %esi
; CHECK-NEXT: movl $-66860409, %ebx # imm = 0xFC03CA87
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: mull %ebx
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: imull $326129324, %esi, %eax # imm = 0x137056AC
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-NEXT: movl %edi, b
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: mull %ebx
-; CHECK-NEXT: addl %edx, %ecx
-; CHECK-NEXT: xorl %esi, %ecx
-; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-NEXT: imull $326129324, %edi, %esi # imm = 0x137056AC
+; CHECK-NEXT: addl %edx, %esi
; CHECK-NEXT: movl %ecx, b+4
-; CHECK-NEXT: imull $326129324, %eax, %edx # imm = 0x137056AC
; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
-; CHECK-NEXT: addl %edx, %ecx
-; CHECK-NEXT: movl %eax, b
-; CHECK-NEXT: mull %ebx
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2: # %if.else
-; CHECK-NEXT: xorl b+4, %edi
-; CHECK-NEXT: xorl b, %ebx
-; CHECK-NEXT: movl $1419758215, %ecx # imm = 0x549FCA87
-; CHECK-NEXT: movl %ebx, %eax
-; CHECK-NEXT: mull %ecx
-; CHECK-NEXT: imull $93298681, %ebx, %esi # imm = 0x58F9FF9
-; CHECK-NEXT: imull $1419758215, %edi, %ecx # imm = 0x549FCA87
-; CHECK-NEXT: addl %esi, %ecx
+; CHECK-NEXT: xorl b+4, %ebx
+; CHECK-NEXT: xorl b, %ecx
+; CHECK-NEXT: movl $1419758215, %edx # imm = 0x549FCA87
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: mull %edx
+; CHECK-NEXT: imull $93298681, %ecx, %esi # imm = 0x58F9FF9
+; CHECK-NEXT: addl %edx, %esi
+; CHECK-NEXT: imull $1419758215, %ebx, %ecx # imm = 0x549FCA87
; CHECK-NEXT: .LBB0_3: # %if.end
-; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: addl %esi, %ecx
; CHECK-NEXT: addl $-1028477341, %eax # imm = 0xC2B2AE63
; CHECK-NEXT: adcl $-2048144777, %ecx # imm = 0x85EBCA77
; CHECK-NEXT: movl %eax, b
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
; CHECK-NEXT: addq %rdi, %rsi
; CHECK-NEXT: addq %rbp, %rdx
+; CHECK-NEXT: addq %rsi, %rdx
; CHECK-NEXT: addq %rbx, %rcx
; CHECK-NEXT: addq %r8, %rax
; CHECK-NEXT: addq %rcx, %rax
; CHECK-NEXT: addq %rdx, %rax
-; CHECK-NEXT: addq %rsi, %rax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %r8d
; GNUX32-NEXT: addq %rdi, %rsi
; GNUX32-NEXT: addq %rbp, %rdx
+; GNUX32-NEXT: addq %rsi, %rdx
; GNUX32-NEXT: addq %rbx, %rcx
; GNUX32-NEXT: addq %r8, %rax
; GNUX32-NEXT: addq %rcx, %rax
; GNUX32-NEXT: addq %rdx, %rax
-; GNUX32-NEXT: addq %rsi, %rax
; GNUX32-NEXT: popq %rbx
; GNUX32-NEXT: .cfi_def_cfa_offset 16
; GNUX32-NEXT: popq %rbp
define cc 11 {i32, i32, i32} @addfour(i32 %hp, i32 %p, i32 %x, i32 %y, i32 %z) nounwind {
; CHECK-LABEL: addfour:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: retl
entry:
define cc 11 {i64, i64, i64} @addfour(i64 %hp, i64 %p, i64 %x, i64 %y, i64 %z, i64 %w) nounwind {
; CHECK-LABEL: addfour:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: leaq (%rdx,%rcx), %rax
-; CHECK-NEXT: addq %r8, %rax
-; CHECK-NEXT: addq %rsi, %rax
+; CHECK-NEXT: addq %rsi, %rdx
+; CHECK-NEXT: leaq (%rcx,%r8), %rax
+; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
entry:
%0 = add i64 %x, %y
; SSE2-LABEL: PR37890_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSSE3-SLOW-LABEL: PR37890_v16i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
-; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
-; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm0
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movd %xmm1, %eax
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movd %xmm0, %eax
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v16i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: paddd %xmm3, %xmm1
-; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSSE3-FAST-NEXT: paddd %xmm2, %xmm0
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: movd %xmm0, %eax
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1
+; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1
+; SSSE3-FAST-NEXT: movd %xmm1, %eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v16i32:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-LABEL: PR37890_v8f64:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm2, %xmm1
+; SSE2-NEXT: addpd %xmm2, %xmm0
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-LABEL: PR37890_v8f64:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: addpd %xmm3, %xmm1
-; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm0
; SSSE3-SLOW-NEXT: addpd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-FAST-LABEL: PR37890_v8f64:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: addpd %xmm3, %xmm1
-; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1
+; SSSE3-FAST-NEXT: addpd %xmm2, %xmm0
; SSSE3-FAST-NEXT: addpd %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
; SSE2-LABEL: PR37890_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
-; SSE2-NEXT: addps %xmm2, %xmm1
+; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-LABEL: PR37890_v16f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
-; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-FAST-LABEL: PR37890_v16f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: addps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
+; SSSE3-FAST-NEXT: addps %xmm2, %xmm0
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; X86-SSE42-LABEL: test_reduce_v16i32:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1
-; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm1
-; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i32:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-SSE42-LABEL: test_reduce_v16i32:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1
-; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm1
-; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i32:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-SSE2-LABEL: test_reduce_v32i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v32i16:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxsw %xmm3, %xmm1
-; X86-SSE42-NEXT: pmaxsw %xmm2, %xmm1
-; X86-SSE42-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pmaxsw %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: xorl $32767, %eax ## imm = 0x7FFF
; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X64-SSE2-LABEL: test_reduce_v32i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v32i16:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxsw %xmm3, %xmm1
-; X64-SSE42-NEXT: pmaxsw %xmm2, %xmm1
-; X64-SSE42-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pmaxsw %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: xorl $32767, %eax ## imm = 0x7FFF
; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X86-SSE42-LABEL: test_reduce_v64i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
-; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm1
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
-; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: xorb $127, %al
; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-SSE42-LABEL: test_reduce_v64i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
-; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm1
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
-; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: xorb $127, %al
; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-SSE42-LABEL: test_reduce_v16i32:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminsd %xmm3, %xmm1
-; X86-SSE42-NEXT: pminsd %xmm2, %xmm1
-; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT: pminsd %xmm2, %xmm0
; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i32:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-SSE42-LABEL: test_reduce_v16i32:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminsd %xmm3, %xmm1
-; X64-SSE42-NEXT: pminsd %xmm2, %xmm1
-; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE42-NEXT: pminsd %xmm2, %xmm0
; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i32:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-SSE2-LABEL: test_reduce_v32i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pminsw %xmm3, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm2, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT: pminsw %xmm2, %xmm0
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v32i16:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminsw %xmm3, %xmm1
-; X86-SSE42-NEXT: pminsw %xmm2, %xmm1
-; X86-SSE42-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: xorl $32768, %eax ## imm = 0x8000
; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X64-SSE2-LABEL: test_reduce_v32i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pminsw %xmm3, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm2, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT: pminsw %xmm2, %xmm0
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v32i16:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminsw %xmm3, %xmm1
-; X64-SSE42-NEXT: pminsw %xmm2, %xmm1
-; X64-SSE42-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: xorl $32768, %eax ## imm = 0x8000
; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X86-SSE42-LABEL: test_reduce_v64i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminsb %xmm3, %xmm1
-; X86-SSE42-NEXT: pminsb %xmm2, %xmm1
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
-; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: addb $-128, %al
; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpminsb %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-SSE42-LABEL: test_reduce_v64i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminsb %xmm3, %xmm1
-; X64-SSE42-NEXT: pminsb %xmm2, %xmm1
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
-; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: addb $-128, %al
; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpminsb %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-SSE42-LABEL: test_reduce_v16i32:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1
-; X86-SSE42-NEXT: pmaxud %xmm2, %xmm1
-; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i32:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-SSE42-LABEL: test_reduce_v16i32:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1
-; X64-SSE42-NEXT: pmaxud %xmm2, %xmm1
-; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i32:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-SSE42-LABEL: test_reduce_v32i16:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1
-; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm1
-; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pcmpeqd %xmm0, %xmm0
-; X86-SSE42-NEXT: pxor %xmm1, %xmm0
-; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm0, %xmm1
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: notl %eax
; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-SSE42-LABEL: test_reduce_v32i16:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1
-; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm1
-; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pcmpeqd %xmm0, %xmm0
-; X64-SSE42-NEXT: pxor %xmm1, %xmm0
-; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm0, %xmm1
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: notl %eax
; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-SSE2-LABEL: test_reduce_v64i8:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1
-; X86-SSE2-NEXT: pmaxub %xmm2, %xmm1
-; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm2, %xmm0
; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: psrlw $8, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v64i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1
-; X86-SSE42-NEXT: pmaxub %xmm2, %xmm1
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: pcmpeqd %xmm0, %xmm0
-; X86-SSE42-NEXT: pxor %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrlw $8, %xmm1
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: notb %al
; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-SSE2-LABEL: test_reduce_v64i8:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1
-; X64-SSE2-NEXT: pmaxub %xmm2, %xmm1
-; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm2, %xmm0
; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE2-NEXT: psrlw $8, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v64i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1
-; X64-SSE42-NEXT: pmaxub %xmm2, %xmm1
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: pcmpeqd %xmm0, %xmm0
-; X64-SSE42-NEXT: pxor %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrlw $8, %xmm1
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: notb %al
; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-SSE42-LABEL: test_reduce_v16i32:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminud %xmm3, %xmm1
-; X86-SSE42-NEXT: pminud %xmm2, %xmm1
-; X86-SSE42-NEXT: pminud %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT: pminud %xmm2, %xmm0
; X86-SSE42-NEXT: pminud %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pminud %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i32:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-SSE42-LABEL: test_reduce_v16i32:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminud %xmm3, %xmm1
-; X64-SSE42-NEXT: pminud %xmm2, %xmm1
-; X64-SSE42-NEXT: pminud %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE42-NEXT: pminud %xmm2, %xmm0
; X64-SSE42-NEXT: pminud %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pminud %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i32:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-SSE42-LABEL: test_reduce_v32i16:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminuw %xmm3, %xmm1
-; X86-SSE42-NEXT: pminuw %xmm2, %xmm1
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE42-NEXT: retl
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE42-LABEL: test_reduce_v32i16:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminuw %xmm3, %xmm1
-; X64-SSE42-NEXT: pminuw %xmm2, %xmm1
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE42-NEXT: retq
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-LABEL: test_reduce_v64i8:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pminub %xmm3, %xmm1
-; X86-SSE2-NEXT: pminub %xmm2, %xmm1
-; X86-SSE2-NEXT: pminub %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT: pminub %xmm2, %xmm0
; X86-SSE2-NEXT: pminub %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminub %xmm0, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-SSE2-NEXT: pminub %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: psrlw $8, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
; X86-SSE2-NEXT: pminub %xmm0, %xmm1
-; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v64i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminub %xmm3, %xmm1
-; X86-SSE42-NEXT: pminub %xmm2, %xmm1
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm2, %xmm0
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
-; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax
; X86-SSE42-NEXT: retl
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-SSE2-LABEL: test_reduce_v64i8:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pminub %xmm3, %xmm1
-; X64-SSE2-NEXT: pminub %xmm2, %xmm1
-; X64-SSE2-NEXT: pminub %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT: pminub %xmm2, %xmm0
; X64-SSE2-NEXT: pminub %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminub %xmm0, %xmm1
-; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X64-SSE2-NEXT: pminub %xmm1, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE2-NEXT: psrlw $8, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
; X64-SSE2-NEXT: pminub %xmm0, %xmm1
-; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v64i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminub %xmm3, %xmm1
-; X64-SSE42-NEXT: pminub %xmm2, %xmm1
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm2, %xmm0
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
-; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax
; X64-SSE42-NEXT: retq
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
-; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
+; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
-; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3]
+; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: retq
%5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
%6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: orl %ecx, %ebx
; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: orl %edx, %ebp
+; X86-NEXT: orl %ecx, %ebp
; X86-NEXT: shrdl $28, %ebx, %ebp
; X86-NEXT: jne .LBB0_1
; X86-NEXT: # %bb.2: # %exit
;
; X86-LABEL: test6:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: shll $5, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $5, %ecx
+; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl $33, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %edx
; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: popl %esi
-; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, 33
; X64-NEXT: leal 1(%rax,%rcx), %ecx
; X64-NEXT: leal (%rax,%rax), %edx
; X64-NEXT: addl %eax, %edx
-; X64-NEXT: addl %eax, %edx
-; X64-NEXT: addl %eax, %edx
-; X64-NEXT: addl %eax, %edx
-; X64-NEXT: addl %ecx, %edx
-; X64-NEXT: movl %edx, 16(%rdi)
+; X64-NEXT: addl %edx, %ecx
+; X64-NEXT: addl %edx, %ecx
+; X64-NEXT: movl %ecx, 16(%rdi)
; X64-NEXT: retq
;
; X86-LABEL: foo_loop:
; X86-NEXT: leal 1(%ecx,%esi), %edx
; X86-NEXT: leal (%ecx,%ecx), %esi
; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl %esi, 16(%eax)
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl %edx, 16(%eax)
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %edi
define i32 @test2(ptr %p, i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $edx killed $edx def $rdx
; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: addl %edx, %ecx
-; CHECK-NEXT: addl %esi, %ecx
; CHECK-NEXT: movl %ecx, (%rdi)
; CHECK-NEXT: subl %edx, %eax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
entry:
%0 = add i32 %a, %b
define i32 @test3(ptr %p, i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: test3:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $edx killed $edx def $rdx
; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: addl %edx, %ecx
-; CHECK-NEXT: addl %esi, %ecx
; CHECK-NEXT: movl %ecx, (%rdi)
; CHECK-NEXT: subl %edx, %eax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
entry:
%0 = add i32 %a, %b
; CHECK-LABEL: test6:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: addq %rdx, %rcx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: movq %rcx, (%rdi)
; CHECK-NEXT: subq %rdx, %rax
; CHECK-NEXT: retq
; CHECK-LABEL: test7:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: addq %rdx, %rcx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: movq %rcx, (%rdi)
; CHECK-NEXT: subq %rdx, %rax
; CHECK-NEXT: retq
; CHECK-NEXT: sarq %cl, %rdi
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NEXT: shrq %cl, %rsi
-; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: orq %rdi, %rax
+; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
%sh1 = ashr i64 %x0, %y
%sh2 = lshr i64 %x1, %y
; CHECK-NEXT: shrq %cl, %rdi
; CHECK-NEXT: movl %r8d, %ecx
; CHECK-NEXT: shrq %cl, %rsi
-; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: orq %rdi, %rax
+; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
%sh1 = lshr i64 %x0, %y
%sh2 = lshr i64 %x1, %w
; CHECK-NEXT: sarq %cl, %rdi
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NEXT: shrq %cl, %rsi
-; CHECK-NEXT: xorq %rsi, %rax
; CHECK-NEXT: xorq %rdi, %rax
+; CHECK-NEXT: xorq %rsi, %rax
; CHECK-NEXT: retq
%sh1 = ashr i64 %x0, %y
%sh2 = lshr i64 %x1, %y
; CHECK-NEXT: shrq %cl, %rdi
; CHECK-NEXT: movl %r8d, %ecx
; CHECK-NEXT: shrq %cl, %rsi
-; CHECK-NEXT: xorq %rsi, %rax
; CHECK-NEXT: xorq %rdi, %rax
+; CHECK-NEXT: xorq %rsi, %rax
; CHECK-NEXT: retq
%sh1 = lshr i64 %x0, %y
%sh2 = lshr i64 %x1, %w
; CHECK-NEXT: shrq %cl, %rdi
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NEXT: sarq %cl, %rsi
-; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: andq %rdi, %rax
+; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: retq
%sh1 = lshr i64 %x0, %y
%sh2 = ashr i64 %x1, %y
; CHECK-NEXT: shrq %cl, %rdi
; CHECK-NEXT: movl %r8d, %ecx
; CHECK-NEXT: shrq %cl, %rsi
-; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: andq %rdi, %rax
+; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: retq
%sh1 = lshr i64 %x0, %y
%sh2 = lshr i64 %x1, %w
define <16 x float> @foo(<16 x float> %x) {
; CHECK-LABEL: foo:
; CHECK: ## %bb.0: ## %bb
-; CHECK-NEXT: movaps %xmm3, %xmm9
-; CHECK-NEXT: movaps %xmm2, %xmm5
+; CHECK-NEXT: xorps %xmm5, %xmm5
+; CHECK-NEXT: cvttps2dq %xmm3, %xmm7
+; CHECK-NEXT: movaps %xmm3, %xmm4
+; CHECK-NEXT: cmpltps %xmm5, %xmm4
+; CHECK-NEXT: movaps {{.*#+}} xmm8 = [13,14,15,16]
+; CHECK-NEXT: movaps %xmm4, %xmm6
+; CHECK-NEXT: orps %xmm8, %xmm6
+; CHECK-NEXT: cvtdq2ps %xmm7, %xmm3
+; CHECK-NEXT: andps %xmm8, %xmm3
+; CHECK-NEXT: andps %xmm6, %xmm3
+; CHECK-NEXT: andnps %xmm4, %xmm6
+; CHECK-NEXT: cvttps2dq %xmm2, %xmm4
+; CHECK-NEXT: movaps %xmm2, %xmm7
+; CHECK-NEXT: cmpltps %xmm5, %xmm7
+; CHECK-NEXT: movaps {{.*#+}} xmm8 = [9,10,11,12]
+; CHECK-NEXT: movaps %xmm7, %xmm9
+; CHECK-NEXT: orps %xmm8, %xmm9
+; CHECK-NEXT: cvtdq2ps %xmm4, %xmm2
+; CHECK-NEXT: andps %xmm8, %xmm2
+; CHECK-NEXT: andps %xmm9, %xmm2
+; CHECK-NEXT: andnps %xmm7, %xmm9
+; CHECK-NEXT: cvttps2dq %xmm1, %xmm4
+; CHECK-NEXT: cmpltps %xmm5, %xmm1
+; CHECK-NEXT: movaps {{.*#+}} xmm7 = [5,6,7,8]
+; CHECK-NEXT: movaps %xmm1, %xmm8
+; CHECK-NEXT: orps %xmm7, %xmm8
+; CHECK-NEXT: cvtdq2ps %xmm4, %xmm4
+; CHECK-NEXT: andps %xmm7, %xmm4
+; CHECK-NEXT: andps %xmm8, %xmm4
+; CHECK-NEXT: andnps %xmm1, %xmm8
+; CHECK-NEXT: cvttps2dq %xmm0, %xmm1
+; CHECK-NEXT: cmpltps %xmm5, %xmm0
+; CHECK-NEXT: movaps {{.*#+}} xmm5 = [1,2,3,4]
; CHECK-NEXT: movaps %xmm0, %xmm7
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: movaps %xmm3, %xmm2
-; CHECK-NEXT: cmpltps %xmm0, %xmm2
-; CHECK-NEXT: movaps %xmm2, %xmm4
-; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; CHECK-NEXT: movaps %xmm4, %xmm8
-; CHECK-NEXT: andnps %xmm2, %xmm8
-; CHECK-NEXT: movaps %xmm5, %xmm6
-; CHECK-NEXT: cmpltps %xmm0, %xmm6
-; CHECK-NEXT: movaps {{.*#+}} xmm11 = [9,10,11,12]
-; CHECK-NEXT: movaps %xmm6, %xmm2
-; CHECK-NEXT: orps %xmm11, %xmm2
-; CHECK-NEXT: movaps %xmm2, %xmm10
-; CHECK-NEXT: andnps %xmm6, %xmm10
-; CHECK-NEXT: cvttps2dq %xmm1, %xmm12
-; CHECK-NEXT: cmpltps %xmm0, %xmm1
-; CHECK-NEXT: movaps {{.*#+}} xmm13 = [5,6,7,8]
-; CHECK-NEXT: movaps %xmm1, %xmm6
-; CHECK-NEXT: orps %xmm13, %xmm6
-; CHECK-NEXT: movaps %xmm6, %xmm14
-; CHECK-NEXT: andnps %xmm1, %xmm14
-; CHECK-NEXT: cvttps2dq %xmm7, %xmm3
-; CHECK-NEXT: cmpltps %xmm0, %xmm7
-; CHECK-NEXT: movaps {{.*#+}} xmm15 = [1,2,3,4]
-; CHECK-NEXT: movaps %xmm7, %xmm0
-; CHECK-NEXT: orps %xmm15, %xmm0
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: andnps %xmm7, %xmm1
-; CHECK-NEXT: andps %xmm15, %xmm0
-; CHECK-NEXT: cvtdq2ps %xmm3, %xmm3
-; CHECK-NEXT: andps %xmm3, %xmm0
-; CHECK-NEXT: movaps {{.*#+}} xmm3 = [1,1,1,1]
-; CHECK-NEXT: andps %xmm3, %xmm1
-; CHECK-NEXT: orps %xmm1, %xmm0
-; CHECK-NEXT: andps %xmm13, %xmm6
-; CHECK-NEXT: cvtdq2ps %xmm12, %xmm1
-; CHECK-NEXT: andps %xmm1, %xmm6
-; CHECK-NEXT: andps %xmm3, %xmm14
-; CHECK-NEXT: orps %xmm14, %xmm6
-; CHECK-NEXT: andps %xmm11, %xmm2
-; CHECK-NEXT: cvttps2dq %xmm5, %xmm1
+; CHECK-NEXT: orps %xmm5, %xmm7
; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1
-; CHECK-NEXT: andps %xmm1, %xmm2
-; CHECK-NEXT: andps %xmm3, %xmm10
-; CHECK-NEXT: orps %xmm10, %xmm2
-; CHECK-NEXT: andps %xmm3, %xmm8
-; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; CHECK-NEXT: cvttps2dq %xmm9, %xmm1
-; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1
-; CHECK-NEXT: andps %xmm1, %xmm4
+; CHECK-NEXT: andps %xmm5, %xmm1
+; CHECK-NEXT: andps %xmm7, %xmm1
+; CHECK-NEXT: andnps %xmm0, %xmm7
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-NEXT: andps %xmm0, %xmm7
+; CHECK-NEXT: orps %xmm7, %xmm1
+; CHECK-NEXT: andps %xmm0, %xmm8
; CHECK-NEXT: orps %xmm8, %xmm4
-; CHECK-NEXT: movaps %xmm6, %xmm1
-; CHECK-NEXT: movaps %xmm4, %xmm3
+; CHECK-NEXT: andps %xmm0, %xmm9
+; CHECK-NEXT: orps %xmm9, %xmm2
+; CHECK-NEXT: andps %xmm0, %xmm6
+; CHECK-NEXT: orps %xmm6, %xmm3
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: movaps %xmm4, %xmm1
; CHECK-NEXT: retq
bb:
%v3 = icmp slt <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, zeroinitializer
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB3_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB3_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB7_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: addq $32, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB7_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB11_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm10
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm11, %xmm12, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12
; AVX1-NEXT: vpaddd %xmm4, %xmm12, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm4
-; AVX1-NEXT: vpaddd %xmm3, %xmm11, %xmm3
+; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4
+; AVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm11, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB11_1
; AVX1-NEXT: # %bb.2: # %middle.block
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm5
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: movdqu (%rdx), %xmm0
; SSE2-NEXT: movdqu (%rcx), %xmm2
; SSE2-NEXT: pmaddwd %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: movdqu (%r8), %xmm0
-; SSE2-NEXT: movdqu (%r9), %xmm3
-; SSE2-NEXT: pmaddwd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: movdqu (%r10), %xmm0
-; SSE2-NEXT: movdqu (%rax), %xmm1
+; SSE2-NEXT: movdqu (%r9), %xmm1
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movdqu (%r10), %xmm0
+; SSE2-NEXT: movdqu (%rax), %xmm2
+; SSE2-NEXT: pmaddwd %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX-NEXT: vmovdqu (%r8), %xmm2
-; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovdqu (%r10), %xmm2
-; AVX-NEXT: vpmaddwd (%rax), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovdqu (%r8), %xmm1
+; AVX-NEXT: vpmaddwd (%r9), %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqu (%r10), %xmm1
+; AVX-NEXT: vpmaddwd (%rax), %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm2 {%k2}
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm1 {%k1}
-; AVX512F-NEXT: vpaddd %ymm1, %ymm1, %ymm0
-; AVX512F-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm0
+; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: gather_v8i32_v8i32:
; AVX512VL-NEXT: vpgatherdd c(,%ymm1), %ymm2 {%k2}
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [28,28,28,28,28,28,28,28]
; AVX512VL-NEXT: vpgatherdd c(,%ymm1), %ymm0 {%k1}
-; AVX512VL-NEXT: vpaddd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm1
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%1 = icmp eq <8 x i32> %trigger, zeroinitializer
%2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> getelementptr (%struct.a, <8 x ptr> <ptr @c, ptr @c, ptr @c, ptr @c, ptr @c, ptr @c, ptr @c, ptr @c>, <8 x i64> zeroinitializer, i32 0, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>), i32 4, <8 x i1> %1, <8 x i32> undef)
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
+; KNL_64-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: movw $255, %ax
; KNL_32-NEXT: kmovw %eax, %k1
; SKX_SMALL: # %bb.0: # %entry
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm1
+; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
-; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm1
+; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
+; KNL_64-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: movw $255, %ax
; KNL_32-NEXT: kmovw %eax, %k1
; SKX_SMALL: # %bb.0: # %entry
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm1
+; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
-; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm1
+; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
-; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_pr28312:
; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
-; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; SKX_32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT: pand %xmm0, %xmm3
-; X86-SSE2-NEXT: pand %xmm2, %xmm3
-; X86-SSE2-NEXT: pmovmskb %xmm3, %eax
+; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm0, %xmm2
+; X86-SSE2-NEXT: pmovmskb %xmm2, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
; X86-SSE41-NEXT: pxor %xmm0, %xmm2
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
+; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT: pxor %xmm1, %xmm3
-; X86-SSE41-NEXT: por %xmm0, %xmm3
-; X86-SSE41-NEXT: por %xmm2, %xmm3
-; X86-SSE41-NEXT: ptest %xmm3, %xmm3
+; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT: pxor %xmm1, %xmm2
+; X86-SSE41-NEXT: por %xmm0, %xmm2
+; X86-SSE41-NEXT: ptest %xmm2, %xmm2
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT: pand %xmm0, %xmm3
-; X86-SSE2-NEXT: pand %xmm2, %xmm3
-; X86-SSE2-NEXT: pmovmskb %xmm3, %eax
+; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm0, %xmm2
+; X86-SSE2-NEXT: pmovmskb %xmm2, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
; X86-SSE41-NEXT: pxor %xmm0, %xmm2
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
+; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT: pxor %xmm1, %xmm3
-; X86-SSE41-NEXT: por %xmm0, %xmm3
-; X86-SSE41-NEXT: por %xmm2, %xmm3
-; X86-SSE41-NEXT: ptest %xmm3, %xmm3
+; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT: pxor %xmm1, %xmm2
+; X86-SSE41-NEXT: por %xmm0, %xmm2
+; X86-SSE41-NEXT: ptest %xmm2, %xmm2
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm0, %xmm2
; X86-SSE2-NEXT: pmovmskb %xmm2, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE41-NEXT: por %xmm1, %xmm0
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT: por %xmm1, %xmm2
; X86-SSE41-NEXT: por %xmm0, %xmm2
; X86-SSE41-NEXT: ptest %xmm2, %xmm2
; X86-SSE41-NEXT: setne %al
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: movdqu (%ecx), %xmm1
-; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm2
-; X86-SSE2-NEXT: movdqu (%eax), %xmm0
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT: movdqu (%eax), %xmm2
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm1
-; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm2
-; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm3
-; X86-SSE2-NEXT: movdqu 47(%ecx), %xmm2
-; X86-SSE2-NEXT: movdqu 47(%eax), %xmm4
-; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm4
-; X86-SSE2-NEXT: pand %xmm3, %xmm4
-; X86-SSE2-NEXT: pand %xmm1, %xmm4
-; X86-SSE2-NEXT: pand %xmm0, %xmm4
-; X86-SSE2-NEXT: pmovmskb %xmm4, %eax
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1
+; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqu 47(%ecx), %xmm1
+; X86-SSE2-NEXT: movdqu 47(%eax), %xmm3
+; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm0, %xmm3
+; X86-SSE2-NEXT: pmovmskb %xmm3, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT: movdqu (%ecx), %xmm1
-; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm2
-; X86-SSE41-NEXT: movdqu (%eax), %xmm0
+; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1
+; X86-SSE41-NEXT: movdqu (%eax), %xmm2
+; X86-SSE41-NEXT: pxor %xmm0, %xmm2
+; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
-; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT: pxor %xmm2, %xmm1
-; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm2
-; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT: pxor %xmm2, %xmm3
-; X86-SSE41-NEXT: movdqu 47(%ecx), %xmm2
-; X86-SSE41-NEXT: movdqu 47(%eax), %xmm4
-; X86-SSE41-NEXT: pxor %xmm2, %xmm4
-; X86-SSE41-NEXT: por %xmm3, %xmm4
-; X86-SSE41-NEXT: por %xmm1, %xmm4
-; X86-SSE41-NEXT: por %xmm0, %xmm4
-; X86-SSE41-NEXT: ptest %xmm4, %xmm4
+; X86-SSE41-NEXT: por %xmm2, %xmm0
+; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1
+; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT: pxor %xmm1, %xmm2
+; X86-SSE41-NEXT: movdqu 47(%ecx), %xmm1
+; X86-SSE41-NEXT: movdqu 47(%eax), %xmm3
+; X86-SSE41-NEXT: pxor %xmm1, %xmm3
+; X86-SSE41-NEXT: por %xmm2, %xmm3
+; X86-SSE41-NEXT: por %xmm0, %xmm3
+; X86-SSE41-NEXT: ptest %xmm3, %xmm3
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE41-NEXT: por %xmm3, %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT: por %xmm2, %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE41-NEXT: por %xmm1, %xmm0
+; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: movdqu (%ecx), %xmm1
-; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm2
-; X86-SSE2-NEXT: movdqu (%eax), %xmm0
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT: movdqu (%eax), %xmm2
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm1
-; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm2
-; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm3
-; X86-SSE2-NEXT: movdqu 48(%ecx), %xmm2
-; X86-SSE2-NEXT: movdqu 48(%eax), %xmm4
-; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm4
-; X86-SSE2-NEXT: pand %xmm3, %xmm4
-; X86-SSE2-NEXT: pand %xmm1, %xmm4
-; X86-SSE2-NEXT: pand %xmm0, %xmm4
-; X86-SSE2-NEXT: pmovmskb %xmm4, %eax
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1
+; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqu 48(%ecx), %xmm1
+; X86-SSE2-NEXT: movdqu 48(%eax), %xmm3
+; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm0, %xmm3
+; X86-SSE2-NEXT: pmovmskb %xmm3, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT: movdqu (%ecx), %xmm1
-; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm2
-; X86-SSE41-NEXT: movdqu (%eax), %xmm0
+; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1
+; X86-SSE41-NEXT: movdqu (%eax), %xmm2
+; X86-SSE41-NEXT: pxor %xmm0, %xmm2
+; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
-; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT: pxor %xmm2, %xmm1
-; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm2
-; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT: pxor %xmm2, %xmm3
-; X86-SSE41-NEXT: movdqu 48(%ecx), %xmm2
-; X86-SSE41-NEXT: movdqu 48(%eax), %xmm4
-; X86-SSE41-NEXT: pxor %xmm2, %xmm4
-; X86-SSE41-NEXT: por %xmm3, %xmm4
-; X86-SSE41-NEXT: por %xmm1, %xmm4
-; X86-SSE41-NEXT: por %xmm0, %xmm4
-; X86-SSE41-NEXT: ptest %xmm4, %xmm4
+; X86-SSE41-NEXT: por %xmm2, %xmm0
+; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1
+; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT: pxor %xmm1, %xmm2
+; X86-SSE41-NEXT: movdqu 48(%ecx), %xmm1
+; X86-SSE41-NEXT: movdqu 48(%eax), %xmm3
+; X86-SSE41-NEXT: pxor %xmm1, %xmm3
+; X86-SSE41-NEXT: por %xmm2, %xmm3
+; X86-SSE41-NEXT: por %xmm0, %xmm3
+; X86-SSE41-NEXT: ptest %xmm3, %xmm3
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE41-NEXT: por %xmm3, %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT: por %xmm2, %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE41-NEXT: por %xmm1, %xmm0
+; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm3
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0
; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
; X64-SSE2-NEXT: movdqu 32(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm1
; X64-SSE2-NEXT: pand %xmm0, %xmm1
-; X64-SSE2-NEXT: pand %xmm3, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: sete %al
; X64-SSE41-NEXT: pxor %xmm0, %xmm3
; X64-SSE41-NEXT: movdqu 16(%rsi), %xmm0
; X64-SSE41-NEXT: pxor %xmm1, %xmm0
+; X64-SSE41-NEXT: por %xmm3, %xmm0
; X64-SSE41-NEXT: movdqu 32(%rsi), %xmm1
; X64-SSE41-NEXT: pxor %xmm2, %xmm1
; X64-SSE41-NEXT: por %xmm0, %xmm1
-; X64-SSE41-NEXT: por %xmm3, %xmm1
; X64-SSE41-NEXT: ptest %xmm1, %xmm1
; X64-SSE41-NEXT: sete %al
; X64-SSE41-NEXT: retq
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm3
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0
; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
; X64-SSE2-NEXT: movdqu 32(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm1
; X64-SSE2-NEXT: pand %xmm0, %xmm1
-; X64-SSE2-NEXT: pand %xmm3, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: sete %al
; X64-SSE41-NEXT: pxor %xmm0, %xmm3
; X64-SSE41-NEXT: movdqu 16(%rsi), %xmm0
; X64-SSE41-NEXT: pxor %xmm1, %xmm0
+; X64-SSE41-NEXT: por %xmm3, %xmm0
; X64-SSE41-NEXT: movdqu 32(%rsi), %xmm1
; X64-SSE41-NEXT: pxor %xmm2, %xmm1
; X64-SSE41-NEXT: por %xmm0, %xmm1
-; X64-SSE41-NEXT: por %xmm3, %xmm1
; X64-SSE41-NEXT: ptest %xmm1, %xmm1
; X64-SSE41-NEXT: sete %al
; X64-SSE41-NEXT: retq
; X64-AVX-NEXT: vmovdqu 32(%rdi), %xmm2
; X64-AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1
; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT: vpxor 32(%rsi), %xmm2, %xmm2
-; X64-AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor 32(%rsi), %xmm2, %xmm1
; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: sete %al
; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm2
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm0, %xmm2
; X64-SSE2-NEXT: pmovmskb %xmm2, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm2
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE41-NEXT: por %xmm1, %xmm0
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT: por %xmm1, %xmm2
; X64-SSE41-NEXT: por %xmm0, %xmm2
; X64-SSE41-NEXT: ptest %xmm2, %xmm2
; X64-SSE41-NEXT: setne %al
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm4
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0
; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: movdqu 32(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm1
; X64-SSE2-NEXT: movdqu 47(%rsi), %xmm2
; X64-SSE2-NEXT: pcmpeqb %xmm3, %xmm2
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm0, %xmm2
-; X64-SSE2-NEXT: pand %xmm4, %xmm2
; X64-SSE2-NEXT: pmovmskb %xmm2, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE41-NEXT: pxor %xmm0, %xmm4
; X64-SSE41-NEXT: movdqu 16(%rsi), %xmm0
; X64-SSE41-NEXT: pxor %xmm1, %xmm0
+; X64-SSE41-NEXT: por %xmm4, %xmm0
; X64-SSE41-NEXT: movdqu 32(%rsi), %xmm1
; X64-SSE41-NEXT: pxor %xmm2, %xmm1
; X64-SSE41-NEXT: movdqu 47(%rsi), %xmm2
; X64-SSE41-NEXT: pxor %xmm3, %xmm2
; X64-SSE41-NEXT: por %xmm1, %xmm2
; X64-SSE41-NEXT: por %xmm0, %xmm2
-; X64-SSE41-NEXT: por %xmm4, %xmm2
; X64-SSE41-NEXT: ptest %xmm2, %xmm2
; X64-SSE41-NEXT: setne %al
; X64-SSE41-NEXT: retq
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X64-SSE2-NEXT: pand %xmm3, %xmm2
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: pand %xmm1, %xmm0
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: sete %al
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X64-SSE41-NEXT: por %xmm3, %xmm2
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT: por %xmm2, %xmm1
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE41-NEXT: por %xmm1, %xmm0
+; X64-SSE41-NEXT: por %xmm2, %xmm0
; X64-SSE41-NEXT: ptest %xmm0, %xmm0
; X64-SSE41-NEXT: sete %al
; X64-SSE41-NEXT: retq
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm4
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0
; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: movdqu 32(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm1
; X64-SSE2-NEXT: movdqu 48(%rsi), %xmm2
; X64-SSE2-NEXT: pcmpeqb %xmm3, %xmm2
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm0, %xmm2
-; X64-SSE2-NEXT: pand %xmm4, %xmm2
; X64-SSE2-NEXT: pmovmskb %xmm2, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE41-NEXT: pxor %xmm0, %xmm4
; X64-SSE41-NEXT: movdqu 16(%rsi), %xmm0
; X64-SSE41-NEXT: pxor %xmm1, %xmm0
+; X64-SSE41-NEXT: por %xmm4, %xmm0
; X64-SSE41-NEXT: movdqu 32(%rsi), %xmm1
; X64-SSE41-NEXT: pxor %xmm2, %xmm1
; X64-SSE41-NEXT: movdqu 48(%rsi), %xmm2
; X64-SSE41-NEXT: pxor %xmm3, %xmm2
; X64-SSE41-NEXT: por %xmm1, %xmm2
; X64-SSE41-NEXT: por %xmm0, %xmm2
-; X64-SSE41-NEXT: por %xmm4, %xmm2
; X64-SSE41-NEXT: ptest %xmm2, %xmm2
; X64-SSE41-NEXT: setne %al
; X64-SSE41-NEXT: retq
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X64-SSE2-NEXT: pand %xmm3, %xmm2
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: pand %xmm1, %xmm0
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: sete %al
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X64-SSE41-NEXT: por %xmm3, %xmm2
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT: por %xmm2, %xmm1
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE41-NEXT: por %xmm1, %xmm0
+; X64-SSE41-NEXT: por %xmm2, %xmm0
; X64-SSE41-NEXT: ptest %xmm0, %xmm0
; X64-SSE41-NEXT: sete %al
; X64-SSE41-NEXT: retq
; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2
; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1
; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2
-; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm1
; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: setne %al
; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2
-; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm1
; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: setne %al
; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: sete %al
; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: sete %al
; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2
; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2
; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2
; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2
; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
; SSE2-NEXT: paddq %xmm3, %xmm4
; SSE2-NEXT: psllq $32, %xmm4
; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_reg_reg:
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: psubq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlq $1, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm1
; SSE41-NEXT: pmuludq %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm3, %xmm0
-; SSE41-NEXT: paddq %xmm1, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: psrlq $32, %xmm3
+; SSE41-NEXT: pmuludq %xmm0, %xmm3
+; SSE41-NEXT: paddq %xmm1, %xmm3
+; SSE41-NEXT: psllq $32, %xmm3
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-FALLBACK-NEXT: retq
;
; XOP-LABEL: vec128_i64_signed_reg_reg:
; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i64_signed_reg_reg:
; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
%t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
; SSE2-NEXT: paddq %xmm3, %xmm4
; SSE2-NEXT: psllq $32, %xmm4
; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_unsigned_reg_reg:
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: psubq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlq $1, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm1
; SSE41-NEXT: pmuludq %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm3, %xmm0
-; SSE41-NEXT: paddq %xmm1, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: psrlq $32, %xmm3
+; SSE41-NEXT: pmuludq %xmm0, %xmm3
+; SSE41-NEXT: paddq %xmm1, %xmm3
+; SSE41-NEXT: psllq $32, %xmm3
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-FALLBACK-NEXT: retq
;
; XOP-LABEL: vec128_i64_unsigned_reg_reg:
; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i64_unsigned_reg_reg:
; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
%t3 = icmp ugt <2 x i64> %a1, %a2
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: psubq %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psrlq $1, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: psrlq $1, %xmm0
; SSE2-NEXT: psrlq $33, %xmm3
; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlq $32, %xmm0
-; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: paddq %xmm3, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrlq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm0, %xmm4
+; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: psubq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlq $1, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm1
; SSE41-NEXT: pmuludq %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm1, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: psrlq $32, %xmm2
+; SSE41-NEXT: pmuludq %xmm0, %xmm2
+; SSE41-NEXT: paddq %xmm1, %xmm2
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: retq
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX2-FALLBACK-NEXT: retq
;
; XOP-LABEL: vec128_i64_signed_mem_reg:
; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i64_signed_mem_reg:
; AVX512F-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
%a1 = load <2 x i64>, ptr %a1_addr
; SSE2-NEXT: paddq %xmm2, %xmm4
; SSE2-NEXT: psllq $32, %xmm4
; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm3, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_reg_mem:
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: psubq %xmm5, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: psrlq $1, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm3
; SSE41-NEXT: pmuludq %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: psrlq $32, %xmm2
+; SSE41-NEXT: pmuludq %xmm0, %xmm2
+; SSE41-NEXT: paddq %xmm3, %xmm2
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: retq
; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-FALLBACK-NEXT: retq
;
; XOP-LABEL: vec128_i64_signed_reg_mem:
; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i64_signed_reg_mem:
; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
%a2 = load <2 x i64>, ptr %a2_addr
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: psubq %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psrlq $1, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: psrlq $1, %xmm0
; SSE2-NEXT: psrlq $33, %xmm3
; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlq $32, %xmm0
-; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: paddq %xmm3, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrlq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm0, %xmm4
+; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: psubq %xmm5, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm1
-; SSE41-NEXT: psrlq $1, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm3
; SSE41-NEXT: pmuludq %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm1, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: psrlq $32, %xmm1
+; SSE41-NEXT: pmuludq %xmm0, %xmm1
+; SSE41-NEXT: paddq %xmm3, %xmm1
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: retq
; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-FALLBACK-NEXT: retq
;
; XOP-LABEL: vec128_i64_signed_mem_mem:
; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i64_signed_mem_mem:
; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
%a1 = load <2 x i64>, ptr %a1_addr
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psubusw %xmm1, %xmm3
; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm3
; SSE2-NEXT: paddw %xmm1, %xmm3
-; SSE2-NEXT: paddw %xmm0, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: paddw %xmm3, %xmm0
; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm7, %xmm3
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec256_i64_signed_reg_reg:
; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm7, %xmm3
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm3, %xmm3
-; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3
-; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
; XOPAVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
-; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; XOPAVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec256_i64_signed_reg_reg:
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
%t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
%t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm6, %xmm2
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm8, %xmm2
+; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm8, %xmm3
; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec256_i64_unsigned_reg_reg:
; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm7, %xmm3
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm3, %xmm3
-; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3
-; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
; XOPAVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
-; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; XOPAVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec256_i64_unsigned_reg_reg:
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
%t3 = icmp ugt <4 x i64> %a1, %a2
%t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm7, %xmm4
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec256_i64_signed_mem_reg:
; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm7, %xmm4
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm4, %xmm4
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4
; XOPAVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; AVX512F-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec256_i64_signed_mem_reg:
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
%a1 = load <4 x i64>, ptr %a1_addr
%t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec256_i64_signed_reg_mem:
; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2
; XOPAVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec256_i64_signed_reg_mem:
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
%a2 = load <4 x i64>, ptr %a2_addr
%t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm7, %xmm1
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1
-; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FALLBACK-NEXT: retq
;
; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec256_i64_signed_mem_mem:
; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2
; XOPAVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec256_i64_signed_mem_mem:
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
%a1 = load <4 x i64>, ptr %a1_addr
%a2 = load <4 x i64>, ptr %a2_addr
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmpl %ebp, %eax
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: sbbl %esi, %ecx
-; X86-NEXT: setl %cl
-; X86-NEXT: movzbl %cl, %edx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: sbbl %ebp, %edx
+; X86-NEXT: setl %dl
+; X86-NEXT: movzbl %dl, %ebx
; X86-NEXT: jl .LBB5_1
; X86-NEXT: # %bb.2:
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl %ebp, %esi
; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebp, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: jmp .LBB5_3
; X86-NEXT: .LBB5_1:
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: .LBB5_3:
-; X86-NEXT: negl %edx
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: orl $1, %ebp
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: shrdl $1, %edi, %eax
+; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: shrl %edi
-; X86-NEXT: imull %eax, %edx
; X86-NEXT: imull %ebp, %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: mull %ebp
; X86-NEXT: addl %edi, %edx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpl %ebx, %eax
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: sbbl %esi, %ecx
-; X86-NEXT: setb %cl
-; X86-NEXT: sbbl %edx, %edx
-; X86-NEXT: testb %cl, %cl
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: sbbl %ebp, %edx
+; X86-NEXT: setb %dl
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: testb %dl, %dl
; X86-NEXT: jne .LBB6_1
; X86-NEXT: # %bb.2:
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebp, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: jmp .LBB6_3
; X86-NEXT: .LBB6_1:
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: .LBB6_3:
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: orl $1, %ebx
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: sbbl %ebp, %edi
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: orl $1, %ebp
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: shrdl $1, %edi, %eax
+; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: shrl %edi
-; X86-NEXT: imull %eax, %edx
-; X86-NEXT: imull %ebx, %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: mull %ebx
+; X86-NEXT: imull %ebp, %edi
; X86-NEXT: addl %edi, %edx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl (%edx), %ecx
-; X86-NEXT: movl 4(%edx), %esi
-; X86-NEXT: cmpl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %esi
+; X86-NEXT: movl 4(%ecx), %ecx
+; X86-NEXT: cmpl %esi, %eax
; X86-NEXT: movl %edi, %edx
-; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: sbbl %ecx, %edx
; X86-NEXT: setl %dl
-; X86-NEXT: movzbl %dl, %edx
+; X86-NEXT: movzbl %dl, %ebx
; X86-NEXT: jl .LBB7_1
; X86-NEXT: # %bb.2:
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: jmp .LBB7_3
; X86-NEXT: .LBB7_1:
; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: .LBB7_3:
-; X86-NEXT: negl %edx
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: orl $1, %ebp
-; X86-NEXT: subl %ebx, %eax
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload
; X86-NEXT: shrdl $1, %edi, %eax
+; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: shrl %edi
-; X86-NEXT: imull %eax, %edx
; X86-NEXT: imull %ebp, %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: mull %ebp
; X86-NEXT: addl %edi, %edx
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl (%ecx), %eax
-; X86-NEXT: movl 4(%ecx), %edi
-; X86-NEXT: cmpl %ebp, %eax
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: sbbl %esi, %ecx
-; X86-NEXT: setl %cl
-; X86-NEXT: movzbl %cl, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl (%edx), %eax
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: sbbl %ebp, %edx
+; X86-NEXT: setl %dl
+; X86-NEXT: movzbl %dl, %ebx
; X86-NEXT: jl .LBB8_1
; X86-NEXT: # %bb.2:
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl %ebp, %esi
; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebp, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: jmp .LBB8_3
; X86-NEXT: .LBB8_1:
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: .LBB8_3:
-; X86-NEXT: negl %edx
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: orl $1, %ebp
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: shrdl $1, %edi, %eax
+; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: shrl %edi
-; X86-NEXT: imull %eax, %edx
; X86-NEXT: imull %ebp, %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: mull %ebp
; X86-NEXT: addl %edi, %edx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl 4(%eax), %esi
+; X86-NEXT: movl (%eax), %esi
+; X86-NEXT: movl 4(%eax), %ecx
; X86-NEXT: movl (%edx), %eax
; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: cmpl %ecx, %eax
+; X86-NEXT: cmpl %esi, %eax
; X86-NEXT: movl %edi, %edx
-; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: sbbl %ecx, %edx
; X86-NEXT: setl %dl
-; X86-NEXT: movzbl %dl, %edx
+; X86-NEXT: movzbl %dl, %ebx
; X86-NEXT: jl .LBB9_1
; X86-NEXT: # %bb.2:
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: jmp .LBB9_3
; X86-NEXT: .LBB9_1:
; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: .LBB9_3:
-; X86-NEXT: negl %edx
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: orl $1, %ebp
-; X86-NEXT: subl %ebx, %eax
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload
; X86-NEXT: shrdl $1, %edi, %eax
+; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: shrl %edi
-; X86-NEXT: imull %eax, %edx
; X86-NEXT: imull %ebp, %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: mull %ebp
; X86-NEXT: addl %edi, %edx
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; SSE-LABEL: allones_v64i8_sign:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pmovmskb %xmm1, %eax
; SSE-NEXT: cmpw $-1, %ax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: cmpw $-1, %ax
; AVX1-NEXT: sete %al
; SSE-LABEL: allzeros_v64i8_sign:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: testl %eax, %eax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: sete %al
; SSE-LABEL: allones_v64i8_and1:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: psllw $7, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: psllw $7, %xmm1
+; SSE-NEXT: pmovmskb %xmm1, %eax
; SSE-NEXT: cmpw $-1, %ax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: cmpw $-1, %ax
; SSE-LABEL: allzeros_v64i8_and1:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: psllw $7, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: testl %eax, %eax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; SSE-LABEL: allones_v64i8_and4:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: psllw $5, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: psllw $5, %xmm1
+; SSE-NEXT: pmovmskb %xmm1, %eax
; SSE-NEXT: cmpw $-1, %ax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: cmpw $-1, %ax
; SSE-LABEL: allzeros_v64i8_and4:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: psllw $5, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: psllw $5, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: testl %eax, %eax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
define i64 @test_mul_by_17(i64 %x) {
; X86-LABEL: test_mul_by_17:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: shll $4, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $4, %ecx
+; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl $17, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %edx
; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: popl %esi
-; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X86-NOOPT-LABEL: test_mul_by_17:
; X86-NEXT: leal (%ecx,%eax,4), %esi
; X86-NEXT: movl $22, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %edx
; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: addl %ecx, %ecx
; X86-NEXT: movl $29, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %edx
; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edi
; X86-NEXT: imull %esi, %ebx
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: imull %ecx, %edi
-; X86-NEXT: addl %ebx, %edi
; X86-NEXT: addl %edi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NOOPT-NEXT: movl %esi, %eax
; X86-NOOPT-NEXT: mull %edi
; X86-NOOPT-NEXT: imull %esi, %ebx
+; X86-NOOPT-NEXT: addl %ebx, %edx
; X86-NOOPT-NEXT: imull %ecx, %edi
-; X86-NOOPT-NEXT: addl %ebx, %edi
; X86-NOOPT-NEXT: addl %edi, %edx
; X86-NOOPT-NEXT: popl %esi
; X86-NOOPT-NEXT: popl %edi
define i32 @foo() local_unnamed_addr #0 {
; X86-LABEL: foo:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %ebx
; X86-NEXT: .cfi_def_cfa_offset 12
-; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %edi
; X86-NEXT: .cfi_def_cfa_offset 16
-; X86-NEXT: .cfi_offset %esi, -16
-; X86-NEXT: .cfi_offset %edi, -12
-; X86-NEXT: .cfi_offset %ebx, -8
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 20
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: pushl $0
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $1
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl $2, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: xorl $2, %edi
+; X86-NEXT: orl %esi, %edi
; X86-NEXT: pushl $1
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $3
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $3, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl $3, %ebx
; X86-NEXT: pushl $2
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $4
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl $4, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl $4, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: pushl $2
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $5
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $5, %edi
-; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $3
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $6
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $7, %edi
; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: orl %esi, %edi
; X86-NEXT: pushl $4
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $8
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $8, %ebx
-; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $4
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $9
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $9, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl $9, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: pushl $5
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $10
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $10, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: pushl $5
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $11
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $11, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl $11, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: pushl $6
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $12
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $12, %ebx
-; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $6
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $13
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $15, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: xorl $15, %ebp
+; X86-NEXT: orl %ebx, %ebp
; X86-NEXT: pushl $8
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $16
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl $16, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: xorl $16, %edi
+; X86-NEXT: orl %ebp, %edi
+; X86-NEXT: orl %esi, %edi
; X86-NEXT: pushl $8
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $17
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $17, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl $17, %ebx
; X86-NEXT: pushl $9
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $18
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl $18, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl $18, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: pushl $9
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $19
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $19, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl $19, %ebx
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: pushl $10
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $20
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl $20, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl $20, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: pushl $10
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $21
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $21, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl $21, %ebx
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: pushl $11
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $22
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl $22, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl $22, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: pushl $11
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $23
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $23, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl $23, %ebx
; X86-NEXT: pushl $12
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $24
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl $24, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: xorl $24, %edi
+; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $12
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $25
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $25, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl $25, %ebx
+; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $13
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $26
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl $26, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: xorl $26, %edi
+; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $13
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $27
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $27, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl $27, %ebx
+; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $14
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $28
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl $28, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: xorl $28, %ebp
+; X86-NEXT: orl %ebx, %ebp
; X86-NEXT: pushl $14
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $29
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $29, %edi
-; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: orl %ebp, %edi
+; X86-NEXT: orl %esi, %edi
; X86-NEXT: pushl $15
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $30
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $30, %ebx
-; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $15
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $31
; X86-NEXT: calll mult@PLT
; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_adjust_cfa_offset -8
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl $31, %edi
-; X86-NEXT: orl %ebx, %edi
-; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: pushl $16
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $32
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: xorl $32, %eax
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: setne %cl
; X86-NEXT: negl %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
-; X86-NEXT: .cfi_def_cfa_offset 12
+; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: popl %edi
-; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: subl $400, %esp # imm = 0x190
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 60(%ecx), %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl 56(%ecx), %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl (%ebp), %esi
+; X32-NEXT: movl 60(%ecx), %esi
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl 56(%ecx), %edi
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl (%eax), %ebp
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl 4(%ebp), %ecx
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 4(%eax), %ecx
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %ecx, %ebp
+; X32-NEXT: movl %ecx, %edi
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebp
+; X32-NEXT: mull %edi
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl 48(%esi), %edi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl 48(%edi), %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %ebp
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 52(%esi), %eax
+; X32-NEXT: movl 52(%edi), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebx, %ecx
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 8(%eax), %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl 8(%eax), %ebp
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 12(%eax), %ebx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %esi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl 12(%eax), %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl %ebp, %ebx
+; X32-NEXT: setb (%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %esi, %ebp
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %ebp
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebx, %ecx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %ebp, %ecx
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: setb (%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl 40(%esi), %ebp
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: movl 40(%esi), %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 44(%esi), %ebx
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %edi
+; X32-NEXT: movl 44(%esi), %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ebp
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl 32(%edi), %ecx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: movl 32(%ebp), %edi
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl 36(%edi), %edi
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl 36(%ebp), %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ebp, %ebx
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: adcl %ebx, %edi
; X32-NEXT: setb %bl
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: addl %edi, %ecx
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %esi, %ebx
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl %ebp, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: mull %ebp
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %ecx
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %ebp, %ecx
-; X32-NEXT: movl %edi, %esi
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: movl (%esp), %esi # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
; X32-NEXT: adcl %edi, %eax
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 16(%eax), %esi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl 16(%eax), %edi
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 20(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl 20(%eax), %edx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: mull %edx
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
-; X32-NEXT: setb %cl
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: adcl %edi, %ebp
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %edi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %esi, %ebp
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl %bl, %ecx
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ebp, %ecx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 24(%eax), %ebx
; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 28(%eax), %ecx
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: adcl %ebp, %edi
-; X32-NEXT: setb %bl
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %edi, %ebp
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebp, %esi
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %bl, %edi
+; X32-NEXT: adcl %edi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: addl %ebp, %edi
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl %edi, %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebp, %ecx
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: setb (%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: adcl %esi, %ecx
; X32-NEXT: setb %bl
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebx
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: adcl %ebx, %edi
; X32-NEXT: setb %bl
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: addl %edi, %ecx
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %ebx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl %ebx, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %esi, %ebp
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebp, %edi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebp, %ecx
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ebp
-; X32-NEXT: setb %bl
+; X32-NEXT: movl %ebx, %ecx
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %esi, %ecx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X32-NEXT: adcl %edi, %eax
-; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, %esi
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %ecx
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl 24(%esi), %ebx
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 28(%esi), %ebp
+; X32-NEXT: movl 24(%esi), %ebp
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl 28(%esi), %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ebx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl 16(%esi), %edi
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl 16(%edi), %esi
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 20(%esi), %eax
+; X32-NEXT: movl 20(%edi), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %ebp, %ecx
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebx, %ecx
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, %ebx
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %esi, %ebx
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl %ebp, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %ebp
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebx, %ecx
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: movl %ebp, %ecx
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: adcl %esi, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl 8(%esi), %ebp
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl 8(%esi), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 12(%esi), %ebx
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl 12(%esi), %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl (%esi), %ecx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl 4(%esi), %eax
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: movl (%ebp), %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ebp, %ebx
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl 4(%ebp), %eax
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: adcl %ebx, %edi
; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: addl %edi, %ecx
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %esi, %ebx
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl %ebp, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: mull %ebp
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %ecx
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %ebp, %ecx
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %eax
; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %esi, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebp, %edi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: adcl %edi, %ebp
+; X32-NEXT: setb %bl
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebx
+; X32-NEXT: mull %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %esi, %ebp
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: addl %esi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: adcl %ebp, %ecx
; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: movl (%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %ebp, %edi
+; X32-NEXT: adcl %esi, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %edi, %ebp
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebp, %esi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %bl, %edi
+; X32-NEXT: adcl %edi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: addl %ebp, %edi
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: adcl $0, %edx
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl %edi, %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebp, %ecx
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: adcl %esi, %ecx
; X32-NEXT: setb %bl
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %edi
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: addl %edi, %ebp
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: adcl %ebx, %edi
; X32-NEXT: setb %bl
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, %ebp
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: addl %edi, %ecx
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %esi, %edi
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %ebx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: addl %esi, %eax
; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebx, %esi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: mull %ebp
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %bl
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebp
+; X32-NEXT: mull %ebx
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl %esi, %ebx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: addl %edi, %ebx
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl 32(%ebp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl 32(%ebx), %esi
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl 36(%ebp), %ebp
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 36(%eax), %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ebx
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl %ecx, %ebp
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: addl %ebx, %ecx
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 40(%eax), %ebp
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, %ebx
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl 40(%eax), %ebx
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %edi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 44(%eax), %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %esi, %ebx
+; X32-NEXT: movl 44(%eax), %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, %edi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %ebx, %ebp
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %ebp
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebp, %ecx
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %ebx, %ecx
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: adcl %esi, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, %ebp
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebx, %ecx
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %edi
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %esi
+; X32-NEXT: adcl %ebp, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebx, %esi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ebp
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %ecx
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %ebx, %ecx
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
; X32-NEXT: adcl %ebx, %eax
; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 48(%eax), %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl 48(%eax), %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %esi, %ecx
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, %ecx
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebp, %edi
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 52(%eax), %edx
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %edx
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ebp
; X32-NEXT: setb %bl
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edi, %esi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %ebp, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %eax, %esi
; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 60(%eax), %ecx
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %edi, %ebp
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ebx, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ebp, %ebx
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: setb %bl
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movzbl %bl, %esi
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: addl %ebx, %edi
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl %ebx, %edx
+; X32-NEXT: movl %edi, %edx
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %ebp, %ecx
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: adcl %edi, %esi
; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: adcl %edi, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebp, %edi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: addl %esi, %eax
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: addl %edi, %ebx
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: movl %ecx, %ebx
-; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: movl %ecx, %edi
+; X32-NEXT: adcl %eax, %edi
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %eax, %ebx
; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %edx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: addl %ebx, %ebp
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %edi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %ecx
+; X32-NEXT: adcl %ebp, %esi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ebx
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: addl %esi, %ebp
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill
+; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebp
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ebp, %ebx
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
-; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: adcl %ebp, %esi
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, %ebx
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebp, %esi
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebp, %esi
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: adcl %edi, %ecx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %ebp, %ecx
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movl %ebp, %esi
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
; X32-NEXT: adcl %ebx, %eax
; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %ebp
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, %ebp
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb %bl
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: adcl %ebx, %ecx
; X32-NEXT: setb %bl
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ecx, %esi
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
; X32-NEXT: adcl %ebp, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edi, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ebx, %ebp
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebp
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebp, %ecx
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
-; X32-NEXT: setb %bl
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movzbl %bl, %ecx
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: adcl $0, %edx
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl %ebx, %edx
+; X32-NEXT: movl %edi, %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %edi, %ecx
-; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: setb %bl
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %esi, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: adcl %ebx, %edi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %edi, %ebx
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ebx, %ebp
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %edi
-; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: addl %ebx, %ebp
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
-; X32-NEXT: movl %edi, %ebp
-; X32-NEXT: adcl %eax, %ebp
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl %eax, %ebp
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 64(%eax), %edi
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 68(%eax), %esi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %esi, %ecx
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl 68(%eax), %ecx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: adcl %edi, %ecx
; X32-NEXT: setb %bl
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: addl %esi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl (%esp), %esi # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ebp
-; X32-NEXT: setb %cl
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebp, %edi
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebp, %esi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 72(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl 72(%eax), %edi
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 76(%eax), %ecx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %ebp, %esi
+; X32-NEXT: movl 76(%eax), %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ebp, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %ebp
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebx, %esi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebp, %ecx
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %ebp, %esi
+; X32-NEXT: adcl %edi, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: setb %bl
; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %ebp, %esi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, %ebp
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ebp, %ebx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebx, %esi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: adcl %edi, %ecx
; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ebp
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movzbl %bl, %ebx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NEXT: adcl %ebx, %eax
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 80(%eax), %esi
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 84(%eax), %ecx
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ecx, %ebp
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: setb %bl
; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebx
+; X32-NEXT: mull %ebp
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %edi
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebp, %edi
-; X32-NEXT: setb %bl
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %edi, %ecx
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 88(%eax), %edi
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 92(%eax), %edi
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: adcl %ebp, %ecx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edi, %ebx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebp
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebx, %ecx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: movl %edi, %ecx
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: adcl %esi, %edi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: adcl $0, %edx
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl %ebx, %edx
+; X32-NEXT: movl %edi, %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %ebp, %ecx
-; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: adcl %esi, %ecx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ebx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %esi, %ebp
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: adcl %esi, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %esi, %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ebp, %esi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %ebp
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebp, %ecx
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %esi, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: adcl %eax, %edi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT: addl %ecx, %edx
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %eax, %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl (%esp), %esi # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %esi
; X32-NEXT: adcl %ebp, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %edi
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: imull %eax, %esi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %esi, %ecx
+; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %esi, %ebx
-; X32-NEXT: addl %edx, %ebx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: imull %ebp, %esi
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: imull %ecx, %edx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: imull %esi, %ebp
-; X32-NEXT: addl %edx, %ebp
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %edx, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl %ebx, %ebp
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: imull %edi, %esi
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %ebp, %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ebp
-; X32-NEXT: setb %bl
+; X32-NEXT: setb %cl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: mull %edi
; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: movzbl %cl, %ecx
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X32-NEXT: imull %ebx, %ebp
-; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: addl %edx, %ebp
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %ecx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: imull %edi, %edx
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: imull %ecx, %esi
-; X32-NEXT: addl %edx, %esi
-; X32-NEXT: mull %ecx
-; X32-NEXT: addl %edx, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebp, %esi
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: imull %edi, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ebp, %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %ecx
; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %edi
; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl %ebp, %ebx
; X32-NEXT: setb %cl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull (%esp) # 4-byte Folded Reload
+; X32-NEXT: mull %esi
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movzbl %cl, %ecx
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 104(%ecx), %ebp
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl 104(%esi), %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl 108(%ecx), %ecx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebx, %edi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl 108(%esi), %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: adcl %edi, %ecx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %ebx, %edi
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 96(%ecx), %esi
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl 96(%esi), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl 100(%ecx), %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl 100(%esi), %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebp, %ecx
+; X32-NEXT: adcl %ebp, %esi
; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, %esi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, %edi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, %edi
; X32-NEXT: adcl %ebx, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %ebx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebp, %esi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebx, %ecx
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: adcl %edi, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 112(%ecx), %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: imull %eax, %edi
+; X32-NEXT: movl 112(%ecx), %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: imull %edi, %esi
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %esi, %edx
; X32-NEXT: movl 116(%ecx), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: imull %eax, %ebx
-; X32-NEXT: addl %edi, %ebx
; X32-NEXT: addl %edx, %ebx
; X32-NEXT: movl 120(%ecx), %eax
-; X32-NEXT: movl %ecx, %edx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: imull %esi, %ecx
-; X32-NEXT: movl 124(%edx), %edi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: imull %ecx, %esi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: imull %ebp, %edi
-; X32-NEXT: addl %ecx, %edi
; X32-NEXT: mull %ebp
-; X32-NEXT: addl %edx, %edi
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl 124(%esi), %esi
+; X32-NEXT: imull %ebp, %esi
+; X32-NEXT: addl %edx, %esi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: adcl %ebx, %esi
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl %ebp, %ebx
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: adcl %ecx, %ebp
; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: mull %edi
; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movzbl %bl, %esi
-; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: adcl %esi, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: imull %eax, %ecx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: addl %edx, %ebx
+; X32-NEXT: imull %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %ebp
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: imull %esi, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %edi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: imull %edi, %ecx
-; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: imull %eax, %ecx
; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %ecx
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ebp, %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl %ebp, %ebx
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
; X32-NEXT: adcl %edi, %ebp
; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: movzbl %bl, %edi
; X32-NEXT: adcl %edi, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %ebx, (%esp) # 4-byte Folded Spill
+; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %edi, %eax
; X32-NEXT: addl %edi, %ebp
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: adcl %ebx, %esi
; X32-NEXT: setb %bl
; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, %ebp
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: addl %esi, %ecx
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %edi
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: addl %ebp, %ecx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %ebx
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %edi
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %edi
; X32-NEXT: addl %esi, %edi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 104(%eax), %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: adcl %ebp, %ecx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %esi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl (%esp), %edi # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: imull %eax, %edi
-; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: imull %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %esi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: imull %ebx, %ecx
-; X32-NEXT: addl %edi, %ecx
; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl %eax, %esi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: imull %edi, %edx
+; X32-NEXT: imull %edi, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: imull %ecx, %esi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: imull %ebp, %esi
; X32-NEXT: addl %edx, %esi
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %edx, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %ebp, %edi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %ebp
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT: mull %ebp
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movzbl %bl, %ecx
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl 124(%edi), %ecx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: imull %eax, %ecx
-; X32-NEXT: movl 120(%edi), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl 120(%ebx), %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: imull %ebp, %esi
-; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: imull (%esp), %esi # 4-byte Folded Reload
; X32-NEXT: addl %edx, %esi
-; X32-NEXT: movl 112(%edi), %ecx
-; X32-NEXT: movl 116(%edi), %edi
+; X32-NEXT: movl 124(%ebx), %eax
+; X32-NEXT: imull %ecx, %eax
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl 112(%ebx), %edi
+; X32-NEXT: movl 116(%ebx), %ebp
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: imull %edi, %edx
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: imull %ecx, %ebx
-; X32-NEXT: addl %edx, %ebx
-; X32-NEXT: mull %ecx
-; X32-NEXT: addl %edx, %ebx
-; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: imull %ebp, %ebx
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: imull %edi, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl (%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %esi, %edi
+; X32-NEXT: addl %ebp, %edi
; X32-NEXT: adcl %ebx, %ecx
; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebp
+; X32-NEXT: mull %esi
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movzbl %bl, %ecx
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: setb (%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl %ecx, %ebx
; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: imull %eax, %edi
-; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: imull %eax, %esi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: addl %esi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: imull %ebp, %ecx
-; X32-NEXT: addl %edi, %ecx
; X32-NEXT: addl %edx, %ecx
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %edi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: imull %esi, %edx
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %edi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: imull %ecx, %edi
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: imull %esi, %edi
; X32-NEXT: addl %edx, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %ebx
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ebp
+; X32-NEXT: movl %ebp, %esi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %edi, %ebp
; X32-NEXT: setb %cl
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movzbl %cl, %ecx
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: imull %esi, %ecx
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: addl %edx, %ebx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: imull %esi, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: imull %edi, %ecx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: imull %ebp, %ecx
-; X32-NEXT: addl %edx, %ecx
; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: imull %ebp, %ecx
; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %ebx, %ecx
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl %ebp, %edi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ebp, %ebx
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: addl %ebp, %ecx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %edi
; X32-NEXT: adcl %ecx, %esi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT: movl (%esp), %edx # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq 40(%rdi), %rbx
-; X64-NEXT: movq 32(%rdi), %r12
-; X64-NEXT: movq 56(%rdi), %r14
+; X64-NEXT: movq 32(%rdi), %r14
+; X64-NEXT: movq 56(%rdi), %r15
; X64-NEXT: movq 48(%rdi), %r10
; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq (%rsi), %r11
-; X64-NEXT: movq 8(%rsi), %rcx
-; X64-NEXT: movq %rsi, %r13
+; X64-NEXT: movq 8(%rsi), %r8
+; X64-NEXT: movq %rsi, %r12
; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rsi, %r9
-; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: addq %rcx, %r9
+; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %r9, %r8
-; X64-NEXT: adcq %rdi, %r10
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: adcq %rsi, %r10
; X64-NEXT: setb %al
; X64-NEXT: movzbl %al, %r9d
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %r10, %rsi
+; X64-NEXT: adcq %r9, %r13
; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r10, %rdi
-; X64-NEXT: adcq %r9, %r15
-; X64-NEXT: movq %r12, %rax
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rbx, %r14
+; X64-NEXT: movq %rbx, %r15
; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r11
; X64-NEXT: addq %r9, %r11
; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: addq %r11, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq %r10, %r9
; X64-NEXT: setb %r10b
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: addq %r9, %rbx
; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %r11
-; X64-NEXT: addq %rbp, %rbx
-; X64-NEXT: adcq %r8, %r11
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq 16(%r13), %rcx
+; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: adcq %rcx, %r11
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %r13
; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, %rbp
+; X64-NEXT: movq 16(%r12), %r8
+; X64-NEXT: movq %r14, %r10
; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r8, %r14
+; X64-NEXT: addq %rdi, %r14
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq 24(%r13), %r13
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r13
+; X64-NEXT: movq 24(%r12), %rbp
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: addq %r14, %rax
; X64-NEXT: movq %rax, %r14
; X64-NEXT: adcq %r9, %r12
; X64-NEXT: setb %r10b
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %r9
; X64-NEXT: addq %r12, %r9
; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %r8
-; X64-NEXT: addq %rbx, %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %rax, %rdi
+; X64-NEXT: addq %rbx, %rcx
+; X64-NEXT: movq %rcx, (%rsp) # 8-byte Spill
; X64-NEXT: adcq %r11, %r14
-; X64-NEXT: movq %r14, (%rsp) # 8-byte Spill
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: addq %rdi, %r9
-; X64-NEXT: adcq %r15, %r8
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: addq %rsi, %r9
+; X64-NEXT: adcq %r13, %rdi
; X64-NEXT: setb %r10b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rsi, %r11
-; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r13
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rcx, %r11
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %r11, %rax
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: adcq %rdi, %rsi
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: addq %rsi, %rax
-; X64-NEXT: movzbl %dil, %esi
-; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: addq %r9, %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r8, %r11
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movzbl %sil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: addq %r9, %r15
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %rdi, %r11
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %r10b, %esi
-; X64-NEXT: adcq %rsi, %rax
+; X64-NEXT: movzbl %r10b, %ecx
+; X64-NEXT: adcq %rcx, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq 16(%rcx), %r10
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT: mulq %r13
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq 16(%r8), %rsi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq 24(%rcx), %r14
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq 24(%r8), %r14
; X64-NEXT: movq %r14, %rax
; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %r13, %rbx
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rcx, %r11
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r8, %rsi
-; X64-NEXT: adcq %rdi, %r11
+; X64-NEXT: addq %r11, %rsi
+; X64-NEXT: adcq %rdi, %rbx
; X64-NEXT: setb %r10b
; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r13
; X64-NEXT: movq %r13, %r12
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %r11, %r8
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %rbx, %rdi
; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %rbp
-; X64-NEXT: movq (%rcx), %r13
+; X64-NEXT: adcq %rax, %rcx
+; X64-NEXT: movq (%r8), %r13
; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %rbx
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq 8(%rcx), %rdi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rbx
+; X64-NEXT: movq 8(%r8), %rax
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %r14
; X64-NEXT: addq %r11, %r14
; X64-NEXT: adcq $0, %rbx
; X64-NEXT: movq %r13, %rax
-; X64-NEXT: movq %r12, %rcx
+; X64-NEXT: movq %r12, %r11
; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: addq %r14, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq %rbx, %r12
; X64-NEXT: setb %r10b
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %r8, %rbp
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: addq %r12, %rbx
; X64-NEXT: adcq %rax, %r11
; X64-NEXT: addq %r9, %rbx
; X64-NEXT: adcq %rsi, %r11
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq %r13, %r10
; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %r13, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT: mulq %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %r14
; X64-NEXT: addq %rsi, %r14
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: addq %r14, %rax
; X64-NEXT: movq %rax, %r14
; X64-NEXT: adcq %r9, %r12
; X64-NEXT: setb %r10b
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: addq %r12, %rsi
; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: addq %r8, %rsi
-; X64-NEXT: adcq %rbp, %r9
+; X64-NEXT: addq %rdi, %rsi
+; X64-NEXT: adcq %rcx, %r9
; X64-NEXT: setb %r10b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rdi, %r11
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %r11, %rcx
-; X64-NEXT: adcq %r8, %rbx
+; X64-NEXT: addq %rbx, %rcx
+; X64-NEXT: adcq %r11, %r14
; X64-NEXT: setb %r11b
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: mulq %r15
+; X64-NEXT: addq %r14, %rax
; X64-NEXT: movzbl %r11b, %edi
; X64-NEXT: adcq %rdi, %rdx
-; X64-NEXT: addq %rsi, %r14
+; X64-NEXT: addq %rsi, %r12
; X64-NEXT: adcq %r9, %rcx
; X64-NEXT: movzbl %r10b, %esi
; X64-NEXT: adcq %rsi, %rax
; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT: adcq (%rsp), %rax # 8-byte Folded Reload
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq (%rsp), %rdx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: movq 32(%r10), %rcx
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq 32(%rcx), %rdi
+; X64-NEXT: movq %r8, %r10
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r12, %r8
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: movq %rbx, %r14
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %r11
; X64-NEXT: addq %rsi, %r11
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq 40(%r10), %rcx
-; X64-NEXT: movq %r10, %rdi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %r12
+; X64-NEXT: movq 40(%rcx), %rsi
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %r15
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: addq %r11, %rsi
; X64-NEXT: adcq %r9, %rbx
; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %r11
; X64-NEXT: addq %rbx, %r11
; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %r9
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r14
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r14
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r10
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %rbx, %r13
; X64-NEXT: adcq $0, %r14
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %r15, %rbx
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r14, %r15
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: adcq %r14, %r10
+; X64-NEXT: setb %r15b
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r15, %r14
-; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: addq %r10, %r14
+; X64-NEXT: movzbl %r15b, %eax
; X64-NEXT: adcq %rax, %rbx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: addq %r8, %r14
; X64-NEXT: adcq %rsi, %rbx
; X64-NEXT: adcq $0, %r11
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq 48(%rdi), %r12
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: movq 48(%rcx), %rcx
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %rsi, %r13
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq 56(%rdi), %rsi
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq 56(%r8), %rsi
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r13, %rdi
-; X64-NEXT: adcq %r15, %rcx
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: addq %r13, %r12
+; X64-NEXT: adcq %r10, %r15
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r8
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rcx, %r13
-; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: addq %r15, %r13
+; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: adcq %rax, %rsi
; X64-NEXT: addq %r14, %rbp
-; X64-NEXT: movq %rbp, %r10
+; X64-NEXT: movq %rbp, %r8
+; X64-NEXT: movq %r12, %rdi
; X64-NEXT: adcq %rbx, %rdi
; X64-NEXT: adcq $0, %r13
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: addq %r11, %r13
; X64-NEXT: adcq %r9, %rsi
-; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; X64-NEXT: setb %bpl
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r12
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %r9
+; X64-NEXT: addq %r9, %rbx
+; X64-NEXT: adcq $0, %r10
; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rbx, %r12
-; X64-NEXT: adcq %r9, %rcx
-; X64-NEXT: setb %r15b
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rbx, %r9
+; X64-NEXT: adcq %r10, %r15
+; X64-NEXT: setb %r10b
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: movzbl %r15b, %eax
+; X64-NEXT: addq %r15, %rbx
+; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %r14
-; X64-NEXT: addq %r13, %rbp
-; X64-NEXT: adcq %rsi, %r12
-; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT: addq %r13, %r12
+; X64-NEXT: adcq %rsi, %r9
+; X64-NEXT: movzbl %bpl, %eax
; X64-NEXT: adcq %rax, %rbx
; X64-NEXT: adcq $0, %r14
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: adcq %rax, (%rsp) # 8-byte Folded Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rbp
; X64-NEXT: adcq $0, %r12
+; X64-NEXT: adcq $0, %r9
; X64-NEXT: adcq $0, %rbx
; X64-NEXT: adcq $0, %r14
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rax, %rdi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rsi, %r13
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rcx, %r10
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rdi, %r12
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: setb %r10b
+; X64-NEXT: addq %r10, %r12
+; X64-NEXT: adcq %rsi, %r15
+; X64-NEXT: setb %r8b
; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %r15, %rsi
+; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: adcq %rax, %r9
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
; X64-NEXT: movq %rbp, %rax
; X64-NEXT: mulq %r13
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rcx, %r13
+; X64-NEXT: addq %r10, %r13
; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r10, %r11
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %r8, %r11
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %rcx
-; X64-NEXT: setb %r10b
+; X64-NEXT: adcq %r15, %r10
+; X64-NEXT: setb %r8b
; X64-NEXT: movq %rbp, %rax
; X64-NEXT: movq %rbp, %r15
-; X64-NEXT: mulq %r8
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: addq %r10, %rbp
+; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: adcq %rax, %r13
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: addq %rdi, %rbp
; X64-NEXT: adcq %r12, %r13
-; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: adcq $0, %rsi
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %r11, %r10
+; X64-NEXT: movq %r11, %r8
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %r11
; X64-NEXT: movq %r15, %rax
; X64-NEXT: movq %r15, %r12
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rcx, %r15
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: addq %rdi, %r15
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: addq %r15, %rax
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: adcq %r8, %rcx
+; X64-NEXT: adcq %r10, %r8
; X64-NEXT: setb %r10b
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rcx, %r12
+; X64-NEXT: addq %r8, %r12
; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %r8
+; X64-NEXT: adcq %rax, %rdi
; X64-NEXT: addq %rbp, %r11
; X64-NEXT: adcq %r13, %r15
; X64-NEXT: movq %r15, %rbp
; X64-NEXT: adcq $0, %r12
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: addq %rdi, %r12
-; X64-NEXT: adcq %r9, %r8
-; X64-NEXT: setb %r9b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r10
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: addq %rsi, %r12
+; X64-NEXT: adcq %r9, %rdi
+; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %rcx, %rsi
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: movq %r13, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rcx, %r8
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT: mulq %r13
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %r8, %rax
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: adcq %rsi, %rcx
; X64-NEXT: setb %sil
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r13
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: mulq %r9
; X64-NEXT: addq %rcx, %rax
; X64-NEXT: movzbl %sil, %ecx
; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: addq %r12, %r10
-; X64-NEXT: adcq %r8, %rdi
-; X64-NEXT: movzbl %r9b, %ecx
+; X64-NEXT: adcq %rdi, %r8
+; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; X64-NEXT: adcq %rcx, %rax
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; X64-NEXT: adcq %rax, %r10
; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rcx
; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: movq 64(%r9), %rcx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: movq 64(%r10), %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r11
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r15
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq 72(%r10), %rcx
-; X64-NEXT: movq %r10, %rsi
-; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rcx, %r15
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq 72(%r9), %rsi
+; X64-NEXT: movq %r9, %rcx
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %r13
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r9, %rbx
-; X64-NEXT: adcq %r8, %rcx
-; X64-NEXT: setb %r10b
+; X64-NEXT: addq %r8, %rbx
+; X64-NEXT: adcq %rdi, %r10
+; X64-NEXT: setb %r8b
; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r13
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: addq %r10, %r9
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: adcq %rax, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %r15, %rdi
; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r8, %r14
+; X64-NEXT: adcq $0, %r10
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rcx, %r15
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %r12
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %r15, %rax
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: addq %r14, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r14, %rcx
+; X64-NEXT: adcq %r10, %r8
; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r13
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r15, %r13
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
+; X64-NEXT: addq %r8, %rbp
; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %r14
; X64-NEXT: addq %r11, %rbp
; X64-NEXT: adcq %rbx, %r14
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rsi, %r10
-; X64-NEXT: movq 80(%rsi), %r15
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: movq 80(%rcx), %r15
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %r13, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq 88(%r10), %r10
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: adcq %r11, %rcx
-; X64-NEXT: setb %r11b
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r10
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %r8, %r11
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq 88(%rbx), %rbx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: addq %r11, %rax
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: adcq %r10, %r8
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rcx, %r13
-; X64-NEXT: movzbl %r11b, %eax
+; X64-NEXT: addq %r8, %r13
+; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %rbp, %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r14, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq %rbp, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r14, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %r13
; X64-NEXT: adcq $0, %r12
; X64-NEXT: addq %r9, %r13
-; X64-NEXT: adcq %rdi, %r12
-; X64-NEXT: setb %r9b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: adcq %rsi, %r12
+; X64-NEXT: setb %bpl
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rdi, %r10
; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: adcq %r8, %rcx
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: addq %r10, %rax
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: adcq %r8, %rdi
; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: addq %rdi, %rax
; X64-NEXT: movzbl %r8b, %ecx
; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %r13, %rdi
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r12, %rsi
+; X64-NEXT: addq %r13, %rsi
; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %r9b, %ecx
+; X64-NEXT: adcq %r12, %r9
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movzbl %bpl, %ecx
; X64-NEXT: adcq %rcx, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: imulq %rax, %r10
-; X64-NEXT: movq %rax, %r9
+; X64-NEXT: imulq %rax, %rbx
+; X64-NEXT: movq %rax, %r12
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: imulq %rsi, %r15
-; X64-NEXT: addq %r10, %r15
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rbx, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: imulq %rcx, %r15
; X64-NEXT: addq %rdx, %r15
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: imulq %r14, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %r10, %rdx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: imulq %rbx, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: imulq %rdi, %r10
-; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: addq %rcx, %r8
-; X64-NEXT: adcq %r15, %r10
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: imulq %rsi, %rbx
+; X64-NEXT: addq %rdx, %rbx
+; X64-NEXT: addq %r8, %rdi
+; X64-NEXT: adcq %r15, %rbx
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r15, %r13
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r13, %r15
-; X64-NEXT: adcq %r12, %rbp
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %rsi
+; X64-NEXT: addq %r8, %r15
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %r15, %r11
+; X64-NEXT: adcq %r10, %r8
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rbp, %r12
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r8, %r15
+; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %rsi
-; X64-NEXT: addq %r8, %r12
-; X64-NEXT: adcq %r10, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq 120(%rdx), %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: movq 112(%rdx), %r9
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: mulq %r9
+; X64-NEXT: addq %rdi, %r15
+; X64-NEXT: adcq %rbx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: movq 112(%r9), %rbx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: imulq %rbx, %r9
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: addq %rdx, %r9
-; X64-NEXT: movq 96(%rdi), %rbp
-; X64-NEXT: movq 104(%rdi), %rdi
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: imulq %rcx, %rbx
+; X64-NEXT: addq %rdx, %rbx
+; X64-NEXT: movq 120(%r9), %rax
; X64-NEXT: imulq %rdi, %rax
-; X64-NEXT: imulq %rbp, %r11
-; X64-NEXT: addq %rax, %r11
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdi, %rbp
+; X64-NEXT: addq %rax, %rbx
+; X64-NEXT: movq 96(%r9), %r10
+; X64-NEXT: movq 104(%r9), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: imulq %rdi, %r12
+; X64-NEXT: mulq %r10
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rdx, %r11
+; X64-NEXT: addq %r12, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: imulq %r10, %r14
+; X64-NEXT: addq %rdx, %r14
; X64-NEXT: addq %r8, %r13
-; X64-NEXT: adcq %r9, %r11
-; X64-NEXT: movq %r11, %r14
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r10
+; X64-NEXT: adcq %rbx, %r14
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %r8, %r9
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %r9, %r8
-; X64-NEXT: adcq %rcx, %rbp
-; X64-NEXT: setb %cl
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: addq %r8, %r12
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r12, %rbx
+; X64-NEXT: adcq %rbp, %r10
+; X64-NEXT: setb %r8b
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: mulq %rcx
+; X64-NEXT: addq %r10, %rax
+; X64-NEXT: movzbl %r8b, %edi
+; X64-NEXT: adcq %rdi, %rdx
; X64-NEXT: addq %r13, %rax
; X64-NEXT: adcq %r14, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT: adcq %r15, %r8
-; X64-NEXT: adcq %r12, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT: adcq %r11, %rbx
+; X64-NEXT: adcq %r15, %rax
; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq 80(%r8), %r9
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: movq 80(%r13), %r8
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %r10
+; X64-NEXT: movq %rax, %rbx
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 88(%r8), %rbx
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 88(%r13), %r11
+; X64-NEXT: movq %r13, %r10
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r11
+; X64-NEXT: movq %rsi, %r9
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rdi
; X64-NEXT: addq %rcx, %rdi
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r14
; X64-NEXT: addq %rdi, %r14
; X64-NEXT: adcq %rsi, %rcx
; X64-NEXT: setb %dil
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %r8, %r11
+; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: addq %rcx, %rsi
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %rbx
-; X64-NEXT: movq 64(%r8), %rdi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r11
+; X64-NEXT: adcq %rax, %r13
+; X64-NEXT: movq %r10, %rdi
+; X64-NEXT: movq 64(%r10), %r10
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 72(%r8), %rax
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: mulq %r11
+; X64-NEXT: movq 72(%rdi), %rax
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r12
; X64-NEXT: addq %rcx, %r12
; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r8
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r11, %r9
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %r12, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq %r15, %rcx
; X64-NEXT: setb %dil
-; X64-NEXT: movq %r13, %r11
-; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movq %r8, %r11
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %rcx, %rbp
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %r10, %rbp
+; X64-NEXT: addq %rbx, %rbp
; X64-NEXT: adcq %r14, %r12
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT: mulq %r13
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq %r10, %rdi
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r9
; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r11, %rbx
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r13
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r14
; X64-NEXT: addq %rcx, %r14
; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %r14, %r8
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: movq %rax, %r11
; X64-NEXT: adcq %r10, %rcx
; X64-NEXT: setb %dil
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r14
; X64-NEXT: adcq %rax, %r10
; X64-NEXT: addq %rbp, %r9
; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r12, %r8
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r12, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %r14
; X64-NEXT: adcq $0, %r10
; X64-NEXT: addq %rsi, %r14
-; X64-NEXT: adcq %rbx, %r10
+; X64-NEXT: adcq %r13, %r10
; X64-NEXT: setb %dil
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r13
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r8
+; X64-NEXT: movq %rax, %r12
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r13
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r9
; X64-NEXT: addq %rcx, %r9
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %r9, %rax
; X64-NEXT: addq %rcx, %rax
; X64-NEXT: movzbl %sil, %ecx
; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %r14, %r8
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq %r14, %r12
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq %r10, %r9
; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movzbl %dil, %ecx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq 96(%r8), %rcx
-; X64-NEXT: imulq %rcx, %r15
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r13, %r9
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq 104(%r8), %rdi
-; X64-NEXT: imulq %rdi, %r9
-; X64-NEXT: addq %r15, %r9
-; X64-NEXT: addq %rdx, %r9
-; X64-NEXT: movq 112(%r8), %rax
-; X64-NEXT: movq %rax, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: imulq %r11, %rdx
-; X64-NEXT: movq 120(%r8), %r8
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: imulq %r10, %r8
-; X64-NEXT: addq %rdx, %r8
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rdx, %r8
-; X64-NEXT: addq %rsi, %r13
-; X64-NEXT: adcq %r9, %r8
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r10, %r9
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq 96(%rdi), %rsi
+; X64-NEXT: imulq %rsi, %r15
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %r8, %rcx
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %r15, %rdx
+; X64-NEXT: movq 104(%rdi), %r9
+; X64-NEXT: imulq %r9, %rcx
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq 112(%rdi), %rax
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: imulq %r12, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: movq 120(%rdi), %rdi
+; X64-NEXT: imulq %r15, %rdi
+; X64-NEXT: addq %rdx, %rdi
+; X64-NEXT: addq %r10, %r8
+; X64-NEXT: adcq %r14, %rdi
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r10, %r13
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rsi, %r10
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %r10, %r12
-; X64-NEXT: adcq %rcx, %rsi
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %r13, %rbp
+; X64-NEXT: adcq %r14, %rcx
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rsi, %r14
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: addq %rcx, %r14
+; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: addq %r13, %r14
-; X64-NEXT: adcq %r8, %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: imulq %rax, %rsi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %r8, %r14
+; X64-NEXT: adcq %rdi, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT: imulq %r15, %rcx
-; X64-NEXT: addq %rsi, %rcx
-; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: movq %rcx, %r8
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: imulq %r15, %rdi
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: imulq %rsi, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: imulq %rdi, %rbx
-; X64-NEXT: addq %rax, %rbx
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rdi
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %rdi, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: imulq %r12, %rsi
+; X64-NEXT: addq %rdx, %rsi
+; X64-NEXT: movq %rsi, %r8
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r11, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: imulq %r9, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: imulq %r11, %rbx
; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: addq %rcx, %r13
; X64-NEXT: adcq %r8, %rbx
-; X64-NEXT: movq %rbx, %rbp
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r9
-; X64-NEXT: mulq %r11
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r8, %rdi
-; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %rax, %rcx
; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %r8, %rsi
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rdi, %r11
-; X64-NEXT: adcq %rsi, %r8
+; X64-NEXT: addq %rsi, %r11
+; X64-NEXT: adcq %rdi, %r8
; X64-NEXT: setb %sil
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r12
; X64-NEXT: addq %r8, %rax
; X64-NEXT: movzbl %sil, %esi
; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rbp, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: adcq %r12, %r11
+; X64-NEXT: addq %r13, %rax
+; X64-NEXT: adcq %rbx, %rdx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq %rbp, %r11
; X64-NEXT: adcq %r14, %rax
; X64-NEXT: adcq %r10, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
; X64-NEXT: movq %rsi, %r8
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT: movq %rdi, %r9
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, (%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 8(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 16(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 24(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 32(%rcx)
-; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 40(%rcx)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 48(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 56(%rcx)
-; X64-NEXT: movq %rdi, 64(%rcx)
-; X64-NEXT: movq %r8, 72(%rcx)
-; X64-NEXT: movq %r9, 80(%rcx)
-; X64-NEXT: movq %r10, 88(%rcx)
-; X64-NEXT: movq %r13, 96(%rcx)
-; X64-NEXT: movq %r11, 104(%rcx)
-; X64-NEXT: movq %rax, 112(%rcx)
-; X64-NEXT: movq %rdx, 120(%rcx)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, (%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 8(%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 16(%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 24(%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 32(%rsi)
+; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 40(%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 48(%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 56(%rsi)
+; X64-NEXT: movq %r8, 64(%rsi)
+; X64-NEXT: movq %r9, 72(%rsi)
+; X64-NEXT: movq %r10, 80(%rsi)
+; X64-NEXT: movq %rbx, 88(%rsi)
+; X64-NEXT: movq %rcx, 96(%rsi)
+; X64-NEXT: movq %r11, 104(%rsi)
+; X64-NEXT: movq %rax, 112(%rsi)
+; X64-NEXT: movq %rdx, 120(%rsi)
; X64-NEXT: addq $240, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 12(%ecx), %ebp
-; X32-NEXT: movl 8(%ecx), %esi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl (%eax), %ebx
+; X32-NEXT: movl 12(%ecx), %esi
+; X32-NEXT: movl 8(%ecx), %ebx
; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl (%eax), %edi
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %ecx
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 4(%eax), %ecx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl 4(%eax), %ebp
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %bl
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl (%edi), %esi
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl (%edi), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 4(%edi), %edi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl 4(%edi), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: setb %bl
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl %esi, %ebp
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 8(%eax), %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl 8(%eax), %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %edi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 12(%eax), %esi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl 12(%eax), %ecx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebx, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %esi, %ebx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl (%esp), %edi # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebx, %ecx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: adcl %ebx, %esi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl 16(%ecx), %edi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: imull %edi, %ebx
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: imull %edi, %esi
; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl (%esp), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl 20(%ecx), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl 20(%ecx), %ebp
-; X32-NEXT: imull %ebp, %esi
-; X32-NEXT: addl %ebx, %esi
-; X32-NEXT: addl %edx, %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT: imull %eax, %ebp
+; X32-NEXT: addl %edx, %ebp
; X32-NEXT: movl 24(%ecx), %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: imull %esi, %edx
-; X32-NEXT: movl 28(%ecx), %ecx
+; X32-NEXT: movl %eax, %ecx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 28(%ecx), %ecx
+; X32-NEXT: imull %esi, %ecx
; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: mull %ebx
-; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebp, %ecx
; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %ebp, %ebx
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %edi
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movzbl %bl, %esi
; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl 28(%edi), %ecx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: imull %ebp, %ecx
-; X32-NEXT: movl 24(%edi), %esi
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl 24(%edi), %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: addl %edx, %esi
-; X32-NEXT: movl 16(%edi), %ecx
+; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl 28(%edi), %eax
+; X32-NEXT: imull %esi, %eax
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl 16(%edi), %ebp
; X32-NEXT: movl 20(%edi), %ebx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: imull %ebx, %edx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: imull %ebx, %edi
; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: imull %ecx, %edi
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: mull %ecx
-; X32-NEXT: addl %edx, %edi
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: imull %ebp, %esi
+; X32-NEXT: addl %edx, %esi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %edi, %esi
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
-; X32-NEXT: adcl %ebx, %edi
-; X32-NEXT: setb %bl
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movzbl %bl, %esi
-; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: addl (%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, %ebx
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, (%esi)
+; X32-NEXT: movl %edi, (%ecx)
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 4(%esi)
+; X32-NEXT: movl %edi, 4(%ecx)
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 8(%esi)
+; X32-NEXT: movl %edi, 8(%ecx)
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 12(%esi)
-; X32-NEXT: movl %ebx, 16(%esi)
-; X32-NEXT: movl %ecx, 20(%esi)
-; X32-NEXT: movl %eax, 24(%esi)
-; X32-NEXT: movl %edx, 28(%esi)
+; X32-NEXT: movl %edi, 12(%ecx)
+; X32-NEXT: movl %ebx, 16(%ecx)
+; X32-NEXT: movl %esi, 20(%ecx)
+; X32-NEXT: movl %eax, 24(%ecx)
+; X32-NEXT: movl %edx, 28(%ecx)
; X32-NEXT: addl $72, %esp
; X32-NEXT: .cfi_def_cfa_offset 20
; X32-NEXT: popl %esi
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: pushq %r14
; X64-NEXT: .cfi_def_cfa_offset 24
-; X64-NEXT: pushq %r12
-; X64-NEXT: .cfi_def_cfa_offset 32
; X64-NEXT: pushq %rbx
-; X64-NEXT: .cfi_def_cfa_offset 40
-; X64-NEXT: .cfi_offset %rbx, -40
-; X64-NEXT: .cfi_offset %r12, -32
+; X64-NEXT: .cfi_def_cfa_offset 32
+; X64-NEXT: .cfi_offset %rbx, -32
; X64-NEXT: .cfi_offset %r14, -24
; X64-NEXT: .cfi_offset %r15, -16
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: mulq %r10
; X64-NEXT: movq %rax, %rdi
; X64-NEXT: imulq %r14, %r10
-; X64-NEXT: addq %r15, %r10
; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: movq %r8, %r12
-; X64-NEXT: imulq %r11, %r12
+; X64-NEXT: addq %r15, %r10
+; X64-NEXT: movq %r8, %r15
+; X64-NEXT: imulq %r11, %r15
; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %r15, %rdx
; X64-NEXT: movq 24(%rsi), %r15
; X64-NEXT: imulq %rbx, %r15
-; X64-NEXT: addq %r12, %r15
; X64-NEXT: addq %rdx, %r15
; X64-NEXT: addq %rdi, %r8
; X64-NEXT: adcq %r10, %r15
; X64-NEXT: movq %rax, 16(%rcx)
; X64-NEXT: movq %rdx, 24(%rcx)
; X64-NEXT: popq %rbx
-; X64-NEXT: .cfi_def_cfa_offset 32
-; X64-NEXT: popq %r12
; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: popq %r14
; X64-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %ecx, %ebp
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 16(%ecx), %ebx
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl 16(%ecx), %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl 20(%ecx), %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl 20(%ecx), %ebx
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: addl %esi, %ecx
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl %ebp, %esi
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %edi, %ebp
; X32-NEXT: setb %cl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebp, %esi
-; X32-NEXT: movzbl %cl, %eax
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 8(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl 8(%edi), %ebp
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ecx
+; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 12(%eax), %ecx
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl 12(%edi), %ecx
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %edi
; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl %eax, %esi
; X32-NEXT: adcl %ebp, %edi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: addl %edi, %ecx
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: addl %esi, %eax
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: addl %ecx, (%esp) # 4-byte Folded Spill
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 8(%ecx), %edi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl 8(%ecx), %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebp, %edi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edi, %ebx
+; X32-NEXT: mull %ebx
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
; X32-NEXT: movl 4(%ecx), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: movl %ebx, %esi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebp, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %eax, %esi
; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %edi, %ebp
+; X32-NEXT: adcl %ebp, %edi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %edi, %ebp
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ebp
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %ecx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: adcl %ecx, %esi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %edi, %ecx
-; X32-NEXT: movl %ebp, %edx
+; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
; X32-NEXT: adcl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 16(%eax), %ebp
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl 16(%eax), %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 20(%eax), %ecx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ebx
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %esi, %ebp
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebp, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: adcl %esi, %ebp
; X32-NEXT: setb %bl
; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: addl %ebp, %ecx
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 24(%eax), %esi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl 24(%eax), %ebx
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, %edi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 28(%eax), %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl 28(%eax), %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %ebx
+; X32-NEXT: adcl %ebp, %esi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %ebp
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebp, %ecx
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ebp
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: addl %esi, %eax
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl %edi, %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebx, %ecx
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: adcl %esi, %edi
; X32-NEXT: setb %bl
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: addl %edi, %esi
; X32-NEXT: movzbl %bl, %eax
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: adcl %ebx, %ecx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %ebx
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebx, %ecx
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: adcl %esi, %ebx
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %esi, %ebx
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: adcl %eax, %esi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT: addl %edi, %edx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: addl %edi, %ecx
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %eax, %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %ebp
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 32(%eax), %edi
; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: mull %edi
; X32-NEXT: movl 36(%eax), %ecx
; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %ebx, %eax
; X32-NEXT: addl %ecx, %ebp
; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: addl %esi, %edi
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 40(%eax), %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %esi
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl (%esp), %edi # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: imull %eax, %edi
-; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: imull %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: mull %ecx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %esi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: imull %ebx, %ecx
-; X32-NEXT: addl %edi, %ecx
; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
+; X32-NEXT: imull %edi, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: imull %esi, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: imull %ebp, %esi
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: imull %ecx, %edi
-; X32-NEXT: addl %edx, %edi
; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebp
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %ebp, %edi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %ebp
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebp
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movzbl %bl, %ecx
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl 60(%edi), %ecx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: imull %eax, %ecx
-; X32-NEXT: movl 56(%edi), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl 56(%ebx), %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: imull %ebp, %esi
-; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X32-NEXT: addl %edx, %esi
-; X32-NEXT: movl 48(%edi), %ecx
-; X32-NEXT: movl 52(%edi), %ebx
+; X32-NEXT: movl 60(%ebx), %eax
+; X32-NEXT: imull %ecx, %eax
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl 48(%ebx), %edi
+; X32-NEXT: movl 52(%ebx), %ebp
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: imull %ebx, %edx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: imull %ecx, %edi
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: mull %ecx
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: imull %ebp, %ebx
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: imull %edi, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl (%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebp, %edi
; X32-NEXT: adcl %ebx, %ecx
; X32-NEXT: setb %bl
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT: mull %esi
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movzbl %bl, %ecx
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edi, %eax
; X32-NEXT: mull %ebp
; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl 32(%ecx), %esi
; X32-NEXT: movl %esi, %eax
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ebx
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, %ebp
; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: setb (%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
; X32-NEXT: addl %esi, %eax
; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %edi, (%esp) # 4-byte Folded Spill
+; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-NEXT: adcl %ecx, %ebp
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill
; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %esi
; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 48(%ecx), %ebp
+; X32-NEXT: movl 48(%ecx), %edi
; X32-NEXT: movl %ebx, %esi
-; X32-NEXT: imull %ebp, %esi
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: imull %edi, %esi
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: mull %ebx
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %esi, %edx
; X32-NEXT: movl 52(%ecx), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: imull %eax, %ebx
-; X32-NEXT: addl %esi, %ebx
; X32-NEXT: addl %edx, %ebx
; X32-NEXT: movl 56(%ecx), %eax
-; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: imull %ebp, %esi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: imull %ecx, %edx
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl 60(%esi), %esi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: imull %edi, %esi
-; X32-NEXT: addl %edx, %esi
-; X32-NEXT: mull %edi
+; X32-NEXT: imull %ecx, %esi
; X32-NEXT: addl %edx, %esi
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: adcl %esi, %edx
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: imull %eax, %edi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: addl %edi, %ecx
-; X32-NEXT: addl %edx, %ecx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: imull %esi, %edx
+; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: movl %eax, %esi
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: imull %edi, %ebp
-; X32-NEXT: addl %edx, %ebp
; X32-NEXT: mull %edi
-; X32-NEXT: addl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %ebp
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, %ebx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: imull %ebp, %edi
+; X32-NEXT: addl %edx, %edi
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: imull %edi, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ecx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: addl %ebx, %ecx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X32-NEXT: mull %ebp
; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %edi
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: pushq %rax
+; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq (%rdi), %r14
+; X64-NEXT: movq (%rdi), %rbx
; X64-NEXT: movq 8(%rdi), %r9
-; X64-NEXT: movq 24(%rdi), %r15
-; X64-NEXT: movq 16(%rdi), %rax
-; X64-NEXT: movq (%rsi), %rdi
-; X64-NEXT: movq 8(%rsi), %rbx
-; X64-NEXT: movq %rsi, %r12
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq 24(%rdi), %r12
+; X64-NEXT: movq 16(%rdi), %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq (%rsi), %rcx
+; X64-NEXT: movq 8(%rsi), %r11
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %rcx, %rsi
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %r15
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, %r10
; X64-NEXT: addq %rcx, %r10
; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r10, %r11
-; X64-NEXT: adcq %r8, %rcx
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %r10, %rcx
+; X64-NEXT: adcq %r8, %r14
; X64-NEXT: setb %al
; X64-NEXT: movzbl %al, %esi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rcx, %r10
+; X64-NEXT: addq %r14, %r10
; X64-NEXT: adcq %rsi, %r13
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rcx, %r15
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %r8
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rbx
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r14, %r15
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: addq %r15, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r14, %rbp
+; X64-NEXT: adcq %rbp, %rbx
; X64-NEXT: setb %sil
-; X64-NEXT: movq %r9, %rdi
; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rbx
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rbp, %rcx
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %rbx, %rbp
; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: adcq %rax, %r14
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: adcq %r11, %r14
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: adcq %rcx, %r14
; X64-NEXT: adcq $0, %r10
; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq %r12, %r9
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq 16(%r12), %rsi
-; X64-NEXT: movq %r8, %rbx
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: movq %rdi, %rsi
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 16(%rdi), %r8
+; X64-NEXT: movq %r12, %r11
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r9, %r12
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %rcx, %r15
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq 24(%rsi), %rsi
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r11, %rbp
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq 24(%r9), %rdi
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rbp, %rbx
-; X64-NEXT: adcq %r15, %r9
-; X64-NEXT: setb %bpl
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %r15, %r11
+; X64-NEXT: adcq %rbx, %r9
+; X64-NEXT: setb %bl
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rdi
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r9, %r11
-; X64-NEXT: movzbl %bpl, %eax
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: movzbl %bl, %eax
; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %rcx, %r8
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r14, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r11
+; X64-NEXT: addq %rbp, %rdi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r14, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %rcx
; X64-NEXT: adcq $0, %r15
-; X64-NEXT: addq %r10, %r11
+; X64-NEXT: addq %r10, %rcx
; X64-NEXT: adcq %r13, %r15
-; X64-NEXT: setb %bpl
+; X64-NEXT: setb %r12b
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r11
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT: movq %r13, %rax
+; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: adcq %r9, %rbp
+; X64-NEXT: setb %dil
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: adcq %r8, %rcx
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %rdx
-; X64-NEXT: addq %r11, %rbx
+; X64-NEXT: mulq %rsi
+; X64-NEXT: addq %rbp, %rax
+; X64-NEXT: movzbl %dil, %edi
+; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: addq %rcx, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r15, %rbx
; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %r9
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %bpl, %eax
-; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: movzbl %r12b, %ecx
+; X64-NEXT: adcq %rcx, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq 32(%rcx), %r11
-; X64-NEXT: imulq %r11, %rdi
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rsi
+; X64-NEXT: movq 32(%rcx), %r15
+; X64-NEXT: imulq %r15, %rsi
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq 40(%rcx), %r8
-; X64-NEXT: imulq %r8, %rsi
-; X64-NEXT: addq %rdi, %rsi
-; X64-NEXT: addq %rdx, %rsi
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: movq 40(%rcx), %rsi
+; X64-NEXT: imulq %rsi, %r8
+; X64-NEXT: addq %rdx, %r8
; X64-NEXT: movq 48(%rcx), %rax
-; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: imulq %r10, %rcx
-; X64-NEXT: movq 56(%rdx), %rbp
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: imulq %rdi, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rcx, %r11
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: imulq %r14, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rdx, %rbp
+; X64-NEXT: addq %rdi, %rdx
+; X64-NEXT: movq 56(%r11), %r11
+; X64-NEXT: imulq %rbx, %r11
+; X64-NEXT: addq %rdx, %r11
; X64-NEXT: addq %r9, %rcx
-; X64-NEXT: adcq %rsi, %rbp
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %rsi
-; X64-NEXT: mulq %r11
+; X64-NEXT: adcq %r8, %r11
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %rbx, %r8
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r11
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rdi, %r11
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rdi, %rbx
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r11, %rbx
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %rbx, %r13
; X64-NEXT: adcq %r9, %r15
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: setb %dil
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r8
; X64-NEXT: addq %r15, %r8
-; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: adcq %rax, %r12
; X64-NEXT: addq %rcx, %r8
-; X64-NEXT: adcq %rbp, %r12
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq 56(%rcx), %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: imulq %rax, %rsi
-; X64-NEXT: movq 48(%rcx), %r11
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: imulq %rcx, %r11
-; X64-NEXT: addq %rsi, %r11
-; X64-NEXT: addq %rdx, %r11
-; X64-NEXT: movq 32(%rdi), %r9
-; X64-NEXT: movq 40(%rdi), %rdi
+; X64-NEXT: adcq %r11, %r12
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: movq 48(%r9), %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: imulq %r14, %rsi
+; X64-NEXT: addq %rdx, %rsi
+; X64-NEXT: movq %r9, %rdx
+; X64-NEXT: movq 56(%r9), %rax
+; X64-NEXT: imulq %rdi, %rax
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: addq %rax, %rsi
+; X64-NEXT: movq 32(%r9), %r9
+; X64-NEXT: movq 40(%rdx), %r15
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movq %rax, %rdx
-; X64-NEXT: imulq %rdi, %rdx
-; X64-NEXT: imulq %r9, %r13
-; X64-NEXT: addq %rdx, %r13
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: imulq %r15, %r11
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rdx, %r13
-; X64-NEXT: addq %r10, %r15
-; X64-NEXT: adcq %r11, %r13
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %r11, %rdx
+; X64-NEXT: imulq %r9, %r10
+; X64-NEXT: addq %rdx, %r10
+; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: adcq %rsi, %r10
; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rbp
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r10, %r11
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rsi, %rbx
; X64-NEXT: adcq $0, %rbp
; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %r10
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %r11, %rcx
-; X64-NEXT: adcq %rbp, %r9
-; X64-NEXT: setb %r11b
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: movzbl %r11b, %edi
-; X64-NEXT: adcq %rdi, %rdx
-; X64-NEXT: addq %r15, %rax
-; X64-NEXT: adcq %r13, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq %rbx, %rcx
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rbx, %r9
+; X64-NEXT: adcq %rbp, %rsi
+; X64-NEXT: setb %bl
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r14
+; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: movzbl %bl, %esi
+; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: adcq %r10, %rdx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq %r13, %r9
; X64-NEXT: adcq %r8, %rax
; X64-NEXT: adcq %r12, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: adcq %r14, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, (%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 8(%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 16(%rsi)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, (%rdi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, 8(%rdi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, 16(%rdi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, 24(%rdi)
-; X64-NEXT: movq %rsi, 32(%rdi)
-; X64-NEXT: movq %rcx, 40(%rdi)
-; X64-NEXT: movq %rax, 48(%rdi)
-; X64-NEXT: movq %rdx, 56(%rdi)
+; X64-NEXT: movq %rdi, 24(%rsi)
+; X64-NEXT: movq %rcx, 32(%rsi)
+; X64-NEXT: movq %r9, 40(%rsi)
+; X64-NEXT: movq %rax, 48(%rsi)
+; X64-NEXT: movq %rdx, 56(%rsi)
+; X64-NEXT: addq $8, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: imulq %rdi, %rcx
; X64-NEXT: mulq %rdx
+; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: imulq %rsi, %r8
-; X64-NEXT: addq %rcx, %r8
; X64-NEXT: addq %r8, %rdx
; X64-NEXT: retq
;
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: .cfi_def_cfa_offset 28
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: .cfi_def_cfa_offset 32
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: imull %ecx, %ebp
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: imull %ebp, %eax
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: imull %ebp, %esi
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: imull %esi, %ecx
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: mull %edi
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebp, %edi
-; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl %bl, %esi
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movzbl %bl, %edi
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %edi, 4(%ecx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: movl %eax, 8(%ecx)
-; X86-NEXT: movl %edx, 12(%ecx)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: addl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%esi)
+; X86-NEXT: movl %eax, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $12, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %esi
; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl %ecx, %edx
; X32-NEXT: imull {{[0-9]+}}(%esp), %esi
-; X32-NEXT: addl %ecx, %esi
; X32-NEXT: addl %esi, %edx
; X32-NEXT: popl %esi
; X32-NEXT: retl
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: .cfi_offset %r14, -16
-; CHECK-NEXT: movq %rdx, %r10
-; CHECK-NEXT: movq %rdi, %r9
+; CHECK-NEXT: movq %rdx, %r11
+; CHECK-NEXT: movq %rdi, %r10
; CHECK-NEXT: movq %rsi, %rdi
; CHECK-NEXT: sarq $63, %rdi
-; CHECK-NEXT: movq %rcx, %r11
-; CHECK-NEXT: imulq %rdi, %r11
+; CHECK-NEXT: movq %rcx, %r8
+; CHECK-NEXT: imulq %rdi, %r8
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: mulq %rdi
-; CHECK-NEXT: movq %rax, %rdi
-; CHECK-NEXT: addq %rax, %r11
-; CHECK-NEXT: addq %rdx, %r11
+; CHECK-NEXT: movq %rdx, %rdi
+; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rdi
+; CHECK-NEXT: addq %r8, %rdi
; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: sarq $63, %rax
; CHECK-NEXT: movq %rax, %r14
; CHECK-NEXT: imulq %rsi, %r14
-; CHECK-NEXT: mulq %r9
-; CHECK-NEXT: movq %rax, %r8
-; CHECK-NEXT: addq %rax, %r14
-; CHECK-NEXT: addq %rdx, %r14
-; CHECK-NEXT: addq %rdi, %r8
-; CHECK-NEXT: adcq %r11, %r14
-; CHECK-NEXT: movq %r9, %rax
; CHECK-NEXT: mulq %r10
-; CHECK-NEXT: movq %rdx, %r11
+; CHECK-NEXT: movq %rax, %r9
+; CHECK-NEXT: movq %rdx, %r8
+; CHECK-NEXT: addq %r14, %r8
+; CHECK-NEXT: addq %rax, %r8
+; CHECK-NEXT: addq %rbx, %r9
+; CHECK-NEXT: adcq %rdi, %r8
+; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: mulq %r11
+; CHECK-NEXT: movq %rdx, %rbx
; CHECK-NEXT: movq %rax, %rdi
; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: mulq %r10
-; CHECK-NEXT: movq %rdx, %r10
-; CHECK-NEXT: movq %rax, %rbx
-; CHECK-NEXT: addq %r11, %rbx
-; CHECK-NEXT: adcq $0, %r10
-; CHECK-NEXT: movq %r9, %rax
-; CHECK-NEXT: mulq %rcx
+; CHECK-NEXT: mulq %r11
; CHECK-NEXT: movq %rdx, %r11
-; CHECK-NEXT: movq %rax, %r9
-; CHECK-NEXT: addq %rbx, %r9
-; CHECK-NEXT: adcq %r10, %r11
+; CHECK-NEXT: movq %rax, %r14
+; CHECK-NEXT: addq %rbx, %r14
+; CHECK-NEXT: adcq $0, %r11
+; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: mulq %rcx
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rax, %r10
+; CHECK-NEXT: addq %r14, %r10
+; CHECK-NEXT: adcq %r11, %rbx
; CHECK-NEXT: setb %al
-; CHECK-NEXT: movzbl %al, %r10d
+; CHECK-NEXT: movzbl %al, %r11d
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: addq %r11, %rax
-; CHECK-NEXT: adcq %r10, %rdx
-; CHECK-NEXT: addq %r8, %rax
-; CHECK-NEXT: adcq %r14, %rdx
-; CHECK-NEXT: movq %r9, %rcx
+; CHECK-NEXT: addq %rbx, %rax
+; CHECK-NEXT: adcq %r11, %rdx
+; CHECK-NEXT: addq %r9, %rax
+; CHECK-NEXT: adcq %r8, %rdx
+; CHECK-NEXT: movq %r10, %rcx
; CHECK-NEXT: sarq $63, %rcx
; CHECK-NEXT: xorq %rcx, %rdx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.2: ## %nooverflow
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq %r9, %rdx
+; CHECK-NEXT: movq %r10, %rdx
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: retq
; CHECK-NEXT: jne LBB0_17
; CHECK-NEXT: LBB0_18: ## %bb26
; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT: addl %ecx, %esi
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: addl %ecx, %edx
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; CHECK-NEXT: addl %esi, %edx
; CHECK-NEXT: jmp LBB0_23
; CHECK-NEXT: LBB0_19: ## %bb29
; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jne LBB1_17
; CHECK-NEXT: LBB1_18: ## %bb26
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: shrl %ecx
+; CHECK-NEXT: subl $4, %esp
+; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: pushl $128
+; CHECK-NEXT: pushl %edx
; CHECK-NEXT: jmp LBB1_23
; CHECK-NEXT: LBB1_19: ## %bb29
; CHECK-NEXT: testl %ebp, %ebp
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: LBB1_23: ## %bb33
; CHECK-NEXT: shrl %eax
; CHECK-NEXT: subl $4, %esp
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: pushl $128
; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: LBB1_23: ## %bb33
; CHECK-NEXT: calll _memset
; CHECK-NEXT: addl $44, %esp
; CHECK-NEXT: LBB1_25: ## %return
; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101
; X86-NOSSE-NEXT: shrl $24, %edx
-; X86-NOSSE-NEXT: addl %esi, %edx
-; X86-NOSSE-NEXT: movl %ecx, %esi
-; X86-NOSSE-NEXT: shrl %esi
-; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT: subl %esi, %ecx
-; X86-NOSSE-NEXT: movl %ecx, %esi
-; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT: movl %ecx, %edi
+; X86-NOSSE-NEXT: shrl %edi
+; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555
+; X86-NOSSE-NEXT: subl %edi, %ecx
+; X86-NOSSE-NEXT: movl %ecx, %edi
+; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333
; X86-NOSSE-NEXT: shrl $2, %ecx
; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT: addl %esi, %ecx
-; X86-NOSSE-NEXT: movl %ecx, %esi
-; X86-NOSSE-NEXT: shrl $4, %esi
-; X86-NOSSE-NEXT: addl %ecx, %esi
-; X86-NOSSE-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT: imull $16843009, %esi, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT: addl %edi, %ecx
+; X86-NOSSE-NEXT: movl %ecx, %edi
+; X86-NOSSE-NEXT: shrl $4, %edi
+; X86-NOSSE-NEXT: addl %ecx, %edi
+; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101
; X86-NOSSE-NEXT: shrl $24, %ecx
; X86-NOSSE-NEXT: addl %edx, %ecx
+; X86-NOSSE-NEXT: addl %esi, %ecx
; X86-NOSSE-NEXT: movl %ecx, (%eax)
; X86-NOSSE-NEXT: movl $0, 12(%eax)
; X86-NOSSE-NEXT: movl $0, 8(%eax)
;
; X86-POPCNT-LABEL: cnt128:
; X86-POPCNT: # %bb.0:
+; X86-POPCNT-NEXT: pushl %esi
; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx
; X86-POPCNT-NEXT: addl %ecx, %edx
; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT: addl %edx, %ecx
-; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx
-; X86-POPCNT-NEXT: addl %ecx, %edx
-; X86-POPCNT-NEXT: movl %edx, (%eax)
+; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT: addl %ecx, %esi
+; X86-POPCNT-NEXT: addl %edx, %esi
+; X86-POPCNT-NEXT: movl %esi, (%eax)
; X86-POPCNT-NEXT: movl $0, 12(%eax)
; X86-POPCNT-NEXT: movl $0, 8(%eax)
; X86-POPCNT-NEXT: movl $0, 4(%eax)
+; X86-POPCNT-NEXT: popl %esi
; X86-POPCNT-NEXT: retl $4
;
; X64-POPCNT-LABEL: cnt128:
; X86-NOSSE-NEXT: pushl %esi
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT: movl %ebx, %eax
-; X86-NOSSE-NEXT: shrl %eax
-; X86-NOSSE-NEXT: movl $1431655765, %ecx # imm = 0x55555555
-; X86-NOSSE-NEXT: andl %ecx, %eax
-; X86-NOSSE-NEXT: subl %eax, %ebx
-; X86-NOSSE-NEXT: movl $858993459, %eax # imm = 0x33333333
-; X86-NOSSE-NEXT: movl %ebx, %edi
-; X86-NOSSE-NEXT: andl %eax, %edi
+; X86-NOSSE-NEXT: movl %ebx, %ecx
+; X86-NOSSE-NEXT: shrl %ecx
+; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555
+; X86-NOSSE-NEXT: andl %edi, %ecx
+; X86-NOSSE-NEXT: subl %ecx, %ebx
+; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333
+; X86-NOSSE-NEXT: movl %ebx, %ebp
+; X86-NOSSE-NEXT: andl %ecx, %ebp
; X86-NOSSE-NEXT: shrl $2, %ebx
-; X86-NOSSE-NEXT: andl %eax, %ebx
-; X86-NOSSE-NEXT: addl %edi, %ebx
-; X86-NOSSE-NEXT: movl %ebx, %edi
-; X86-NOSSE-NEXT: shrl $4, %edi
-; X86-NOSSE-NEXT: addl %ebx, %edi
-; X86-NOSSE-NEXT: movl %esi, %ebx
-; X86-NOSSE-NEXT: shrl %ebx
; X86-NOSSE-NEXT: andl %ecx, %ebx
-; X86-NOSSE-NEXT: subl %ebx, %esi
-; X86-NOSSE-NEXT: movl %esi, %ebx
-; X86-NOSSE-NEXT: andl %eax, %ebx
-; X86-NOSSE-NEXT: shrl $2, %esi
-; X86-NOSSE-NEXT: andl %eax, %esi
-; X86-NOSSE-NEXT: addl %ebx, %esi
-; X86-NOSSE-NEXT: movl %esi, %ebx
-; X86-NOSSE-NEXT: shrl $4, %ebx
-; X86-NOSSE-NEXT: addl %esi, %ebx
-; X86-NOSSE-NEXT: movl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT: andl %esi, %edi
-; X86-NOSSE-NEXT: imull $16843009, %edi, %ebp # imm = 0x1010101
-; X86-NOSSE-NEXT: shrl $24, %ebp
-; X86-NOSSE-NEXT: andl %esi, %ebx
-; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT: shrl $24, %edi
-; X86-NOSSE-NEXT: addl %ebp, %edi
-; X86-NOSSE-NEXT: movl %edx, %ebx
+; X86-NOSSE-NEXT: addl %ebp, %ebx
+; X86-NOSSE-NEXT: movl %ebx, %ebp
+; X86-NOSSE-NEXT: shrl $4, %ebp
+; X86-NOSSE-NEXT: addl %ebx, %ebp
+; X86-NOSSE-NEXT: movl %eax, %ebx
; X86-NOSSE-NEXT: shrl %ebx
+; X86-NOSSE-NEXT: andl %edi, %ebx
+; X86-NOSSE-NEXT: subl %ebx, %eax
+; X86-NOSSE-NEXT: movl %eax, %ebx
; X86-NOSSE-NEXT: andl %ecx, %ebx
-; X86-NOSSE-NEXT: subl %ebx, %edx
-; X86-NOSSE-NEXT: movl %edx, %ebx
-; X86-NOSSE-NEXT: andl %eax, %ebx
-; X86-NOSSE-NEXT: shrl $2, %edx
-; X86-NOSSE-NEXT: andl %eax, %edx
-; X86-NOSSE-NEXT: addl %ebx, %edx
-; X86-NOSSE-NEXT: movl %edx, %ebp
+; X86-NOSSE-NEXT: shrl $2, %eax
+; X86-NOSSE-NEXT: andl %ecx, %eax
+; X86-NOSSE-NEXT: addl %ebx, %eax
+; X86-NOSSE-NEXT: movl %eax, %edi
+; X86-NOSSE-NEXT: shrl $4, %edi
+; X86-NOSSE-NEXT: addl %eax, %edi
+; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT: andl %ebx, %ebp
+; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT: shrl $24, %eax
+; X86-NOSSE-NEXT: andl %ebx, %edi
+; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101
+; X86-NOSSE-NEXT: shrl $24, %edi
+; X86-NOSSE-NEXT: addl %eax, %edi
+; X86-NOSSE-NEXT: movl %esi, %eax
+; X86-NOSSE-NEXT: shrl %eax
+; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555
+; X86-NOSSE-NEXT: andl %ebp, %eax
+; X86-NOSSE-NEXT: subl %eax, %esi
+; X86-NOSSE-NEXT: movl %esi, %eax
+; X86-NOSSE-NEXT: andl %ecx, %eax
+; X86-NOSSE-NEXT: shrl $2, %esi
+; X86-NOSSE-NEXT: andl %ecx, %esi
+; X86-NOSSE-NEXT: addl %eax, %esi
+; X86-NOSSE-NEXT: movl %esi, %ebp
; X86-NOSSE-NEXT: shrl $4, %ebp
-; X86-NOSSE-NEXT: addl %edx, %ebp
-; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT: andl %esi, %ebp
-; X86-NOSSE-NEXT: imull $16843009, %ebp, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT: addl %esi, %ebp
+; X86-NOSSE-NEXT: movl %edx, %eax
+; X86-NOSSE-NEXT: shrl %eax
+; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT: andl %esi, %eax
+; X86-NOSSE-NEXT: subl %eax, %edx
+; X86-NOSSE-NEXT: movl %edx, %eax
+; X86-NOSSE-NEXT: andl %ecx, %eax
+; X86-NOSSE-NEXT: shrl $2, %edx
+; X86-NOSSE-NEXT: andl %ecx, %edx
+; X86-NOSSE-NEXT: addl %eax, %edx
+; X86-NOSSE-NEXT: movl %edx, %eax
+; X86-NOSSE-NEXT: shrl $4, %eax
+; X86-NOSSE-NEXT: addl %edx, %eax
+; X86-NOSSE-NEXT: andl %ebx, %ebp
+; X86-NOSSE-NEXT: andl %ebx, %eax
+; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT: shrl $24, %ecx
+; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101
; X86-NOSSE-NEXT: shrl $24, %edx
-; X86-NOSSE-NEXT: addl %edi, %edx
-; X86-NOSSE-NEXT: movl %ebx, %edi
-; X86-NOSSE-NEXT: shrl %edi
-; X86-NOSSE-NEXT: andl %ecx, %edi
-; X86-NOSSE-NEXT: subl %edi, %ebx
-; X86-NOSSE-NEXT: movl %ebx, %ecx
-; X86-NOSSE-NEXT: andl %eax, %ecx
-; X86-NOSSE-NEXT: shrl $2, %ebx
-; X86-NOSSE-NEXT: andl %eax, %ebx
-; X86-NOSSE-NEXT: addl %ecx, %ebx
-; X86-NOSSE-NEXT: movl %ebx, %ecx
-; X86-NOSSE-NEXT: shrl $4, %ecx
-; X86-NOSSE-NEXT: addl %ebx, %ecx
-; X86-NOSSE-NEXT: andl %esi, %ecx
+; X86-NOSSE-NEXT: addl %ecx, %edx
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101
-; X86-NOSSE-NEXT: shrl $24, %ecx
-; X86-NOSSE-NEXT: addl %edx, %ecx
-; X86-NOSSE-NEXT: xorl %edx, %edx
-; X86-NOSSE-NEXT: movl %edx, 12(%eax)
-; X86-NOSSE-NEXT: movl %edx, 8(%eax)
-; X86-NOSSE-NEXT: movl %edx, 4(%eax)
-; X86-NOSSE-NEXT: movl %ecx, (%eax)
+; X86-NOSSE-NEXT: addl %edi, %edx
+; X86-NOSSE-NEXT: xorl %ecx, %ecx
+; X86-NOSSE-NEXT: movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT: movl %ecx, 8(%eax)
+; X86-NOSSE-NEXT: movl %ecx, 4(%eax)
+; X86-NOSSE-NEXT: movl %edx, (%eax)
; X86-NOSSE-NEXT: popl %esi
; X86-NOSSE-NEXT: popl %edi
; X86-NOSSE-NEXT: popl %ebx
;
; X86-POPCNT-LABEL: cnt128_optsize:
; X86-POPCNT: # %bb.0:
+; X86-POPCNT-NEXT: pushl %esi
; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx
; X86-POPCNT-NEXT: addl %ecx, %edx
; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT: addl %edx, %ecx
-; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx
-; X86-POPCNT-NEXT: addl %ecx, %edx
+; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT: addl %ecx, %esi
+; X86-POPCNT-NEXT: addl %edx, %esi
; X86-POPCNT-NEXT: xorl %ecx, %ecx
; X86-POPCNT-NEXT: movl %ecx, 12(%eax)
; X86-POPCNT-NEXT: movl %ecx, 8(%eax)
; X86-POPCNT-NEXT: movl %ecx, 4(%eax)
-; X86-POPCNT-NEXT: movl %edx, (%eax)
+; X86-POPCNT-NEXT: movl %esi, (%eax)
+; X86-POPCNT-NEXT: popl %esi
; X86-POPCNT-NEXT: retl $4
;
; X64-POPCNT-LABEL: cnt128_optsize:
; X86-NOSSE-NEXT: pushl %esi
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT: movl %ebx, %eax
-; X86-NOSSE-NEXT: shrl %eax
-; X86-NOSSE-NEXT: movl $1431655765, %ecx # imm = 0x55555555
-; X86-NOSSE-NEXT: andl %ecx, %eax
-; X86-NOSSE-NEXT: subl %eax, %ebx
-; X86-NOSSE-NEXT: movl $858993459, %eax # imm = 0x33333333
-; X86-NOSSE-NEXT: movl %ebx, %edi
-; X86-NOSSE-NEXT: andl %eax, %edi
+; X86-NOSSE-NEXT: movl %ebx, %ecx
+; X86-NOSSE-NEXT: shrl %ecx
+; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555
+; X86-NOSSE-NEXT: andl %edi, %ecx
+; X86-NOSSE-NEXT: subl %ecx, %ebx
+; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333
+; X86-NOSSE-NEXT: movl %ebx, %ebp
+; X86-NOSSE-NEXT: andl %ecx, %ebp
; X86-NOSSE-NEXT: shrl $2, %ebx
-; X86-NOSSE-NEXT: andl %eax, %ebx
-; X86-NOSSE-NEXT: addl %edi, %ebx
-; X86-NOSSE-NEXT: movl %ebx, %edi
-; X86-NOSSE-NEXT: shrl $4, %edi
-; X86-NOSSE-NEXT: addl %ebx, %edi
-; X86-NOSSE-NEXT: movl %esi, %ebx
-; X86-NOSSE-NEXT: shrl %ebx
; X86-NOSSE-NEXT: andl %ecx, %ebx
-; X86-NOSSE-NEXT: subl %ebx, %esi
-; X86-NOSSE-NEXT: movl %esi, %ebx
-; X86-NOSSE-NEXT: andl %eax, %ebx
-; X86-NOSSE-NEXT: shrl $2, %esi
-; X86-NOSSE-NEXT: andl %eax, %esi
-; X86-NOSSE-NEXT: addl %ebx, %esi
-; X86-NOSSE-NEXT: movl %esi, %ebx
-; X86-NOSSE-NEXT: shrl $4, %ebx
-; X86-NOSSE-NEXT: addl %esi, %ebx
-; X86-NOSSE-NEXT: movl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT: andl %esi, %edi
-; X86-NOSSE-NEXT: imull $16843009, %edi, %ebp # imm = 0x1010101
-; X86-NOSSE-NEXT: shrl $24, %ebp
-; X86-NOSSE-NEXT: andl %esi, %ebx
-; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT: shrl $24, %edi
-; X86-NOSSE-NEXT: addl %ebp, %edi
-; X86-NOSSE-NEXT: movl %edx, %ebx
+; X86-NOSSE-NEXT: addl %ebp, %ebx
+; X86-NOSSE-NEXT: movl %ebx, %ebp
+; X86-NOSSE-NEXT: shrl $4, %ebp
+; X86-NOSSE-NEXT: addl %ebx, %ebp
+; X86-NOSSE-NEXT: movl %eax, %ebx
; X86-NOSSE-NEXT: shrl %ebx
+; X86-NOSSE-NEXT: andl %edi, %ebx
+; X86-NOSSE-NEXT: subl %ebx, %eax
+; X86-NOSSE-NEXT: movl %eax, %ebx
; X86-NOSSE-NEXT: andl %ecx, %ebx
-; X86-NOSSE-NEXT: subl %ebx, %edx
-; X86-NOSSE-NEXT: movl %edx, %ebx
-; X86-NOSSE-NEXT: andl %eax, %ebx
-; X86-NOSSE-NEXT: shrl $2, %edx
-; X86-NOSSE-NEXT: andl %eax, %edx
-; X86-NOSSE-NEXT: addl %ebx, %edx
-; X86-NOSSE-NEXT: movl %edx, %ebp
+; X86-NOSSE-NEXT: shrl $2, %eax
+; X86-NOSSE-NEXT: andl %ecx, %eax
+; X86-NOSSE-NEXT: addl %ebx, %eax
+; X86-NOSSE-NEXT: movl %eax, %edi
+; X86-NOSSE-NEXT: shrl $4, %edi
+; X86-NOSSE-NEXT: addl %eax, %edi
+; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT: andl %ebx, %ebp
+; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT: shrl $24, %eax
+; X86-NOSSE-NEXT: andl %ebx, %edi
+; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101
+; X86-NOSSE-NEXT: shrl $24, %edi
+; X86-NOSSE-NEXT: addl %eax, %edi
+; X86-NOSSE-NEXT: movl %esi, %eax
+; X86-NOSSE-NEXT: shrl %eax
+; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555
+; X86-NOSSE-NEXT: andl %ebp, %eax
+; X86-NOSSE-NEXT: subl %eax, %esi
+; X86-NOSSE-NEXT: movl %esi, %eax
+; X86-NOSSE-NEXT: andl %ecx, %eax
+; X86-NOSSE-NEXT: shrl $2, %esi
+; X86-NOSSE-NEXT: andl %ecx, %esi
+; X86-NOSSE-NEXT: addl %eax, %esi
+; X86-NOSSE-NEXT: movl %esi, %ebp
; X86-NOSSE-NEXT: shrl $4, %ebp
-; X86-NOSSE-NEXT: addl %edx, %ebp
-; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT: andl %esi, %ebp
-; X86-NOSSE-NEXT: imull $16843009, %ebp, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT: addl %esi, %ebp
+; X86-NOSSE-NEXT: movl %edx, %eax
+; X86-NOSSE-NEXT: shrl %eax
+; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT: andl %esi, %eax
+; X86-NOSSE-NEXT: subl %eax, %edx
+; X86-NOSSE-NEXT: movl %edx, %eax
+; X86-NOSSE-NEXT: andl %ecx, %eax
+; X86-NOSSE-NEXT: shrl $2, %edx
+; X86-NOSSE-NEXT: andl %ecx, %edx
+; X86-NOSSE-NEXT: addl %eax, %edx
+; X86-NOSSE-NEXT: movl %edx, %eax
+; X86-NOSSE-NEXT: shrl $4, %eax
+; X86-NOSSE-NEXT: addl %edx, %eax
+; X86-NOSSE-NEXT: andl %ebx, %ebp
+; X86-NOSSE-NEXT: andl %ebx, %eax
+; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT: shrl $24, %ecx
+; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101
; X86-NOSSE-NEXT: shrl $24, %edx
-; X86-NOSSE-NEXT: addl %edi, %edx
-; X86-NOSSE-NEXT: movl %ebx, %edi
-; X86-NOSSE-NEXT: shrl %edi
-; X86-NOSSE-NEXT: andl %ecx, %edi
-; X86-NOSSE-NEXT: subl %edi, %ebx
-; X86-NOSSE-NEXT: movl %ebx, %ecx
-; X86-NOSSE-NEXT: andl %eax, %ecx
-; X86-NOSSE-NEXT: shrl $2, %ebx
-; X86-NOSSE-NEXT: andl %eax, %ebx
-; X86-NOSSE-NEXT: addl %ecx, %ebx
-; X86-NOSSE-NEXT: movl %ebx, %ecx
-; X86-NOSSE-NEXT: shrl $4, %ecx
-; X86-NOSSE-NEXT: addl %ebx, %ecx
-; X86-NOSSE-NEXT: andl %esi, %ecx
+; X86-NOSSE-NEXT: addl %ecx, %edx
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101
-; X86-NOSSE-NEXT: shrl $24, %ecx
-; X86-NOSSE-NEXT: addl %edx, %ecx
-; X86-NOSSE-NEXT: xorl %edx, %edx
-; X86-NOSSE-NEXT: movl %edx, 12(%eax)
-; X86-NOSSE-NEXT: movl %edx, 8(%eax)
-; X86-NOSSE-NEXT: movl %edx, 4(%eax)
-; X86-NOSSE-NEXT: movl %ecx, (%eax)
+; X86-NOSSE-NEXT: addl %edi, %edx
+; X86-NOSSE-NEXT: xorl %ecx, %ecx
+; X86-NOSSE-NEXT: movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT: movl %ecx, 8(%eax)
+; X86-NOSSE-NEXT: movl %ecx, 4(%eax)
+; X86-NOSSE-NEXT: movl %edx, (%eax)
; X86-NOSSE-NEXT: popl %esi
; X86-NOSSE-NEXT: popl %edi
; X86-NOSSE-NEXT: popl %ebx
;
; X86-POPCNT-LABEL: cnt128_pgso:
; X86-POPCNT: # %bb.0:
+; X86-POPCNT-NEXT: pushl %esi
; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx
; X86-POPCNT-NEXT: addl %ecx, %edx
; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT: addl %edx, %ecx
-; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx
-; X86-POPCNT-NEXT: addl %ecx, %edx
+; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT: addl %ecx, %esi
+; X86-POPCNT-NEXT: addl %edx, %esi
; X86-POPCNT-NEXT: xorl %ecx, %ecx
; X86-POPCNT-NEXT: movl %ecx, 12(%eax)
; X86-POPCNT-NEXT: movl %ecx, 8(%eax)
; X86-POPCNT-NEXT: movl %ecx, 4(%eax)
-; X86-POPCNT-NEXT: movl %edx, (%eax)
+; X86-POPCNT-NEXT: movl %esi, (%eax)
+; X86-POPCNT-NEXT: popl %esi
; X86-POPCNT-NEXT: retl $4
;
; X64-POPCNT-LABEL: cnt128_pgso:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: imull %edx
; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $7, %edi
+; CHECK-NEXT: addl %eax, %edi
; CHECK-NEXT: imull $36525, %esi, %eax # imm = 0x8EAD
; CHECK-NEXT: addl $172251900, %eax # imm = 0xA445AFC
; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: sarl $5, %edx
; CHECK-NEXT: addl %eax, %edx
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl 16(%ebx), %ecx
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: sarl $7, %edi
; CHECK-NEXT: addl %edi, %ecx
; CHECK-NEXT: leal 257(%ecx,%edx), %eax
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; SSE2-LABEL: veccond512:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: je .LBB2_2
; SSE2-NEXT: # %bb.1: # %if-true-block
; SSE41-LABEL: veccond512:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: je .LBB2_2
; SSE41-NEXT: # %bb.1: # %if-true-block
; SSE41-NEXT: xorl %eax, %eax
; SSE2-LABEL: vectest512:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %ecx
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %ecx
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE41-LABEL: vectest512:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; SSE2: # %bb.0:
; SSE2-NEXT: movl %edi, %eax
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %ecx
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %ecx
; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: cmovel %esi, %eax
; SSE2-NEXT: retq
; SSE41: # %bb.0:
; SSE41-NEXT: movl %edi, %eax
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: cmovel %esi, %eax
; SSE41-NEXT: retq
;
define i32 @extra_maskop_uses2(i32 %a) {
; X86-LABEL: extra_maskop_uses2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shll $8, %edx
-; X86-NEXT: shrl $8, %ecx
-; X86-NEXT: andl $-16711936, %edx # imm = 0xFF00FF00
-; X86-NEXT: andl $16711935, %ecx # imm = 0xFF00FF
-; X86-NEXT: leal (%ecx,%edx), %eax
-; X86-NEXT: imull %edx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $8, %ecx
+; X86-NEXT: shrl $8, %eax
+; X86-NEXT: andl $-16711936, %ecx # imm = 0xFF00FF00
+; X86-NEXT: andl $16711935, %eax # imm = 0xFF00FF
+; X86-NEXT: leal (%eax,%ecx), %edx
; X86-NEXT: imull %ecx, %eax
+; X86-NEXT: imull %edx, %eax
; X86-NEXT: retl
;
; X64-LABEL: extra_maskop_uses2:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: shll $8, %ecx
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shll $8, %eax
; X64-NEXT: shrl $8, %edi
-; X64-NEXT: andl $-16711936, %ecx # imm = 0xFF00FF00
+; X64-NEXT: andl $-16711936, %eax # imm = 0xFF00FF00
; X64-NEXT: andl $16711935, %edi # imm = 0xFF00FF
-; X64-NEXT: leal (%rdi,%rcx), %eax
-; X64-NEXT: imull %ecx, %eax
+; X64-NEXT: leal (%rdi,%rax), %ecx
; X64-NEXT: imull %edi, %eax
+; X64-NEXT: imull %ecx, %eax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%l8 = shl i32 %a, 8
%r8 = lshr i32 %a, 8
; CHECK: # %bb.0: # %b0
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shll $7, %eax
+; CHECK-NEXT: orl %esi, %eax
; CHECK-NEXT: roll $9, %edi
-; CHECK-NEXT: orl %esi, %edi
; CHECK-NEXT: orl %edi, %eax
; CHECK-NEXT: retq
b0:
; CHECK-NEXT: shrl $21, %edi
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: shll $19, %eax
+; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: shrl $13, %esi
; CHECK-NEXT: orl %edi, %esi
; CHECK-NEXT: orl %esi, %eax
-; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: retq
%v0 = shl i32 %a0, 11
%v1 = lshr i32 %a0, 21
; CHECK-LABEL: f3:
; CHECK: # %bb.0: # %b0
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal (,%rdi,8), %ecx
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shll $5, %eax
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shll $7, %edx
-; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: leal (,%rdi,8), %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: shll $5, %ecx
+; CHECK-NEXT: orl %eax, %ecx
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shll $13, %eax
-; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: shll $7, %eax
; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shll $19, %edx
+; CHECK-NEXT: shll $13, %edx
; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: orl %ecx, %edx
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shrl $2, %eax
-; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shrl $15, %edx
-; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: shll $19, %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: shrl $2, %ecx
+; CHECK-NEXT: orl %eax, %ecx
; CHECK-NEXT: movl %edi, %esi
-; CHECK-NEXT: shrl $23, %esi
+; CHECK-NEXT: shrl $15, %esi
+; CHECK-NEXT: orl %ecx, %esi
; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: shrl $23, %ecx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $25, %eax
-; CHECK-NEXT: orl %esi, %eax
+; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: shrl $30, %edi
; CHECK-NEXT: orl %edi, %eax
-; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: orl %esi, %eax
; CHECK-NEXT: retq
b0:
%v0 = shl i32 %a0, 3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
; SSE2-NEXT: psadbw b+1024(%rax), %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: addq $32, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_32i8:
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm5
+; SSE2-NEXT: paddd %xmm0, %xmm5
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT: paddd %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
;
; AVX1-LABEL: sad_avx64i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm5, %xmm5
; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm6
; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $64, %rax
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %middle.block
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm7
-; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm8
-; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm8
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm7
+; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm8
+; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm8
+; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1
-; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: movdqu 32(%rdi), %xmm1
; SSE2-NEXT: psadbw %xmm2, %xmm1
+; SSE2-NEXT: paddq %xmm4, %xmm1
; SSE2-NEXT: movdqu 48(%rdi), %xmm2
; SSE2-NEXT: psadbw %xmm3, %xmm2
; SSE2-NEXT: paddq %xmm0, %xmm2
; SSE2-NEXT: paddq %xmm1, %xmm2
-; SSE2-NEXT: paddq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: xorq %rsi, %rdx
+; SSE2-NEXT: orq %r11, %rdx
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rsi
; SSE2-NEXT: xorq %rdi, %rsi
; SSE2-NEXT: xorq %r8, %rdi
; SSE2-NEXT: orq %rsi, %rdi
; SSE2-NEXT: orq %rdx, %rdi
-; SSE2-NEXT: orq %r11, %rdi
; SSE2-NEXT: movq %xmm4, %rdx
; SSE2-NEXT: xorq %r9, %rdx
; SSE2-NEXT: movq %xmm6, %rsi
; SSE2-NEXT: xorq %r10, %rsi
-; SSE2-NEXT: movq %xmm5, %r8
-; SSE2-NEXT: xorq %rcx, %r8
+; SSE2-NEXT: orq %rdx, %rsi
+; SSE2-NEXT: movq %xmm5, %rdx
+; SSE2-NEXT: xorq %rcx, %rdx
; SSE2-NEXT: movq %xmm7, %rcx
; SSE2-NEXT: xorq %rax, %rcx
-; SSE2-NEXT: orq %r8, %rcx
-; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: orq %rdi, %rcx
; SSE2-NEXT: setne %al
;
; SSE41-LABEL: ne_i512:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: movq %xmm2, %rcx
-; SSE41-NEXT: movq %xmm1, %rdx
-; SSE41-NEXT: movq %xmm3, %rsi
-; SSE41-NEXT: pextrq $1, %xmm0, %rdi
-; SSE41-NEXT: pextrq $1, %xmm2, %r8
-; SSE41-NEXT: pextrq $1, %xmm1, %r9
-; SSE41-NEXT: pextrq $1, %xmm3, %r10
+; SSE41-NEXT: movq %xmm0, %rcx
+; SSE41-NEXT: movq %xmm2, %rdx
+; SSE41-NEXT: movq %xmm1, %rsi
+; SSE41-NEXT: movq %xmm3, %rdi
+; SSE41-NEXT: pextrq $1, %xmm0, %r8
+; SSE41-NEXT: pextrq $1, %xmm2, %r9
+; SSE41-NEXT: pextrq $1, %xmm1, %r10
+; SSE41-NEXT: pextrq $1, %xmm3, %rax
; SSE41-NEXT: movq %xmm4, %r11
-; SSE41-NEXT: xorq %rax, %r11
-; SSE41-NEXT: movq %xmm6, %rax
-; SSE41-NEXT: xorq %rcx, %rax
-; SSE41-NEXT: movq %xmm5, %rcx
+; SSE41-NEXT: xorq %rcx, %r11
+; SSE41-NEXT: movq %xmm6, %rcx
; SSE41-NEXT: xorq %rdx, %rcx
-; SSE41-NEXT: movq %xmm7, %rdx
+; SSE41-NEXT: orq %r11, %rcx
+; SSE41-NEXT: movq %xmm5, %rdx
; SSE41-NEXT: xorq %rsi, %rdx
-; SSE41-NEXT: orq %rcx, %rdx
-; SSE41-NEXT: orq %rax, %rdx
-; SSE41-NEXT: orq %r11, %rdx
-; SSE41-NEXT: pextrq $1, %xmm4, %rax
-; SSE41-NEXT: xorq %rdi, %rax
-; SSE41-NEXT: pextrq $1, %xmm6, %rcx
+; SSE41-NEXT: movq %xmm7, %rsi
+; SSE41-NEXT: xorq %rdi, %rsi
+; SSE41-NEXT: orq %rdx, %rsi
+; SSE41-NEXT: orq %rcx, %rsi
+; SSE41-NEXT: pextrq $1, %xmm4, %rcx
; SSE41-NEXT: xorq %r8, %rcx
-; SSE41-NEXT: pextrq $1, %xmm5, %rsi
-; SSE41-NEXT: xorq %r9, %rsi
+; SSE41-NEXT: pextrq $1, %xmm6, %rdx
+; SSE41-NEXT: xorq %r9, %rdx
+; SSE41-NEXT: orq %rcx, %rdx
+; SSE41-NEXT: pextrq $1, %xmm5, %rcx
+; SSE41-NEXT: xorq %r10, %rcx
; SSE41-NEXT: pextrq $1, %xmm7, %rdi
-; SSE41-NEXT: xorq %r10, %rdi
-; SSE41-NEXT: orq %rsi, %rdi
+; SSE41-NEXT: xorq %rax, %rdi
; SSE41-NEXT: orq %rcx, %rdi
-; SSE41-NEXT: orq %rax, %rdi
-; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: orq %rdx, %rdi
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: orq %rsi, %rdi
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; AVX1-NEXT: xorq %rdx, %r11
; AVX1-NEXT: vmovq %xmm3, %rdx
; AVX1-NEXT: xorq %rsi, %rdx
+; AVX1-NEXT: orq %r11, %rdx
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rsi
; AVX1-NEXT: xorq %rdi, %rsi
; AVX1-NEXT: xorq %r8, %rdi
; AVX1-NEXT: orq %rsi, %rdi
; AVX1-NEXT: orq %rdx, %rdi
-; AVX1-NEXT: orq %r11, %rdi
; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
; AVX1-NEXT: xorq %r9, %rdx
; AVX1-NEXT: vpextrq $1, %xmm3, %rsi
; AVX1-NEXT: xorq %r10, %rsi
-; AVX1-NEXT: vpextrq $1, %xmm0, %r8
-; AVX1-NEXT: xorq %rcx, %r8
+; AVX1-NEXT: orq %rdx, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: xorq %rcx, %rdx
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: orq %r8, %rcx
-; AVX1-NEXT: orq %rsi, %rcx
; AVX1-NEXT: orq %rdx, %rcx
+; AVX1-NEXT: orq %rsi, %rcx
; AVX1-NEXT: xorl %eax, %eax
; AVX1-NEXT: orq %rdi, %rcx
; AVX1-NEXT: setne %al
; AVX2-NEXT: xorq %rdx, %r11
; AVX2-NEXT: vmovq %xmm3, %rdx
; AVX2-NEXT: xorq %rsi, %rdx
+; AVX2-NEXT: orq %r11, %rdx
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rsi
; AVX2-NEXT: xorq %rdi, %rsi
; AVX2-NEXT: xorq %r8, %rdi
; AVX2-NEXT: orq %rsi, %rdi
; AVX2-NEXT: orq %rdx, %rdi
-; AVX2-NEXT: orq %r11, %rdi
; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
; AVX2-NEXT: xorq %r9, %rdx
; AVX2-NEXT: vpextrq $1, %xmm3, %rsi
; AVX2-NEXT: xorq %r10, %rsi
-; AVX2-NEXT: vpextrq $1, %xmm0, %r8
-; AVX2-NEXT: xorq %rcx, %r8
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: xorq %rcx, %rdx
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: xorq %rax, %rcx
-; AVX2-NEXT: orq %r8, %rcx
-; AVX2-NEXT: orq %rsi, %rcx
; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: orq %rdi, %rcx
; AVX2-NEXT: setne %al
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: xorq %rsi, %rdx
+; SSE2-NEXT: orq %r11, %rdx
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rsi
; SSE2-NEXT: xorq %rdi, %rsi
; SSE2-NEXT: xorq %r8, %rdi
; SSE2-NEXT: orq %rsi, %rdi
; SSE2-NEXT: orq %rdx, %rdi
-; SSE2-NEXT: orq %r11, %rdi
; SSE2-NEXT: movq %xmm4, %rdx
; SSE2-NEXT: xorq %r9, %rdx
; SSE2-NEXT: movq %xmm6, %rsi
; SSE2-NEXT: xorq %r10, %rsi
-; SSE2-NEXT: movq %xmm5, %r8
-; SSE2-NEXT: xorq %rcx, %r8
+; SSE2-NEXT: orq %rdx, %rsi
+; SSE2-NEXT: movq %xmm5, %rdx
+; SSE2-NEXT: xorq %rcx, %rdx
; SSE2-NEXT: movq %xmm7, %rcx
; SSE2-NEXT: xorq %rax, %rcx
-; SSE2-NEXT: orq %r8, %rcx
-; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: orq %rdi, %rcx
; SSE2-NEXT: sete %al
;
; SSE41-LABEL: eq_i512:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: movq %xmm2, %rcx
-; SSE41-NEXT: movq %xmm1, %rdx
-; SSE41-NEXT: movq %xmm3, %rsi
-; SSE41-NEXT: pextrq $1, %xmm0, %rdi
-; SSE41-NEXT: pextrq $1, %xmm2, %r8
-; SSE41-NEXT: pextrq $1, %xmm1, %r9
-; SSE41-NEXT: pextrq $1, %xmm3, %r10
+; SSE41-NEXT: movq %xmm0, %rcx
+; SSE41-NEXT: movq %xmm2, %rdx
+; SSE41-NEXT: movq %xmm1, %rsi
+; SSE41-NEXT: movq %xmm3, %rdi
+; SSE41-NEXT: pextrq $1, %xmm0, %r8
+; SSE41-NEXT: pextrq $1, %xmm2, %r9
+; SSE41-NEXT: pextrq $1, %xmm1, %r10
+; SSE41-NEXT: pextrq $1, %xmm3, %rax
; SSE41-NEXT: movq %xmm4, %r11
-; SSE41-NEXT: xorq %rax, %r11
-; SSE41-NEXT: movq %xmm6, %rax
-; SSE41-NEXT: xorq %rcx, %rax
-; SSE41-NEXT: movq %xmm5, %rcx
+; SSE41-NEXT: xorq %rcx, %r11
+; SSE41-NEXT: movq %xmm6, %rcx
; SSE41-NEXT: xorq %rdx, %rcx
-; SSE41-NEXT: movq %xmm7, %rdx
+; SSE41-NEXT: orq %r11, %rcx
+; SSE41-NEXT: movq %xmm5, %rdx
; SSE41-NEXT: xorq %rsi, %rdx
-; SSE41-NEXT: orq %rcx, %rdx
-; SSE41-NEXT: orq %rax, %rdx
-; SSE41-NEXT: orq %r11, %rdx
-; SSE41-NEXT: pextrq $1, %xmm4, %rax
-; SSE41-NEXT: xorq %rdi, %rax
-; SSE41-NEXT: pextrq $1, %xmm6, %rcx
+; SSE41-NEXT: movq %xmm7, %rsi
+; SSE41-NEXT: xorq %rdi, %rsi
+; SSE41-NEXT: orq %rdx, %rsi
+; SSE41-NEXT: orq %rcx, %rsi
+; SSE41-NEXT: pextrq $1, %xmm4, %rcx
; SSE41-NEXT: xorq %r8, %rcx
-; SSE41-NEXT: pextrq $1, %xmm5, %rsi
-; SSE41-NEXT: xorq %r9, %rsi
+; SSE41-NEXT: pextrq $1, %xmm6, %rdx
+; SSE41-NEXT: xorq %r9, %rdx
+; SSE41-NEXT: orq %rcx, %rdx
+; SSE41-NEXT: pextrq $1, %xmm5, %rcx
+; SSE41-NEXT: xorq %r10, %rcx
; SSE41-NEXT: pextrq $1, %xmm7, %rdi
-; SSE41-NEXT: xorq %r10, %rdi
-; SSE41-NEXT: orq %rsi, %rdi
+; SSE41-NEXT: xorq %rax, %rdi
; SSE41-NEXT: orq %rcx, %rdi
-; SSE41-NEXT: orq %rax, %rdi
-; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: orq %rdx, %rdi
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: orq %rsi, %rdi
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX1-NEXT: xorq %rdx, %r11
; AVX1-NEXT: vmovq %xmm3, %rdx
; AVX1-NEXT: xorq %rsi, %rdx
+; AVX1-NEXT: orq %r11, %rdx
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rsi
; AVX1-NEXT: xorq %rdi, %rsi
; AVX1-NEXT: xorq %r8, %rdi
; AVX1-NEXT: orq %rsi, %rdi
; AVX1-NEXT: orq %rdx, %rdi
-; AVX1-NEXT: orq %r11, %rdi
; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
; AVX1-NEXT: xorq %r9, %rdx
; AVX1-NEXT: vpextrq $1, %xmm3, %rsi
; AVX1-NEXT: xorq %r10, %rsi
-; AVX1-NEXT: vpextrq $1, %xmm0, %r8
-; AVX1-NEXT: xorq %rcx, %r8
+; AVX1-NEXT: orq %rdx, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: xorq %rcx, %rdx
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: orq %r8, %rcx
-; AVX1-NEXT: orq %rsi, %rcx
; AVX1-NEXT: orq %rdx, %rcx
+; AVX1-NEXT: orq %rsi, %rcx
; AVX1-NEXT: xorl %eax, %eax
; AVX1-NEXT: orq %rdi, %rcx
; AVX1-NEXT: sete %al
; AVX2-NEXT: xorq %rdx, %r11
; AVX2-NEXT: vmovq %xmm3, %rdx
; AVX2-NEXT: xorq %rsi, %rdx
+; AVX2-NEXT: orq %r11, %rdx
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rsi
; AVX2-NEXT: xorq %rdi, %rsi
; AVX2-NEXT: xorq %r8, %rdi
; AVX2-NEXT: orq %rsi, %rdi
; AVX2-NEXT: orq %rdx, %rdi
-; AVX2-NEXT: orq %r11, %rdi
; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
; AVX2-NEXT: xorq %r9, %rdx
; AVX2-NEXT: vpextrq $1, %xmm3, %rsi
; AVX2-NEXT: xorq %r10, %rsi
-; AVX2-NEXT: vpextrq $1, %xmm0, %r8
-; AVX2-NEXT: xorq %rcx, %r8
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: xorq %rcx, %rdx
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: xorq %rax, %rcx
-; AVX2-NEXT: orq %r8, %rcx
-; AVX2-NEXT: orq %rsi, %rcx
; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: orq %rdi, %rcx
; AVX2-NEXT: sete %al
define i32 @ne_i256_pair(ptr %a, ptr %b) {
; SSE2-LABEL: ne_i256_pair:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq 16(%rdi), %rcx
-; SSE2-NEXT: movq 24(%rdi), %rdx
-; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: movq 16(%rdi), %rax
+; SSE2-NEXT: movq 24(%rdi), %rcx
+; SSE2-NEXT: movq (%rdi), %rdx
; SSE2-NEXT: movq 8(%rdi), %r8
; SSE2-NEXT: xorq 8(%rsi), %r8
-; SSE2-NEXT: xorq 24(%rsi), %rdx
-; SSE2-NEXT: xorq (%rsi), %rax
-; SSE2-NEXT: xorq 16(%rsi), %rcx
+; SSE2-NEXT: xorq 24(%rsi), %rcx
+; SSE2-NEXT: xorq (%rsi), %rdx
+; SSE2-NEXT: xorq 16(%rsi), %rax
; SSE2-NEXT: movq 48(%rdi), %r9
; SSE2-NEXT: movq 32(%rdi), %r10
; SSE2-NEXT: movq 56(%rdi), %r11
; SSE2-NEXT: movq 40(%rdi), %rdi
; SSE2-NEXT: xorq 40(%rsi), %rdi
+; SSE2-NEXT: orq %r8, %rdi
; SSE2-NEXT: xorq 56(%rsi), %r11
-; SSE2-NEXT: orq %rdx, %r11
+; SSE2-NEXT: orq %rcx, %r11
; SSE2-NEXT: orq %rdi, %r11
-; SSE2-NEXT: orq %r8, %r11
; SSE2-NEXT: xorq 32(%rsi), %r10
+; SSE2-NEXT: orq %rdx, %r10
; SSE2-NEXT: xorq 48(%rsi), %r9
-; SSE2-NEXT: orq %rcx, %r9
-; SSE2-NEXT: orq %r10, %r9
; SSE2-NEXT: orq %rax, %r9
+; SSE2-NEXT: orq %r10, %r9
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: orq %r11, %r9
; SSE2-NEXT: setne %al
;
; SSE41-LABEL: ne_i256_pair:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq 16(%rdi), %rcx
-; SSE41-NEXT: movq 24(%rdi), %rdx
-; SSE41-NEXT: movq (%rdi), %rax
+; SSE41-NEXT: movq 16(%rdi), %rax
+; SSE41-NEXT: movq 24(%rdi), %rcx
+; SSE41-NEXT: movq (%rdi), %rdx
; SSE41-NEXT: movq 8(%rdi), %r8
; SSE41-NEXT: xorq 8(%rsi), %r8
-; SSE41-NEXT: xorq 24(%rsi), %rdx
-; SSE41-NEXT: xorq (%rsi), %rax
-; SSE41-NEXT: xorq 16(%rsi), %rcx
+; SSE41-NEXT: xorq 24(%rsi), %rcx
+; SSE41-NEXT: xorq (%rsi), %rdx
+; SSE41-NEXT: xorq 16(%rsi), %rax
; SSE41-NEXT: movq 48(%rdi), %r9
; SSE41-NEXT: movq 32(%rdi), %r10
; SSE41-NEXT: movq 56(%rdi), %r11
; SSE41-NEXT: movq 40(%rdi), %rdi
; SSE41-NEXT: xorq 40(%rsi), %rdi
+; SSE41-NEXT: orq %r8, %rdi
; SSE41-NEXT: xorq 56(%rsi), %r11
-; SSE41-NEXT: orq %rdx, %r11
+; SSE41-NEXT: orq %rcx, %r11
; SSE41-NEXT: orq %rdi, %r11
-; SSE41-NEXT: orq %r8, %r11
; SSE41-NEXT: xorq 32(%rsi), %r10
+; SSE41-NEXT: orq %rdx, %r10
; SSE41-NEXT: xorq 48(%rsi), %r9
-; SSE41-NEXT: orq %rcx, %r9
-; SSE41-NEXT: orq %r10, %r9
; SSE41-NEXT: orq %rax, %r9
+; SSE41-NEXT: orq %r10, %r9
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: orq %r11, %r9
; SSE41-NEXT: setne %al
define i32 @eq_i256_pair(ptr %a, ptr %b) {
; SSE2-LABEL: eq_i256_pair:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq 16(%rdi), %rcx
-; SSE2-NEXT: movq 24(%rdi), %rdx
-; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: movq 16(%rdi), %rax
+; SSE2-NEXT: movq 24(%rdi), %rcx
+; SSE2-NEXT: movq (%rdi), %rdx
; SSE2-NEXT: movq 8(%rdi), %r8
; SSE2-NEXT: xorq 8(%rsi), %r8
-; SSE2-NEXT: xorq 24(%rsi), %rdx
-; SSE2-NEXT: xorq (%rsi), %rax
-; SSE2-NEXT: xorq 16(%rsi), %rcx
+; SSE2-NEXT: xorq 24(%rsi), %rcx
+; SSE2-NEXT: xorq (%rsi), %rdx
+; SSE2-NEXT: xorq 16(%rsi), %rax
; SSE2-NEXT: movq 48(%rdi), %r9
; SSE2-NEXT: movq 32(%rdi), %r10
; SSE2-NEXT: movq 56(%rdi), %r11
; SSE2-NEXT: movq 40(%rdi), %rdi
; SSE2-NEXT: xorq 40(%rsi), %rdi
+; SSE2-NEXT: orq %r8, %rdi
; SSE2-NEXT: xorq 56(%rsi), %r11
-; SSE2-NEXT: orq %rdx, %r11
+; SSE2-NEXT: orq %rcx, %r11
; SSE2-NEXT: orq %rdi, %r11
-; SSE2-NEXT: orq %r8, %r11
; SSE2-NEXT: xorq 32(%rsi), %r10
+; SSE2-NEXT: orq %rdx, %r10
; SSE2-NEXT: xorq 48(%rsi), %r9
-; SSE2-NEXT: orq %rcx, %r9
-; SSE2-NEXT: orq %r10, %r9
; SSE2-NEXT: orq %rax, %r9
+; SSE2-NEXT: orq %r10, %r9
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: orq %r11, %r9
; SSE2-NEXT: sete %al
;
; SSE41-LABEL: eq_i256_pair:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq 16(%rdi), %rcx
-; SSE41-NEXT: movq 24(%rdi), %rdx
-; SSE41-NEXT: movq (%rdi), %rax
+; SSE41-NEXT: movq 16(%rdi), %rax
+; SSE41-NEXT: movq 24(%rdi), %rcx
+; SSE41-NEXT: movq (%rdi), %rdx
; SSE41-NEXT: movq 8(%rdi), %r8
; SSE41-NEXT: xorq 8(%rsi), %r8
-; SSE41-NEXT: xorq 24(%rsi), %rdx
-; SSE41-NEXT: xorq (%rsi), %rax
-; SSE41-NEXT: xorq 16(%rsi), %rcx
+; SSE41-NEXT: xorq 24(%rsi), %rcx
+; SSE41-NEXT: xorq (%rsi), %rdx
+; SSE41-NEXT: xorq 16(%rsi), %rax
; SSE41-NEXT: movq 48(%rdi), %r9
; SSE41-NEXT: movq 32(%rdi), %r10
; SSE41-NEXT: movq 56(%rdi), %r11
; SSE41-NEXT: movq 40(%rdi), %rdi
; SSE41-NEXT: xorq 40(%rsi), %rdi
+; SSE41-NEXT: orq %r8, %rdi
; SSE41-NEXT: xorq 56(%rsi), %r11
-; SSE41-NEXT: orq %rdx, %r11
+; SSE41-NEXT: orq %rcx, %r11
; SSE41-NEXT: orq %rdi, %r11
-; SSE41-NEXT: orq %r8, %r11
; SSE41-NEXT: xorq 32(%rsi), %r10
+; SSE41-NEXT: orq %rdx, %r10
; SSE41-NEXT: xorq 48(%rsi), %r9
-; SSE41-NEXT: orq %rcx, %r9
-; SSE41-NEXT: orq %r10, %r9
; SSE41-NEXT: orq %rax, %r9
+; SSE41-NEXT: orq %r10, %r9
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: orq %r11, %r9
; SSE41-NEXT: sete %al
define i32 @ne_i512_pair(ptr %a, ptr %b) {
; NO512-LABEL: ne_i512_pair:
; NO512: # %bb.0:
-; NO512-NEXT: movq 32(%rdi), %rax
-; NO512-NEXT: movq 48(%rdi), %rcx
-; NO512-NEXT: movq 40(%rdi), %rdx
-; NO512-NEXT: movq 56(%rdi), %r8
-; NO512-NEXT: xorq 56(%rsi), %r8
-; NO512-NEXT: movq 120(%rdi), %r9
-; NO512-NEXT: xorq 120(%rsi), %r9
-; NO512-NEXT: orq %r8, %r9
+; NO512-NEXT: movq 40(%rdi), %rax
+; NO512-NEXT: movq 56(%rdi), %rcx
+; NO512-NEXT: movq 24(%rdi), %rdx
+; NO512-NEXT: xorq 24(%rsi), %rdx
+; NO512-NEXT: xorq 56(%rsi), %rcx
; NO512-NEXT: movq 88(%rdi), %r8
; NO512-NEXT: xorq 88(%rsi), %r8
-; NO512-NEXT: orq %r8, %r9
-; NO512-NEXT: movq 24(%rdi), %r8
-; NO512-NEXT: xorq 24(%rsi), %r8
-; NO512-NEXT: xorq 40(%rsi), %rdx
-; NO512-NEXT: orq %r8, %r9
-; NO512-NEXT: movq 104(%rdi), %r8
-; NO512-NEXT: xorq 104(%rsi), %r8
; NO512-NEXT: orq %rdx, %r8
-; NO512-NEXT: movq 72(%rdi), %rdx
-; NO512-NEXT: xorq 72(%rsi), %rdx
+; NO512-NEXT: movq 120(%rdi), %rdx
+; NO512-NEXT: xorq 120(%rsi), %rdx
+; NO512-NEXT: orq %rcx, %rdx
+; NO512-NEXT: movq 8(%rdi), %rcx
+; NO512-NEXT: xorq 8(%rsi), %rcx
+; NO512-NEXT: xorq 40(%rsi), %rax
+; NO512-NEXT: orq %r8, %rdx
+; NO512-NEXT: movq 72(%rdi), %r8
+; NO512-NEXT: xorq 72(%rsi), %r8
+; NO512-NEXT: orq %rcx, %r8
+; NO512-NEXT: movq 104(%rdi), %rcx
+; NO512-NEXT: xorq 104(%rsi), %rcx
+; NO512-NEXT: orq %rax, %rcx
+; NO512-NEXT: movq 48(%rdi), %rax
+; NO512-NEXT: orq %r8, %rcx
+; NO512-NEXT: movq 16(%rdi), %r8
+; NO512-NEXT: xorq 16(%rsi), %r8
+; NO512-NEXT: xorq 48(%rsi), %rax
+; NO512-NEXT: orq %rdx, %rcx
+; NO512-NEXT: movq 80(%rdi), %rdx
+; NO512-NEXT: xorq 80(%rsi), %rdx
+; NO512-NEXT: orq %r8, %rdx
+; NO512-NEXT: movq 112(%rdi), %r8
+; NO512-NEXT: xorq 112(%rsi), %r8
+; NO512-NEXT: orq %rax, %r8
+; NO512-NEXT: movq (%rdi), %rax
+; NO512-NEXT: xorq (%rsi), %rax
; NO512-NEXT: orq %rdx, %r8
-; NO512-NEXT: movq 16(%rdi), %rdx
-; NO512-NEXT: orq %r9, %r8
-; NO512-NEXT: movq 8(%rdi), %r9
-; NO512-NEXT: xorq 8(%rsi), %r9
-; NO512-NEXT: xorq 48(%rsi), %rcx
-; NO512-NEXT: orq %r9, %r8
-; NO512-NEXT: movq 112(%rdi), %r9
-; NO512-NEXT: xorq 112(%rsi), %r9
-; NO512-NEXT: orq %rcx, %r9
-; NO512-NEXT: movq 80(%rdi), %rcx
-; NO512-NEXT: xorq 80(%rsi), %rcx
-; NO512-NEXT: orq %rcx, %r9
-; NO512-NEXT: movq (%rdi), %rcx
-; NO512-NEXT: xorq 16(%rsi), %rdx
-; NO512-NEXT: xorq (%rsi), %rcx
-; NO512-NEXT: xorq 32(%rsi), %rax
-; NO512-NEXT: orq %rdx, %r9
-; NO512-NEXT: movq 96(%rdi), %rdx
-; NO512-NEXT: movq 64(%rdi), %rdi
-; NO512-NEXT: xorq 64(%rsi), %rdi
-; NO512-NEXT: xorq 96(%rsi), %rdx
+; NO512-NEXT: movq 64(%rdi), %rdx
+; NO512-NEXT: xorq 64(%rsi), %rdx
; NO512-NEXT: orq %rax, %rdx
-; NO512-NEXT: orq %rdi, %rdx
-; NO512-NEXT: orq %r9, %rdx
-; NO512-NEXT: orq %rcx, %rdx
+; NO512-NEXT: movq 32(%rdi), %rax
+; NO512-NEXT: xorq 32(%rsi), %rax
+; NO512-NEXT: movq 96(%rdi), %rdi
+; NO512-NEXT: xorq 96(%rsi), %rdi
+; NO512-NEXT: orq %rax, %rdi
+; NO512-NEXT: orq %rdx, %rdi
+; NO512-NEXT: orq %r8, %rdi
; NO512-NEXT: xorl %eax, %eax
-; NO512-NEXT: orq %r8, %rdx
+; NO512-NEXT: orq %rcx, %rdi
; NO512-NEXT: setne %al
; NO512-NEXT: retq
;
define i32 @eq_i512_pair(ptr %a, ptr %b) {
; NO512-LABEL: eq_i512_pair:
; NO512: # %bb.0:
-; NO512-NEXT: movq 32(%rdi), %rax
-; NO512-NEXT: movq 48(%rdi), %rcx
-; NO512-NEXT: movq 40(%rdi), %rdx
-; NO512-NEXT: movq 56(%rdi), %r8
-; NO512-NEXT: xorq 56(%rsi), %r8
-; NO512-NEXT: movq 120(%rdi), %r9
-; NO512-NEXT: xorq 120(%rsi), %r9
-; NO512-NEXT: orq %r8, %r9
+; NO512-NEXT: movq 40(%rdi), %rax
+; NO512-NEXT: movq 56(%rdi), %rcx
+; NO512-NEXT: movq 24(%rdi), %rdx
+; NO512-NEXT: xorq 24(%rsi), %rdx
+; NO512-NEXT: xorq 56(%rsi), %rcx
; NO512-NEXT: movq 88(%rdi), %r8
; NO512-NEXT: xorq 88(%rsi), %r8
-; NO512-NEXT: orq %r8, %r9
-; NO512-NEXT: movq 24(%rdi), %r8
-; NO512-NEXT: xorq 24(%rsi), %r8
-; NO512-NEXT: xorq 40(%rsi), %rdx
-; NO512-NEXT: orq %r8, %r9
-; NO512-NEXT: movq 104(%rdi), %r8
-; NO512-NEXT: xorq 104(%rsi), %r8
; NO512-NEXT: orq %rdx, %r8
-; NO512-NEXT: movq 72(%rdi), %rdx
-; NO512-NEXT: xorq 72(%rsi), %rdx
+; NO512-NEXT: movq 120(%rdi), %rdx
+; NO512-NEXT: xorq 120(%rsi), %rdx
+; NO512-NEXT: orq %rcx, %rdx
+; NO512-NEXT: movq 8(%rdi), %rcx
+; NO512-NEXT: xorq 8(%rsi), %rcx
+; NO512-NEXT: xorq 40(%rsi), %rax
+; NO512-NEXT: orq %r8, %rdx
+; NO512-NEXT: movq 72(%rdi), %r8
+; NO512-NEXT: xorq 72(%rsi), %r8
+; NO512-NEXT: orq %rcx, %r8
+; NO512-NEXT: movq 104(%rdi), %rcx
+; NO512-NEXT: xorq 104(%rsi), %rcx
+; NO512-NEXT: orq %rax, %rcx
+; NO512-NEXT: movq 48(%rdi), %rax
+; NO512-NEXT: orq %r8, %rcx
+; NO512-NEXT: movq 16(%rdi), %r8
+; NO512-NEXT: xorq 16(%rsi), %r8
+; NO512-NEXT: xorq 48(%rsi), %rax
+; NO512-NEXT: orq %rdx, %rcx
+; NO512-NEXT: movq 80(%rdi), %rdx
+; NO512-NEXT: xorq 80(%rsi), %rdx
+; NO512-NEXT: orq %r8, %rdx
+; NO512-NEXT: movq 112(%rdi), %r8
+; NO512-NEXT: xorq 112(%rsi), %r8
+; NO512-NEXT: orq %rax, %r8
+; NO512-NEXT: movq (%rdi), %rax
+; NO512-NEXT: xorq (%rsi), %rax
; NO512-NEXT: orq %rdx, %r8
-; NO512-NEXT: movq 16(%rdi), %rdx
-; NO512-NEXT: orq %r9, %r8
-; NO512-NEXT: movq 8(%rdi), %r9
-; NO512-NEXT: xorq 8(%rsi), %r9
-; NO512-NEXT: xorq 48(%rsi), %rcx
-; NO512-NEXT: orq %r9, %r8
-; NO512-NEXT: movq 112(%rdi), %r9
-; NO512-NEXT: xorq 112(%rsi), %r9
-; NO512-NEXT: orq %rcx, %r9
-; NO512-NEXT: movq 80(%rdi), %rcx
-; NO512-NEXT: xorq 80(%rsi), %rcx
-; NO512-NEXT: orq %rcx, %r9
-; NO512-NEXT: movq (%rdi), %rcx
-; NO512-NEXT: xorq 16(%rsi), %rdx
-; NO512-NEXT: xorq (%rsi), %rcx
-; NO512-NEXT: xorq 32(%rsi), %rax
-; NO512-NEXT: orq %rdx, %r9
-; NO512-NEXT: movq 96(%rdi), %rdx
-; NO512-NEXT: movq 64(%rdi), %rdi
-; NO512-NEXT: xorq 64(%rsi), %rdi
-; NO512-NEXT: xorq 96(%rsi), %rdx
+; NO512-NEXT: movq 64(%rdi), %rdx
+; NO512-NEXT: xorq 64(%rsi), %rdx
; NO512-NEXT: orq %rax, %rdx
-; NO512-NEXT: orq %rdi, %rdx
-; NO512-NEXT: orq %r9, %rdx
-; NO512-NEXT: orq %rcx, %rdx
+; NO512-NEXT: movq 32(%rdi), %rax
+; NO512-NEXT: xorq 32(%rsi), %rax
+; NO512-NEXT: movq 96(%rdi), %rdi
+; NO512-NEXT: xorq 96(%rsi), %rdi
+; NO512-NEXT: orq %rax, %rdi
+; NO512-NEXT: orq %rdx, %rdi
+; NO512-NEXT: orq %r8, %rdi
; NO512-NEXT: xorl %eax, %eax
-; NO512-NEXT: orq %r8, %rdx
+; NO512-NEXT: orq %rcx, %rdi
; NO512-NEXT: sete %al
; NO512-NEXT: retq
;
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
; ANY-NEXT: orq %r10, %rcx
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9
-; ANY-NEXT: orq %rcx, %r9
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
; ANY-NEXT: orq %r9, %rsi
+; ANY-NEXT: orq %rcx, %rsi
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
; ANY-NEXT: orq %rax, %rdx
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8
-; ANY-NEXT: orq %rdx, %r8
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
; ANY-NEXT: orq %r8, %rdi
+; ANY-NEXT: orq %rdx, %rdi
; ANY-NEXT: orq %rsi, %rdi
; ANY-NEXT: sete %al
; ANY-NEXT: retq
; ANY-NEXT: adcq $0, %rax
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9
+; ANY-NEXT: orq %rsi, %r9
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
; ANY-NEXT: orq %rcx, %rax
; ANY-NEXT: orq %r9, %rax
-; ANY-NEXT: orq %rsi, %rax
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10
; ANY-NEXT: orq %rdx, %r10
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8
-; ANY-NEXT: orq %r10, %r8
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
; ANY-NEXT: orq %r8, %rdi
+; ANY-NEXT: orq %r10, %rdi
; ANY-NEXT: orq %rax, %rdi
; ANY-NEXT: sete %al
; ANY-NEXT: retq
; ANY-NEXT: orq %r8, %r11
; ANY-NEXT: xorq 8(%rdi), %rdx
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
-; ANY-NEXT: orq %r11, %rax
; ANY-NEXT: orq %rdx, %rax
+; ANY-NEXT: orq %r11, %rax
; ANY-NEXT: xorq 32(%rdi), %r9
; ANY-NEXT: xorq (%rdi), %rsi
+; ANY-NEXT: orq %r9, %rsi
; ANY-NEXT: xorq 16(%rdi), %rcx
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10
; ANY-NEXT: orq %rcx, %r10
-; ANY-NEXT: orq %r9, %r10
; ANY-NEXT: orq %rsi, %r10
; ANY-NEXT: orq %rax, %r10
; ANY-NEXT: sete %al
; X64-NEXT: movl %edx, %eax
; X64-NEXT: shll $15, %edi
; X64-NEXT: shll $16, %eax
+; X64-NEXT: orl %esi, %edi
; X64-NEXT: orl %ecx, %eax
-; X64-NEXT: orl %esi, %eax
; X64-NEXT: orl %edi, %eax
; X64-NEXT: retq
%a.shifted = shl i32 %a, 15
; X64-NEXT: movl %edx, %eax
; X64-NEXT: shll $16, %edi
; X64-NEXT: shrl $16, %eax
+; X64-NEXT: orl %esi, %edi
; X64-NEXT: orl %ecx, %eax
-; X64-NEXT: orl %esi, %eax
; X64-NEXT: orl %edi, %eax
; X64-NEXT: retq
%a.shifted = shl i32 %a, 16
; X64: # %bb.0:
; X64-NEXT: pslld $16, %xmm0
; X64-NEXT: pslld $17, %xmm2
+; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: por %xmm3, %xmm2
-; X64-NEXT: por %xmm1, %xmm2
; X64-NEXT: por %xmm2, %xmm0
; X64-NEXT: retq
%a.shifted = shl <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $192, %esp
+; X86-NEXT: subl $188, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: negl %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl %ebp, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edi, %esi
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: setb %al
; X86-NEXT: addl %edx, %esi
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: setb (%esp) # 1-byte Folded Spill
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: setb %bl
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl %bl, %esi
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %esi
; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %eax, %esi
; X86-NEXT: setb %al
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: addl %ebp, %esi
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %ecx
; X86-NEXT: setb %al
-; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %al, %ebp
; X86-NEXT: adcl %edx, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %edi
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl %ebx, %ebp
; X86-NEXT: setb %cl
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movzbl %cl, %ebx
; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ebx
; X86-NEXT: addl %esi, %edx
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl %ebp, %edi
; X86-NEXT: adcl %ecx, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %edi, %esi
-; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: adcl %edi, %ecx
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl %eax, %esi
; X86-NEXT: movzbl %bl, %edi
; X86-NEXT: adcl %edx, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: adcl %ebx, %edi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: setb %al
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %al, %ebp
-; X86-NEXT: adcl %ebx, %ebp
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: setb %al
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ecx
-; X86-NEXT: xorl %edx, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: xorl %edx, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: xorl %edx, %ebp
; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: orl %ebp, %esi
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: xorl %edx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: xorl %edx, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: xorl %edx, %ebp
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: xorl %edx, %esi
; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %ebp, %edx
; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl %edi, %ecx
; X86-NEXT: andl $1, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: negl %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: xorl %eax, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: xorl %eax, %esi
; X86-NEXT: orl %ebx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: xorl %eax, %ebx
-; X86-NEXT: orl %esi, %ebx
; X86-NEXT: xorl %edi, %eax
; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl %edx, 12(%eax)
; X86-NEXT: movb %cl, 16(%eax)
; X86-NEXT: setne 20(%eax)
-; X86-NEXT: addl $192, %esp
+; X86-NEXT: addl $188, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X64-NEXT: xorq %rcx, %r10
; X64-NEXT: orq %rdx, %r10
; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: orq %r10, %rax
; X64-NEXT: xorq %rbx, %rcx
; X64-NEXT: orq %rax, %rcx
+; X64-NEXT: orq %r10, %rcx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movl %eax, %esi
; X64-NEXT: andl $1, %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %esi
; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT: addl %ecx, %esi
; X86-NEXT: addl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: imull %edx, %ecx
+; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: imull %edx, %ebx
; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull %ebx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
; X86-NEXT: addl %eax, %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: imull %ecx, %ebp
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: adcl %esi, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: xorl %edi, %edx
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: xorl %esi, %edx
-; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: xorl %ebp, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: notl %ebx
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: cmovel %ecx, %edi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: notl %ecx
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ebp, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: addl $12, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
; X64-NEXT: .cfi_offset %rbx, -32
; X64-NEXT: .cfi_offset %r14, -24
; X64-NEXT: .cfi_offset %r15, -16
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rdi, %r11
; X64-NEXT: movq %rsi, %rdi
; X64-NEXT: sarq $63, %rdi
-; X64-NEXT: movq %rcx, %rbx
-; X64-NEXT: imulq %rdi, %rbx
+; X64-NEXT: movq %rcx, %r9
+; X64-NEXT: imulq %rdi, %r9
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rax, %rbx
-; X64-NEXT: addq %rdx, %rbx
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rax, %rdi
+; X64-NEXT: addq %r9, %rdi
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: sarq $63, %rax
; X64-NEXT: movq %rax, %r15
; X64-NEXT: imulq %rsi, %r15
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rax, %r15
-; X64-NEXT: addq %rdx, %r15
-; X64-NEXT: addq %rdi, %r9
-; X64-NEXT: adcq %rbx, %r15
-; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: addq %r15, %r9
+; X64-NEXT: addq %rax, %r9
+; X64-NEXT: addq %r14, %r10
+; X64-NEXT: adcq %rdi, %r9
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %rdi
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rbx, %r14
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %r14, %r10
-; X64-NEXT: adcq %r11, %rbx
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r14, %r15
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %r15, %r11
+; X64-NEXT: adcq %rbx, %r14
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %r11d
+; X64-NEXT: movzbl %al, %ebx
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: adcq %r11, %rdx
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: adcq %r15, %rdx
-; X64-NEXT: movq %r10, 8(%r8)
-; X64-NEXT: sarq $63, %r10
-; X64-NEXT: xorq %r10, %rdx
-; X64-NEXT: xorq %rax, %r10
-; X64-NEXT: orq %rdx, %r10
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: adcq %rbx, %rdx
+; X64-NEXT: addq %r10, %rax
+; X64-NEXT: adcq %r9, %rdx
+; X64-NEXT: movq %r11, 8(%r8)
+; X64-NEXT: sarq $63, %r11
+; X64-NEXT: xorq %r11, %rdx
+; X64-NEXT: xorq %rax, %r11
+; X64-NEXT: orq %rdx, %r11
; X64-NEXT: setne %al
; X64-NEXT: movq %rdi, (%r8)
; X64-NEXT: popq %rbx
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: adcl %eax, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: adcl %ebp, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: adcl %eax, %edi
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %eax, %esi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: imull %eax, %edi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: adcl %edx, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: imull %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %ebp
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: xorl %eax, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: xorl %eax, %ebp
-; X86-NEXT: xorl %esi, %eax
-; X86-NEXT: orl %ebp, %eax
-; X86-NEXT: orl %edi, %eax
+; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: xorl %ecx, %ebp
+; X86-NEXT: orl %eax, %ebp
+; X86-NEXT: xorl %ecx, %edi
+; X86-NEXT: xorl %ebx, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl %ebp, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X64-NEXT: .cfi_offset %r14, -32
; X64-NEXT: .cfi_offset %r15, -24
; X64-NEXT: .cfi_offset %rbp, -16
-; X64-NEXT: movq %rcx, %r15
+; X64-NEXT: movq %rcx, %r13
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rsi, %r11
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rsi, %rbx
-; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r14
; X64-NEXT: addq %rbx, %r14
-; X64-NEXT: adcq %rcx, %r12
+; X64-NEXT: adcq %rsi, %r12
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %ecx
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movzbl %al, %r10d
+; X64-NEXT: movq %r13, %rax
; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: addq %r12, %rsi
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: adcq %r10, %rcx
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: adcq %r12, %rbx
-; X64-NEXT: setb %r8b
+; X64-NEXT: setb %r10b
; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %rbx, %rbp
-; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %r13
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT: addq %r10, %rbp
+; X64-NEXT: addq %r15, %rbp
; X64-NEXT: adcq %r14, %r13
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r14
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r15
; X64-NEXT: movq %r11, %rbx
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %r8, %r9
-; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rdi, %r10
+; X64-NEXT: adcq $0, %r9
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: adcq %r10, %r11
-; X64-NEXT: setb %cl
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r10, %r14
+; X64-NEXT: adcq %r9, %r11
+; X64-NEXT: setb %r10b
; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %r11, %r8
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: addq %rbp, %r14
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %r11, %r9
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %rdi
+; X64-NEXT: addq %rbp, %r15
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %r13, %r14
; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r13, %rdi
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: addq %rsi, %r8
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
-; X64-NEXT: setb %cl
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: addq %rsi, %r9
+; X64-NEXT: adcq %rcx, %rdi
+; X64-NEXT: setb %bl
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Reload
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rsi, %r9
-; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rcx, %r10
+; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: adcq %rdi, %r11
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: addq %r10, %rax
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: adcq %rsi, %r11
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r11, %r13
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %rbp
-; X64-NEXT: addq %r8, %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r10, %r9
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r11, %r15
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: adcq %rax, %r13
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %r15, %r8
-; X64-NEXT: sarq $63, %r8
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: mulq %r8
+; X64-NEXT: addq %r9, %rbp
+; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %rdi, %r10
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movzbl %bl, %eax
+; X64-NEXT: adcq %rax, %r15
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq %r14, %rdi
+; X64-NEXT: movq %r14, %rbp
+; X64-NEXT: sarq $63, %rdi
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq %rax, %rcx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %r9, %r10
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: addq %rsi, %r10
-; X64-NEXT: movq %rsi, %rdi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r9, %r14
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r8, %r9
-; X64-NEXT: imulq %r12, %r9
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %r9, %r8
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: addq %rcx, %r8
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %r9, %r10
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %rdi, %rsi
+; X64-NEXT: imulq %r12, %rsi
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq {{[0-9]+}}(%rsp)
-; X64-NEXT: addq %rax, %r9
-; X64-NEXT: addq %rdx, %r9
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: adcq %r10, %r9
-; X64-NEXT: addq %r11, %r14
-; X64-NEXT: movzbl %sil, %edi
-; X64-NEXT: adcq %rcx, %rdi
-; X64-NEXT: addq %rax, %r14
-; X64-NEXT: adcq %r9, %rdi
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: addq %rax, %rdx
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: adcq %r8, %rdx
+; X64-NEXT: addq %r11, %r10
+; X64-NEXT: movzbl %cl, %esi
+; X64-NEXT: adcq %r14, %rsi
+; X64-NEXT: addq %rax, %r10
+; X64-NEXT: adcq %rdx, %rsi
; X64-NEXT: sarq $63, %r12
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rsi
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rdx, %r14
; X64-NEXT: adcq $0, %r11
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: addq %rax, %r14
; X64-NEXT: adcq %rdx, %r11
; X64-NEXT: setb %bl
-; X64-NEXT: imulq %r12, %r15
+; X64-NEXT: imulq %r12, %rbp
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
; X64-NEXT: mulq %r12
-; X64-NEXT: addq %rax, %r15
-; X64-NEXT: addq %rdx, %r15
-; X64-NEXT: addq %rsi, %rax
-; X64-NEXT: adcq %rcx, %r15
+; X64-NEXT: addq %rax, %rdx
+; X64-NEXT: addq %rbp, %rdx
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: adcq %r14, %rdx
; X64-NEXT: addq %r9, %r11
-; X64-NEXT: movzbl %bl, %edx
-; X64-NEXT: adcq %r8, %rdx
+; X64-NEXT: movzbl %bl, %r9d
+; X64-NEXT: adcq %rdi, %r9
; X64-NEXT: addq %rax, %r11
-; X64-NEXT: adcq %r15, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT: adcq %r10, %rcx
-; X64-NEXT: adcq %r14, %r11
-; X64-NEXT: adcq %rdi, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT: adcq %r13, %r11
-; X64-NEXT: adcq %rbp, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: adcq %rdx, %r9
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT: adcq %r8, %r14
+; X64-NEXT: adcq %r10, %r11
+; X64-NEXT: adcq %rsi, %r9
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
+; X64-NEXT: adcq %r15, %r11
+; X64-NEXT: adcq %r13, %r9
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; X64-NEXT: movq %rdx, %rax
; X64-NEXT: sarq $63, %rax
-; X64-NEXT: xorq %rax, %rdx
-; X64-NEXT: xorq %rax, %rcx
-; X64-NEXT: orq %rdx, %rcx
+; X64-NEXT: xorq %rax, %r9
+; X64-NEXT: xorq %rax, %r14
+; X64-NEXT: orq %r9, %r14
; X64-NEXT: xorq %rax, %r11
-; X64-NEXT: xorq %rsi, %rax
+; X64-NEXT: xorq %rcx, %rax
; X64-NEXT: orq %r11, %rax
-; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: orq %r14, %rax
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq %rdi, 24(%rax)
+; X64-NEXT: movq %rdx, 24(%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
; X64-NEXT: movq %rcx, (%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb %bl
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebp
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: addl %edi, %ecx
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: adcl %ebp, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ebp
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl %ebp, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: addl %edi, %ecx
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: adcl %ebp, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: addl %esi, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: adcl %eax, %edi
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload
; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ebx
; X86-NEXT: adcl $0, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb %bl
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ebp
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebp, %esi
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: adcl %ebp, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ebp
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: adcl %ebp, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl %edi, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: adcl $0, %eax
; X86-NEXT: adcl $0, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb %bl
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %edi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb %bl
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: addl %edi, %ecx
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: adcl %ebx, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: adcl %eax, %edi
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl (%esp), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: adcl %ebx, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT: adcl %esi, %eax
; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %esi
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: sarl $31, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %ebx
; X86-NEXT: setb %al
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill
; X86-NEXT: addl %edi, %edx
-; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %ebp, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %eax, %edi
; X86-NEXT: setb %al
-; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: addl %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %edx, %ebp
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %edx, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: imull %ecx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: imull %ebx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: addl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: adcl %ebp, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: addl %eax, %esi
-; X86-NEXT: adcl %ebx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: addl %eax, %edx
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: adcl %ebx, %eax
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: adcl %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl (%esp), %ebx ## 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl (%esp), %edx ## 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movl %ebx, %esi
; X86-NEXT: adcl %ebp, %esi
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: adcl %edx, %esi
; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %edi
+; X86-NEXT: addl %eax, %esi
; X86-NEXT: movzbl %bl, %ebx
; X86-NEXT: adcl %edx, %ebx
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: adcl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %edx, %ebp
-; X86-NEXT: setb %al
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: setb %cl
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %ebp, %edx
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: setb %dl
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %dl, %ecx
-; X86-NEXT: adcl %ebp, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: adcl $0, %eax
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: setb %al
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: adcl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: addl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: setb %al
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl %esi, %eax
+; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: imull %ebp, %esi
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: adcl %ebp, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: adcl %edx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %ecx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: addl %eax, %ecx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: sarl $31, %eax
+; X86-NEXT: xorl %eax, %edx
; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: orl %edx, %edi
; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl %edi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: xorl %eax, %ebp
-; X86-NEXT: orl %edx, %ebp
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: xorl %eax, %ebx
; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: orl %ebp, %eax
-; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebx, 28(%eax)
+; X86-NEXT: movl %ebp, 28(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; WIN32: # %bb.0:
; WIN32-NEXT: pushl %ebp
; WIN32-NEXT: pushl %ebx
-; WIN32-NEXT: subl $16, %esp
+; WIN32-NEXT: subl $12, %esp
; WIN32-NEXT: movl %esi, (%esp) # 4-byte Spill
; WIN32-NEXT: movl %edi, %esi
; WIN32-NEXT: movl %edx, %ebx
; WIN32-NEXT: subl %esi, %ebx
; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: subl %ecx, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl %ebp, %ecx
; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: imull %eax, %ecx
-; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: movl %esi, %edx
-; WIN32-NEXT: subl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: imull %ebx, %edx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: imull %ebx, %eax
+; WIN32-NEXT: addl %ecx, %eax
; WIN32-NEXT: movl (%esp), %ebx # 4-byte Reload
-; WIN32-NEXT: subl %ebp, %ebx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: movl %edx, %ecx
; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: imull %ebx, %ecx
-; WIN32-NEXT: addl %edx, %ecx
+; WIN32-NEXT: addl %eax, %ecx
; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; WIN32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: imull %edx, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: imull %ebp, %edi
; WIN32-NEXT: addl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; WIN32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: imull %ebp, %eax
-; WIN32-NEXT: addl %esi, %eax
-; WIN32-NEXT: addl %eax, %edi
+; WIN32-NEXT: addl %esi, %edi
+; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: imull %eax, %edx
+; WIN32-NEXT: addl %edx, %edi
; WIN32-NEXT: addl %ecx, %edi
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; WIN32-NEXT: movl %edi, %eax
-; WIN32-NEXT: addl $16, %esp
+; WIN32-NEXT: addl $12, %esp
; WIN32-NEXT: popl %ebx
; WIN32-NEXT: popl %ebp
; WIN32-NEXT: retl
; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11
; WIN64-NEXT: subl %r12d, %r11d
; WIN64-NEXT: imull %edx, %r11d
+; WIN64-NEXT: addl %r9d, %r11d
; WIN64-NEXT: leal (%r14,%r15), %edx
-; WIN64-NEXT: # kill: def $r14d killed $r14d killed $r14
-; WIN64-NEXT: subl %r15d, %r14d
-; WIN64-NEXT: imull %esi, %r14d
-; WIN64-NEXT: addl %r11d, %r14d
+; WIN64-NEXT: movl %r14d, %r9d
+; WIN64-NEXT: subl %r15d, %r9d
+; WIN64-NEXT: imull %esi, %r9d
+; WIN64-NEXT: addl %r11d, %r9d
; WIN64-NEXT: addl %ecx, %eax
; WIN64-NEXT: imull %r8d, %eax
; WIN64-NEXT: imull %ebx, %r10d
+; WIN64-NEXT: addl %r10d, %eax
; WIN64-NEXT: imull %edi, %edx
-; WIN64-NEXT: addl %r10d, %edx
; WIN64-NEXT: addl %edx, %eax
-; WIN64-NEXT: addl %r14d, %eax
; WIN64-NEXT: addl %r9d, %eax
; WIN64-NEXT: popq %rbx
; WIN64-NEXT: retq
; LINUXOSX-NEXT: leal (%r13,%r14), %r11d
; LINUXOSX-NEXT: movl %r13d, %r12d
; LINUXOSX-NEXT: subl %r14d, %r12d
-; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %r14d
; LINUXOSX-NEXT: imull %edx, %r12d
-; LINUXOSX-NEXT: movl %r15d, %edx
-; LINUXOSX-NEXT: subl %r14d, %edx
-; LINUXOSX-NEXT: imull %esi, %edx
-; LINUXOSX-NEXT: addl %r12d, %edx
+; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; LINUXOSX-NEXT: addl %r9d, %r12d
+; LINUXOSX-NEXT: movl %r15d, %r9d
+; LINUXOSX-NEXT: subl %edx, %r9d
+; LINUXOSX-NEXT: imull %esi, %r9d
+; LINUXOSX-NEXT: addl %r12d, %r9d
; LINUXOSX-NEXT: addl %ecx, %eax
; LINUXOSX-NEXT: imull %r8d, %eax
; LINUXOSX-NEXT: imull %r10d, %r11d
-; LINUXOSX-NEXT: addl %r15d, %r14d
-; LINUXOSX-NEXT: imull %edi, %r14d
-; LINUXOSX-NEXT: addl %r11d, %r14d
-; LINUXOSX-NEXT: addl %r14d, %eax
+; LINUXOSX-NEXT: addl %r11d, %eax
+; LINUXOSX-NEXT: addl %r15d, %edx
+; LINUXOSX-NEXT: imull %edi, %edx
; LINUXOSX-NEXT: addl %edx, %eax
; LINUXOSX-NEXT: addl %r9d, %eax
; LINUXOSX-NEXT: retq
; CHECK-X64-NEXT: .cfi_def_cfa_offset 71888
; CHECK-X64-NEXT: .cfi_offset %rax, -16
; CHECK-X64-NEXT: movl 71888(%rsp), %eax
+; CHECK-X64-NEXT: addl %esi, %edi
; CHECK-X64-NEXT: addl %ecx, %edx
+; CHECK-X64-NEXT: addl %edi, %edx
+; CHECK-X64-NEXT: addl %r9d, %r8d
; CHECK-X64-NEXT: addl 71896(%rsp), %eax
-; CHECK-X64-NEXT: addl %esi, %edx
-; CHECK-X64-NEXT: addl %r9d, %eax
; CHECK-X64-NEXT: addl %r8d, %eax
; CHECK-X64-NEXT: addl %edx, %eax
-; CHECK-X64-NEXT: addl %edi, %eax
; CHECK-X64-NEXT: movl %eax, 264(%rsp)
; CHECK-X64-NEXT: movl %eax, 28664(%rsp)
; CHECK-X64-NEXT: addq $71872, %rsp # imm = 0x118C0
; CHECK-X86-NEXT: .cfi_offset %edx, -12
; CHECK-X86-NEXT: .cfi_offset %esi, -8
; CHECK-X86-NEXT: movl 72056(%esp), %eax
-; CHECK-X86-NEXT: movl 72048(%esp), %ecx
-; CHECK-X86-NEXT: movl 72040(%esp), %edx
+; CHECK-X86-NEXT: movl 72048(%esp), %edx
+; CHECK-X86-NEXT: movl 72040(%esp), %ecx
; CHECK-X86-NEXT: movl 72032(%esp), %esi
; CHECK-X86-NEXT: addl 72036(%esp), %esi
-; CHECK-X86-NEXT: addl 72044(%esp), %edx
-; CHECK-X86-NEXT: addl 72052(%esp), %ecx
+; CHECK-X86-NEXT: addl 72044(%esp), %ecx
+; CHECK-X86-NEXT: addl %esi, %ecx
+; CHECK-X86-NEXT: addl 72052(%esp), %edx
; CHECK-X86-NEXT: addl 72060(%esp), %eax
-; CHECK-X86-NEXT: addl %ecx, %eax
; CHECK-X86-NEXT: addl %edx, %eax
-; CHECK-X86-NEXT: addl %esi, %eax
+; CHECK-X86-NEXT: addl %ecx, %eax
; CHECK-X86-NEXT: movl %eax, 392(%esp)
; CHECK-X86-NEXT: movl %eax, 28792(%esp)
; CHECK-X86-NEXT: addl $72012, %esp # imm = 0x1194C
; CHECK-X32-NEXT: .cfi_def_cfa_offset 71888
; CHECK-X32-NEXT: .cfi_offset %rax, -16
; CHECK-X32-NEXT: movl 71888(%esp), %eax
+; CHECK-X32-NEXT: addl %esi, %edi
; CHECK-X32-NEXT: addl %ecx, %edx
+; CHECK-X32-NEXT: addl %edi, %edx
+; CHECK-X32-NEXT: addl %r9d, %r8d
; CHECK-X32-NEXT: addl 71896(%esp), %eax
-; CHECK-X32-NEXT: addl %esi, %edx
-; CHECK-X32-NEXT: addl %r9d, %eax
; CHECK-X32-NEXT: addl %r8d, %eax
; CHECK-X32-NEXT: addl %edx, %eax
-; CHECK-X32-NEXT: addl %edi, %eax
; CHECK-X32-NEXT: movl %eax, 264(%esp)
; CHECK-X32-NEXT: movl %eax, 28664(%esp)
; CHECK-X32-NEXT: addl $71872, %esp # imm = 0x118C0
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl %edi, %ebp
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edx, %r12d
-; CHECK-NEXT: movl %ecx, %r13d
-; CHECK-NEXT: movl %r8d, %r14d
-; CHECK-NEXT: movl %r9d, %r15d
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %edx, %r14d
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %r8d, %r12d
+; CHECK-NEXT: movl %r9d, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: callq _bar ## 160-byte Folded Reload
; CHECK-NEXT: Ltmp13:
-; CHECK-NEXT: addq %r12, %rbx
-; CHECK-NEXT: addq %r13, %rbx
-; CHECK-NEXT: addq %r14, %rbx
-; CHECK-NEXT: addq %r15, %rbx
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
+; CHECK-NEXT: addq %rax, %r14
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload
+; CHECK-NEXT: addq %r14, %r12
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r12
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r15
+; CHECK-NEXT: addq %r12, %r15
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: addq %rax, %rbx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %r15, %rbx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rbp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rbp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rbp
+; CHECK-NEXT: addq %rbx, %rbp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r13
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r13
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r13
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r13
+; CHECK-NEXT: addq %rbp, %r13
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: addq %rbp, %rbx
-; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: addq %r13, %rcx
+; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: addq $168, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl %edi, %ebp
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edx, %r12d
-; CHECK-NEXT: movl %ecx, %r13d
-; CHECK-NEXT: movl %r8d, %r14d
-; CHECK-NEXT: movl %r9d, %r15d
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %edx, %r14d
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %r8d, %r12d
+; CHECK-NEXT: movl %r9d, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: callq _bar ## 160-byte Folded Reload
; CHECK-NEXT: Ltmp14:
-; CHECK-NEXT: addq %r12, %rbx
-; CHECK-NEXT: addq %r13, %rbx
-; CHECK-NEXT: addq %r14, %rbx
-; CHECK-NEXT: addq %r15, %rbx
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
+; CHECK-NEXT: addq %rax, %r14
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload
+; CHECK-NEXT: addq %r14, %r12
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r12
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r15
+; CHECK-NEXT: addq %r12, %r15
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: addq %rax, %rbx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %r15, %rbx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rbp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rbp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rbp
+; CHECK-NEXT: addq %rbx, %rbp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r13
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r13
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r13
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r13
+; CHECK-NEXT: addq %rbp, %r13
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %rcx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: addq %rbp, %rbx
-; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: addq %r13, %rcx
+; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: addq $168, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; CHECK-NEXT: callq gen3@PLT
-; CHECK-NEXT: # kill: def $edx killed $edx def $rdx
; CHECK-NEXT: # kill: def $ecx killed $ecx def $rcx
-; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: # kill: def $r8d killed $r8d def $r8
+; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: addl %r8d, %ecx
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: addsd %xmm1, %xmm0
; CHECK-NEXT: addsd %xmm2, %xmm0
; CHECK-NEXT: addsd %xmm3, %xmm0
-; CHECK-NEXT: addq %rdx, %rcx
+; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: addq %r8, %rcx
; CHECK-NEXT: addq %rcx, %rax
; CHECK-NEXT: popq %rcx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: adcl %edi, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %ebx, %esi
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ecx
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT: setb (%esp) # 1-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: setb (%esp) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload
; X86-NEXT: adcl %esi, %edx
; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: setb %cl
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzbl %bl, %ecx
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ebp, %ebx
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: addl %ecx, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: imull %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movl %eax, %edx
; X86-NEXT: addl %esi, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: adcl %ebx, %edi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %esi, %ecx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %edi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: imull %ecx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: addl %esi, %ecx
; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: imull %edx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edx
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %edx, %esi
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
; X86-NEXT: addl %ebp, %esi
; X86-NEXT: addl %ecx, %edi
; X86-NEXT: adcl %ebx, %esi
; X86-NEXT: movl %esi, 8(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, 12(%edx)
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, 16(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, 16(%edx)
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
; X86-NEXT: movl %esi, 20(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, 24(%edx)
; X64-NEXT: pushq %rbx
; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %r8, %r11
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %r10
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r10
; X64-NEXT: movq %r10, %rbp
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: adcq %r14, %r12
; X64-NEXT: setb %al
; X64-NEXT: movzbl %al, %r10d
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %r12, %r13
; X64-NEXT: adcq %r10, %r15
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r14
; X64-NEXT: movq %rax, %r10
; X64-NEXT: addq %r12, %r10
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rax, %r12
; X64-NEXT: addq %r10, %r12
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: mulq %r10
; X64-NEXT: addq %rbp, %rax
; X64-NEXT: adcq %r13, %rdx
-; X64-NEXT: imulq %r10, %r8
-; X64-NEXT: addq %rdx, %r8
+; X64-NEXT: imulq %r10, %rcx
+; X64-NEXT: addq %rdx, %rcx
; X64-NEXT: addq %r14, %r15
; X64-NEXT: adcq %r12, %rax
-; X64-NEXT: adcq %r11, %r8
-; X64-NEXT: imulq %r9, %rcx
+; X64-NEXT: adcq %r11, %rcx
+; X64-NEXT: imulq %r9, %r8
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx
; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %esi
; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT: addl %ecx, %esi
; X86-NEXT: addl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: retl
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: testl %esi, %esi
; X86-NEXT: setne %dl
; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %bl
-; X86-NEXT: andb %dl, %bl
-; X86-NEXT: mull %ebp
+; X86-NEXT: setne %cl
+; X86-NEXT: andb %dl, %cl
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: seto %bh
+; X86-NEXT: seto %bl
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: seto %cl
-; X86-NEXT: orb %bh, %cl
-; X86-NEXT: leal (%edi,%eax), %esi
-; X86-NEXT: movl %edx, %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: setb %ch
-; X86-NEXT: orb %cl, %ch
+; X86-NEXT: seto %ch
; X86-NEXT: orb %bl, %ch
+; X86-NEXT: orb %cl, %ch
+; X86-NEXT: leal (%edi,%eax), %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: setb %cl
+; X86-NEXT: orb %ch, %cl
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: cmovnel %ecx, %edx
; X64-NEXT: mulq %rdi
; X64-NEXT: seto %r11b
; X64-NEXT: orb %r10b, %r11b
+; X64-NEXT: orb %r9b, %r11b
; X64-NEXT: leaq (%rsi,%rax), %rcx
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r8
; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: setb %cl
; X64-NEXT: orb %r11b, %cl
-; X64-NEXT: orb %r9b, %cl
; X64-NEXT: retq
;
; X86-LABEL: muloti_test:
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: seto (%esp) # 1-byte Folded Spill
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %esi
; X86-NEXT: leal (%ecx,%eax), %ecx
-; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: seto %bh
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: leal (%ecx,%eax), %ecx
-; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: seto %bl
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ecx, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ebp, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl %edi, %edx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: testl %ebp, %ebp
; X86-NEXT: testl %esi, %esi
; X86-NEXT: setne %ch
; X86-NEXT: andb %cl, %ch
-; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: orb %ch, %cl
-; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: orb (%esp), %bh # 1-byte Folded Reload
+; X86-NEXT: orb %ch, %bh
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; X86-NEXT: movb %bh, (%esp) # 1-byte Spill
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: setne %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: testl %edi, %edi
-; X86-NEXT: setne %bh
-; X86-NEXT: andb %cl, %bh
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: setne %ch
+; X86-NEXT: andb %cl, %ch
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
+; X86-NEXT: orb %ch, %bl
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
; X86-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: setne %bl
+; X86-NEXT: setne %bh
; X86-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %eax, 8(%ecx)
; X86-NEXT: movl %edx, 12(%ecx)
; X86-NEXT: setne %al
-; X86-NEXT: andb %bl, %al
+; X86-NEXT: andb %bh, %al
+; X86-NEXT: orb %bl, %al
+; X86-NEXT: orb (%esp), %al # 1-byte Folded Reload
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; X86-NEXT: orb %al, %bh
-; X86-NEXT: andb $1, %bh
-; X86-NEXT: movb %bh, 16(%ecx)
+; X86-NEXT: andb $1, %al
+; X86-NEXT: movb %al, 16(%ecx)
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl $24, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: testl %esi, %esi
; X86-NEXT: setne %dl
; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %bl
-; X86-NEXT: andb %dl, %bl
-; X86-NEXT: mull %ebp
+; X86-NEXT: setne %cl
+; X86-NEXT: andb %dl, %cl
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: seto %bh
+; X86-NEXT: seto %bl
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: mull %ebp
; X86-NEXT: seto %ch
-; X86-NEXT: orb %bh, %ch
+; X86-NEXT: orb %bl, %ch
+; X86-NEXT: orb %cl, %ch
; X86-NEXT: leal (%edi,%eax), %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %esi, %edx
; X86-NEXT: setb %cl
; X86-NEXT: orb %ch, %cl
-; X86-NEXT: orb %bl, %cl
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: popl %edi
define i1 @t64_3_2(i64 %X) nounwind {
; X86-LABEL: t64_3_2:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %esi # imm = 0xAAAAAAAB
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: addl %edx, %esi
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %edx # imm = 0xAAAAAAAB
+; X86-NEXT: addl %ecx, %edx
; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
-; X86-NEXT: adcl $-1431655766, %esi # imm = 0xAAAAAAAA
+; X86-NEXT: adcl $-1431655766, %edx # imm = 0xAAAAAAAA
; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT: sbbl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT: sbbl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: setb %al
-; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: t64_3_2:
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X32-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X32-NEXT: vandps %ymm2, %ymm1, %ymm1
; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X32-NEXT: vandps %ymm0, %ymm2, %ymm0
; X32-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
-; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X32-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X32-NEXT: vandps %ymm0, %ymm2, %ymm0
; X32-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X32-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X32-NEXT: vorps %ymm2, %ymm1, %ymm1
; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X32-NEXT: vorps %ymm0, %ymm2, %ymm0
; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X64-NEXT: vorps %ymm2, %ymm1, %ymm1
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vorps %ymm0, %ymm2, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-NEXT: vorps %ymm3, %ymm2, %ymm2
-; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
-; X32-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X32-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X32-NEXT: vorps %ymm0, %ymm2, %ymm0
; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-NEXT: vorps %ymm3, %ymm2, %ymm2
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; X64-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X64-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: vorps %ymm0, %ymm2, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X32-NEXT: vorps %ymm1, %ymm2, %ymm1
+; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X32-NEXT: vandps %ymm0, %ymm3, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm2, %ymm0
; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
-; X32-NEXT: vorps %ymm0, %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: five_or_and:
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT: vorps %ymm1, %ymm2, %ymm1
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm3, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
-; X64-NEXT: vorps %ymm0, %ymm2, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: five_or_and:
; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0
-; X32-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0
+; X32-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
-; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: five_or_and:
; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0
-; X64-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0
+; X64-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
-; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
; X32-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm4
+; X32-NEXT: vxorps %ymm3, %ymm2, %ymm2
+; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X32-NEXT: vandps %ymm0, %ymm4, %ymm0
-; X32-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm3, %ymm0
; X32-NEXT: vxorps %ymm0, %ymm2, %ymm0
; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4
+; X64-NEXT: vxorps %ymm3, %ymm2, %ymm2
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm4, %ymm0
-; X64-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm3, %ymm0
; X64-NEXT: vxorps %ymm0, %ymm2, %ymm0
; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm5, %ymm0, %ymm0
-; X32-AVX2-NEXT: vandps %ymm0, %ymm4, %ymm0
-; X32-AVX2-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; X32-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0
; X32-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0
; X32-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm5, %ymm0, %ymm0
-; X64-AVX2-NEXT: vandps %ymm0, %ymm4, %ymm0
-; X64-AVX2-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; X64-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0
; X64-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
; X32-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm4
-; X32-NEXT: vandps %ymm4, %ymm3, %ymm3
; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X32-NEXT: vxorps %ymm1, %ymm3, %ymm1
; X32-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X32-NEXT: vxorps %ymm1, %ymm2, %ymm1
; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4
-; X64-NEXT: vandps %ymm4, %ymm3, %ymm3
; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT: vxorps %ymm1, %ymm3, %ymm1
; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vxorps %ymm1, %ymm2, %ymm1
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4
-; X32-AVX2-NEXT: vandps %ymm4, %ymm3, %ymm3
; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1
; X32-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vxorps %ymm1, %ymm2, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4
-; X64-AVX2-NEXT: vandps %ymm4, %ymm3, %ymm3
; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1
; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vxorps %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq %r8, %r14
-; SSE2-NEXT: movq %rcx, %r13
+; SSE2-NEXT: movq %r8, %r15
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: movq %rsi, %r11
; SSE2-NEXT: movq %rdi, %r10
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT: movq %r11, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movq %r9, %r15
-; SSE2-NEXT: imulq %rcx, %r15
-; SSE2-NEXT: movq %r14, %rax
-; SSE2-NEXT: mulq %rcx
-; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: addq %rax, %r15
-; SSE2-NEXT: addq %rdx, %r15
+; SSE2-NEXT: movq %r11, %rdi
+; SSE2-NEXT: sarq $63, %rdi
+; SSE2-NEXT: movq %r9, %rbx
+; SSE2-NEXT: imulq %rdi, %rbx
+; SSE2-NEXT: movq %r15, %rax
+; SSE2-NEXT: mulq %rdi
+; SSE2-NEXT: movq %rdx, %rdi
+; SSE2-NEXT: movq %rax, %r12
+; SSE2-NEXT: addq %rax, %rdi
+; SSE2-NEXT: addq %rbx, %rdi
; SSE2-NEXT: movq %r9, %rax
; SSE2-NEXT: sarq $63, %rax
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: imulq %r11, %rcx
+; SSE2-NEXT: movq %rax, %r13
+; SSE2-NEXT: imulq %r11, %r13
; SSE2-NEXT: mulq %r10
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: addq %rax, %rcx
-; SSE2-NEXT: addq %rdx, %rcx
-; SSE2-NEXT: addq %rdi, %rbx
-; SSE2-NEXT: adcq %r15, %rcx
+; SSE2-NEXT: movq %rax, %r14
+; SSE2-NEXT: movq %rdx, %rbx
+; SSE2-NEXT: addq %r13, %rbx
+; SSE2-NEXT: addq %rax, %rbx
+; SSE2-NEXT: addq %r12, %r14
+; SSE2-NEXT: adcq %rdi, %rbx
; SSE2-NEXT: movq %r10, %rax
-; SSE2-NEXT: mulq %r14
-; SSE2-NEXT: movq %rdx, %r15
+; SSE2-NEXT: mulq %r15
+; SSE2-NEXT: movq %rdx, %r12
; SSE2-NEXT: movq %rax, %rdi
; SSE2-NEXT: movq %r11, %rax
-; SSE2-NEXT: mulq %r14
-; SSE2-NEXT: movq %rdx, %r14
-; SSE2-NEXT: movq %rax, %r12
-; SSE2-NEXT: addq %r15, %r12
-; SSE2-NEXT: adcq $0, %r14
+; SSE2-NEXT: mulq %r15
+; SSE2-NEXT: movq %rdx, %r15
+; SSE2-NEXT: movq %rax, %r13
+; SSE2-NEXT: addq %r12, %r13
+; SSE2-NEXT: adcq $0, %r15
; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %r9
-; SSE2-NEXT: movq %rdx, %r15
+; SSE2-NEXT: movq %rdx, %r12
; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: addq %r12, %r10
-; SSE2-NEXT: adcq %r14, %r15
+; SSE2-NEXT: addq %r13, %r10
+; SSE2-NEXT: adcq %r15, %r12
; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %r14d
+; SSE2-NEXT: movzbl %al, %r15d
; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %r9
-; SSE2-NEXT: addq %r15, %rax
-; SSE2-NEXT: adcq %r14, %rdx
-; SSE2-NEXT: addq %rbx, %rax
-; SSE2-NEXT: adcq %rcx, %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT: movq %r10, 8(%r15)
+; SSE2-NEXT: addq %r12, %rax
+; SSE2-NEXT: adcq %r15, %rdx
+; SSE2-NEXT: addq %r14, %rax
+; SSE2-NEXT: adcq %rbx, %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE2-NEXT: movq %r10, 8(%r12)
; SSE2-NEXT: sarq $63, %r10
; SSE2-NEXT: xorq %r10, %rdx
; SSE2-NEXT: xorq %rax, %r10
-; SSE2-NEXT: xorl %ecx, %ecx
+; SSE2-NEXT: xorl %r15d, %r15d
; SSE2-NEXT: orq %rdx, %r10
-; SSE2-NEXT: setne %cl
-; SSE2-NEXT: movq %r13, %r9
+; SSE2-NEXT: setne %r15b
+; SSE2-NEXT: movq %rcx, %r9
; SSE2-NEXT: sarq $63, %r9
; SSE2-NEXT: movq %rbp, %r11
; SSE2-NEXT: imulq %r9, %r11
; SSE2-NEXT: movq %rsi, %rax
; SSE2-NEXT: mulq %r9
-; SSE2-NEXT: movq %rax, %r9
-; SSE2-NEXT: addq %rax, %r11
-; SSE2-NEXT: addq %rdx, %r11
+; SSE2-NEXT: movq %rdx, %r9
+; SSE2-NEXT: movq %rax, %r10
+; SSE2-NEXT: addq %rax, %r9
+; SSE2-NEXT: addq %r11, %r9
; SSE2-NEXT: movq %rbp, %rax
; SSE2-NEXT: sarq $63, %rax
; SSE2-NEXT: movq %rax, %r14
-; SSE2-NEXT: imulq %r13, %r14
+; SSE2-NEXT: imulq %rcx, %r14
; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: addq %rax, %r14
-; SSE2-NEXT: addq %rdx, %r14
-; SSE2-NEXT: addq %r9, %r10
-; SSE2-NEXT: adcq %r11, %r14
+; SSE2-NEXT: movq %rax, %r11
+; SSE2-NEXT: movq %rdx, %rbx
+; SSE2-NEXT: addq %r14, %rbx
+; SSE2-NEXT: addq %rax, %rbx
+; SSE2-NEXT: addq %r10, %r11
+; SSE2-NEXT: adcq %r9, %rbx
; SSE2-NEXT: movq %r8, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rdx, %r9
-; SSE2-NEXT: movq %rax, %r11
-; SSE2-NEXT: movq %r13, %rax
+; SSE2-NEXT: movq %rax, %r10
+; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: addq %r9, %rbx
+; SSE2-NEXT: movq %rax, %r14
+; SSE2-NEXT: addq %r9, %r14
; SSE2-NEXT: adcq $0, %rsi
; SSE2-NEXT: movq %r8, %rax
; SSE2-NEXT: mulq %rbp
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: movq %rax, %r9
-; SSE2-NEXT: addq %rbx, %r9
+; SSE2-NEXT: addq %r14, %r9
; SSE2-NEXT: adcq %rsi, %r8
; SSE2-NEXT: setb %al
; SSE2-NEXT: movzbl %al, %esi
-; SSE2-NEXT: movq %r13, %rax
+; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rbp
; SSE2-NEXT: addq %r8, %rax
; SSE2-NEXT: adcq %rsi, %rdx
-; SSE2-NEXT: addq %r10, %rax
-; SSE2-NEXT: adcq %r14, %rdx
-; SSE2-NEXT: movq %r9, 24(%r15)
+; SSE2-NEXT: addq %r11, %rax
+; SSE2-NEXT: adcq %rbx, %rdx
+; SSE2-NEXT: movq %r9, 24(%r12)
; SSE2-NEXT: sarq $63, %r9
; SSE2-NEXT: xorq %r9, %rdx
; SSE2-NEXT: xorq %rax, %r9
; SSE2-NEXT: setne %al
; SSE2-NEXT: negl %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: negl %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: negl %r15d
+; SSE2-NEXT: movd %r15d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %r11, 16(%r15)
-; SSE2-NEXT: movq %rdi, (%r15)
+; SSE2-NEXT: movq %r10, 16(%r12)
+; SSE2-NEXT: movq %rdi, (%r12)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
; SSSE3-NEXT: pushq %r13
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq %r8, %r14
-; SSSE3-NEXT: movq %rcx, %r13
+; SSSE3-NEXT: movq %r8, %r15
; SSSE3-NEXT: movq %rdx, %r8
; SSSE3-NEXT: movq %rsi, %r11
; SSSE3-NEXT: movq %rdi, %r10
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSSE3-NEXT: movq %r11, %rcx
-; SSSE3-NEXT: sarq $63, %rcx
-; SSSE3-NEXT: movq %r9, %r15
-; SSSE3-NEXT: imulq %rcx, %r15
-; SSSE3-NEXT: movq %r14, %rax
-; SSSE3-NEXT: mulq %rcx
-; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: addq %rax, %r15
-; SSSE3-NEXT: addq %rdx, %r15
+; SSSE3-NEXT: movq %r11, %rdi
+; SSSE3-NEXT: sarq $63, %rdi
+; SSSE3-NEXT: movq %r9, %rbx
+; SSSE3-NEXT: imulq %rdi, %rbx
+; SSSE3-NEXT: movq %r15, %rax
+; SSSE3-NEXT: mulq %rdi
+; SSSE3-NEXT: movq %rdx, %rdi
+; SSSE3-NEXT: movq %rax, %r12
+; SSSE3-NEXT: addq %rax, %rdi
+; SSSE3-NEXT: addq %rbx, %rdi
; SSSE3-NEXT: movq %r9, %rax
; SSSE3-NEXT: sarq $63, %rax
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: imulq %r11, %rcx
+; SSSE3-NEXT: movq %rax, %r13
+; SSSE3-NEXT: imulq %r11, %r13
; SSSE3-NEXT: mulq %r10
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: addq %rax, %rcx
-; SSSE3-NEXT: addq %rdx, %rcx
-; SSSE3-NEXT: addq %rdi, %rbx
-; SSSE3-NEXT: adcq %r15, %rcx
+; SSSE3-NEXT: movq %rax, %r14
+; SSSE3-NEXT: movq %rdx, %rbx
+; SSSE3-NEXT: addq %r13, %rbx
+; SSSE3-NEXT: addq %rax, %rbx
+; SSSE3-NEXT: addq %r12, %r14
+; SSSE3-NEXT: adcq %rdi, %rbx
; SSSE3-NEXT: movq %r10, %rax
-; SSSE3-NEXT: mulq %r14
-; SSSE3-NEXT: movq %rdx, %r15
+; SSSE3-NEXT: mulq %r15
+; SSSE3-NEXT: movq %rdx, %r12
; SSSE3-NEXT: movq %rax, %rdi
; SSSE3-NEXT: movq %r11, %rax
-; SSSE3-NEXT: mulq %r14
-; SSSE3-NEXT: movq %rdx, %r14
-; SSSE3-NEXT: movq %rax, %r12
-; SSSE3-NEXT: addq %r15, %r12
-; SSSE3-NEXT: adcq $0, %r14
+; SSSE3-NEXT: mulq %r15
+; SSSE3-NEXT: movq %rdx, %r15
+; SSSE3-NEXT: movq %rax, %r13
+; SSSE3-NEXT: addq %r12, %r13
+; SSSE3-NEXT: adcq $0, %r15
; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %r9
-; SSSE3-NEXT: movq %rdx, %r15
+; SSSE3-NEXT: movq %rdx, %r12
; SSSE3-NEXT: movq %rax, %r10
-; SSSE3-NEXT: addq %r12, %r10
-; SSSE3-NEXT: adcq %r14, %r15
+; SSSE3-NEXT: addq %r13, %r10
+; SSSE3-NEXT: adcq %r15, %r12
; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %r14d
+; SSSE3-NEXT: movzbl %al, %r15d
; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %r9
-; SSSE3-NEXT: addq %r15, %rax
-; SSSE3-NEXT: adcq %r14, %rdx
-; SSSE3-NEXT: addq %rbx, %rax
-; SSSE3-NEXT: adcq %rcx, %rdx
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSSE3-NEXT: movq %r10, 8(%r15)
+; SSSE3-NEXT: addq %r12, %rax
+; SSSE3-NEXT: adcq %r15, %rdx
+; SSSE3-NEXT: addq %r14, %rax
+; SSSE3-NEXT: adcq %rbx, %rdx
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSSE3-NEXT: movq %r10, 8(%r12)
; SSSE3-NEXT: sarq $63, %r10
; SSSE3-NEXT: xorq %r10, %rdx
; SSSE3-NEXT: xorq %rax, %r10
-; SSSE3-NEXT: xorl %ecx, %ecx
+; SSSE3-NEXT: xorl %r15d, %r15d
; SSSE3-NEXT: orq %rdx, %r10
-; SSSE3-NEXT: setne %cl
-; SSSE3-NEXT: movq %r13, %r9
+; SSSE3-NEXT: setne %r15b
+; SSSE3-NEXT: movq %rcx, %r9
; SSSE3-NEXT: sarq $63, %r9
; SSSE3-NEXT: movq %rbp, %r11
; SSSE3-NEXT: imulq %r9, %r11
; SSSE3-NEXT: movq %rsi, %rax
; SSSE3-NEXT: mulq %r9
-; SSSE3-NEXT: movq %rax, %r9
-; SSSE3-NEXT: addq %rax, %r11
-; SSSE3-NEXT: addq %rdx, %r11
+; SSSE3-NEXT: movq %rdx, %r9
+; SSSE3-NEXT: movq %rax, %r10
+; SSSE3-NEXT: addq %rax, %r9
+; SSSE3-NEXT: addq %r11, %r9
; SSSE3-NEXT: movq %rbp, %rax
; SSSE3-NEXT: sarq $63, %rax
; SSSE3-NEXT: movq %rax, %r14
-; SSSE3-NEXT: imulq %r13, %r14
+; SSSE3-NEXT: imulq %rcx, %r14
; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: movq %rax, %r10
-; SSSE3-NEXT: addq %rax, %r14
-; SSSE3-NEXT: addq %rdx, %r14
-; SSSE3-NEXT: addq %r9, %r10
-; SSSE3-NEXT: adcq %r11, %r14
+; SSSE3-NEXT: movq %rax, %r11
+; SSSE3-NEXT: movq %rdx, %rbx
+; SSSE3-NEXT: addq %r14, %rbx
+; SSSE3-NEXT: addq %rax, %rbx
+; SSSE3-NEXT: addq %r10, %r11
+; SSSE3-NEXT: adcq %r9, %rbx
; SSSE3-NEXT: movq %r8, %rax
; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rdx, %r9
-; SSSE3-NEXT: movq %rax, %r11
-; SSSE3-NEXT: movq %r13, %rax
+; SSSE3-NEXT: movq %rax, %r10
+; SSSE3-NEXT: movq %rcx, %rax
; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: addq %r9, %rbx
+; SSSE3-NEXT: movq %rax, %r14
+; SSSE3-NEXT: addq %r9, %r14
; SSSE3-NEXT: adcq $0, %rsi
; SSSE3-NEXT: movq %r8, %rax
; SSSE3-NEXT: mulq %rbp
; SSSE3-NEXT: movq %rdx, %r8
; SSSE3-NEXT: movq %rax, %r9
-; SSSE3-NEXT: addq %rbx, %r9
+; SSSE3-NEXT: addq %r14, %r9
; SSSE3-NEXT: adcq %rsi, %r8
; SSSE3-NEXT: setb %al
; SSSE3-NEXT: movzbl %al, %esi
-; SSSE3-NEXT: movq %r13, %rax
+; SSSE3-NEXT: movq %rcx, %rax
; SSSE3-NEXT: mulq %rbp
; SSSE3-NEXT: addq %r8, %rax
; SSSE3-NEXT: adcq %rsi, %rdx
-; SSSE3-NEXT: addq %r10, %rax
-; SSSE3-NEXT: adcq %r14, %rdx
-; SSSE3-NEXT: movq %r9, 24(%r15)
+; SSSE3-NEXT: addq %r11, %rax
+; SSSE3-NEXT: adcq %rbx, %rdx
+; SSSE3-NEXT: movq %r9, 24(%r12)
; SSSE3-NEXT: sarq $63, %r9
; SSSE3-NEXT: xorq %r9, %rdx
; SSSE3-NEXT: xorq %rax, %r9
; SSSE3-NEXT: setne %al
; SSSE3-NEXT: negl %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: negl %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: negl %r15d
+; SSSE3-NEXT: movd %r15d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %r11, 16(%r15)
-; SSSE3-NEXT: movq %rdi, (%r15)
+; SSSE3-NEXT: movq %r10, 16(%r12)
+; SSSE3-NEXT: movq %rdi, (%r12)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movq %r8, %r14
-; SSE41-NEXT: movq %rcx, %r13
+; SSE41-NEXT: movq %r8, %r15
; SSE41-NEXT: movq %rdx, %r8
; SSE41-NEXT: movq %rsi, %r11
; SSE41-NEXT: movq %rdi, %r10
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT: movq %r11, %rcx
-; SSE41-NEXT: sarq $63, %rcx
-; SSE41-NEXT: movq %r9, %r15
-; SSE41-NEXT: imulq %rcx, %r15
-; SSE41-NEXT: movq %r14, %rax
-; SSE41-NEXT: mulq %rcx
-; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: addq %rax, %r15
-; SSE41-NEXT: addq %rdx, %r15
+; SSE41-NEXT: movq %r11, %rdi
+; SSE41-NEXT: sarq $63, %rdi
+; SSE41-NEXT: movq %r9, %rbx
+; SSE41-NEXT: imulq %rdi, %rbx
+; SSE41-NEXT: movq %r15, %rax
+; SSE41-NEXT: mulq %rdi
+; SSE41-NEXT: movq %rdx, %rdi
+; SSE41-NEXT: movq %rax, %r12
+; SSE41-NEXT: addq %rax, %rdi
+; SSE41-NEXT: addq %rbx, %rdi
; SSE41-NEXT: movq %r9, %rax
; SSE41-NEXT: sarq $63, %rax
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: imulq %r11, %rcx
+; SSE41-NEXT: movq %rax, %r13
+; SSE41-NEXT: imulq %r11, %r13
; SSE41-NEXT: mulq %r10
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: addq %rax, %rcx
-; SSE41-NEXT: addq %rdx, %rcx
-; SSE41-NEXT: addq %rdi, %rbx
-; SSE41-NEXT: adcq %r15, %rcx
+; SSE41-NEXT: movq %rax, %r14
+; SSE41-NEXT: movq %rdx, %rbx
+; SSE41-NEXT: addq %r13, %rbx
+; SSE41-NEXT: addq %rax, %rbx
+; SSE41-NEXT: addq %r12, %r14
+; SSE41-NEXT: adcq %rdi, %rbx
; SSE41-NEXT: movq %r10, %rax
-; SSE41-NEXT: mulq %r14
-; SSE41-NEXT: movq %rdx, %r15
+; SSE41-NEXT: mulq %r15
+; SSE41-NEXT: movq %rdx, %r12
; SSE41-NEXT: movq %rax, %rdi
; SSE41-NEXT: movq %r11, %rax
-; SSE41-NEXT: mulq %r14
-; SSE41-NEXT: movq %rdx, %r14
-; SSE41-NEXT: movq %rax, %r12
-; SSE41-NEXT: addq %r15, %r12
-; SSE41-NEXT: adcq $0, %r14
+; SSE41-NEXT: mulq %r15
+; SSE41-NEXT: movq %rdx, %r15
+; SSE41-NEXT: movq %rax, %r13
+; SSE41-NEXT: addq %r12, %r13
+; SSE41-NEXT: adcq $0, %r15
; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: movq %rdx, %r15
+; SSE41-NEXT: movq %rdx, %r12
; SSE41-NEXT: movq %rax, %r10
-; SSE41-NEXT: addq %r12, %r10
-; SSE41-NEXT: adcq %r14, %r15
+; SSE41-NEXT: addq %r13, %r10
+; SSE41-NEXT: adcq %r15, %r12
; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %r14d
+; SSE41-NEXT: movzbl %al, %r15d
; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: addq %r15, %rax
-; SSE41-NEXT: adcq %r14, %rdx
-; SSE41-NEXT: addq %rbx, %rax
-; SSE41-NEXT: adcq %rcx, %rdx
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE41-NEXT: movq %r10, 8(%r15)
+; SSE41-NEXT: addq %r12, %rax
+; SSE41-NEXT: adcq %r15, %rdx
+; SSE41-NEXT: addq %r14, %rax
+; SSE41-NEXT: adcq %rbx, %rdx
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE41-NEXT: movq %r10, 8(%r12)
; SSE41-NEXT: sarq $63, %r10
; SSE41-NEXT: xorq %r10, %rdx
; SSE41-NEXT: xorq %rax, %r10
-; SSE41-NEXT: xorl %ecx, %ecx
+; SSE41-NEXT: xorl %r15d, %r15d
; SSE41-NEXT: orq %rdx, %r10
-; SSE41-NEXT: setne %cl
-; SSE41-NEXT: movq %r13, %r9
+; SSE41-NEXT: setne %r15b
+; SSE41-NEXT: movq %rcx, %r9
; SSE41-NEXT: sarq $63, %r9
; SSE41-NEXT: movq %rbp, %r11
; SSE41-NEXT: imulq %r9, %r11
; SSE41-NEXT: movq %rsi, %rax
; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: addq %rax, %r11
-; SSE41-NEXT: addq %rdx, %r11
+; SSE41-NEXT: movq %rdx, %r9
+; SSE41-NEXT: movq %rax, %r10
+; SSE41-NEXT: addq %rax, %r9
+; SSE41-NEXT: addq %r11, %r9
; SSE41-NEXT: movq %rbp, %rax
; SSE41-NEXT: sarq $63, %rax
; SSE41-NEXT: movq %rax, %r14
-; SSE41-NEXT: imulq %r13, %r14
+; SSE41-NEXT: imulq %rcx, %r14
; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: movq %rax, %r10
-; SSE41-NEXT: addq %rax, %r14
-; SSE41-NEXT: addq %rdx, %r14
-; SSE41-NEXT: addq %r9, %r10
-; SSE41-NEXT: adcq %r11, %r14
+; SSE41-NEXT: movq %rax, %r11
+; SSE41-NEXT: movq %rdx, %rbx
+; SSE41-NEXT: addq %r14, %rbx
+; SSE41-NEXT: addq %rax, %rbx
+; SSE41-NEXT: addq %r10, %r11
+; SSE41-NEXT: adcq %r9, %rbx
; SSE41-NEXT: movq %r8, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rdx, %r9
-; SSE41-NEXT: movq %rax, %r11
-; SSE41-NEXT: movq %r13, %rax
+; SSE41-NEXT: movq %rax, %r10
+; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rdx, %rsi
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: addq %r9, %rbx
+; SSE41-NEXT: movq %rax, %r14
+; SSE41-NEXT: addq %r9, %r14
; SSE41-NEXT: adcq $0, %rsi
; SSE41-NEXT: movq %r8, %rax
; SSE41-NEXT: mulq %rbp
; SSE41-NEXT: movq %rdx, %r8
; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: addq %rbx, %r9
+; SSE41-NEXT: addq %r14, %r9
; SSE41-NEXT: adcq %rsi, %r8
; SSE41-NEXT: setb %al
; SSE41-NEXT: movzbl %al, %esi
-; SSE41-NEXT: movq %r13, %rax
+; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rbp
; SSE41-NEXT: addq %r8, %rax
; SSE41-NEXT: adcq %rsi, %rdx
-; SSE41-NEXT: addq %r10, %rax
-; SSE41-NEXT: adcq %r14, %rdx
-; SSE41-NEXT: movq %r9, 24(%r15)
+; SSE41-NEXT: addq %r11, %rax
+; SSE41-NEXT: adcq %rbx, %rdx
+; SSE41-NEXT: movq %r9, 24(%r12)
; SSE41-NEXT: sarq $63, %r9
; SSE41-NEXT: xorq %r9, %rdx
; SSE41-NEXT: xorq %rax, %r9
; SSE41-NEXT: orq %rdx, %r9
; SSE41-NEXT: setne %al
; SSE41-NEXT: negl %eax
-; SSE41-NEXT: negl %ecx
-; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: negl %r15d
+; SSE41-NEXT: movd %r15d, %xmm0
; SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; SSE41-NEXT: movq %r11, 16(%r15)
-; SSE41-NEXT: movq %rdi, (%r15)
+; SSE41-NEXT: movq %r10, 16(%r12)
+; SSE41-NEXT: movq %rdi, (%r12)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
; AVX-NEXT: pushq %r13
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq %r8, %r14
-; AVX-NEXT: movq %rcx, %r13
+; AVX-NEXT: movq %r8, %r15
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: movq %rsi, %r11
; AVX-NEXT: movq %rdi, %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT: movq %r11, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: movq %r9, %r15
-; AVX-NEXT: imulq %rcx, %r15
-; AVX-NEXT: movq %r14, %rax
-; AVX-NEXT: mulq %rcx
-; AVX-NEXT: movq %rax, %rdi
-; AVX-NEXT: addq %rax, %r15
-; AVX-NEXT: addq %rdx, %r15
+; AVX-NEXT: movq %r11, %rdi
+; AVX-NEXT: sarq $63, %rdi
+; AVX-NEXT: movq %r9, %rbx
+; AVX-NEXT: imulq %rdi, %rbx
+; AVX-NEXT: movq %r15, %rax
+; AVX-NEXT: mulq %rdi
+; AVX-NEXT: movq %rdx, %rdi
+; AVX-NEXT: movq %rax, %r12
+; AVX-NEXT: addq %rax, %rdi
+; AVX-NEXT: addq %rbx, %rdi
; AVX-NEXT: movq %r9, %rax
; AVX-NEXT: sarq $63, %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: imulq %r11, %rcx
+; AVX-NEXT: movq %rax, %r13
+; AVX-NEXT: imulq %r11, %r13
; AVX-NEXT: mulq %r10
-; AVX-NEXT: movq %rax, %rbx
-; AVX-NEXT: addq %rax, %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: addq %rdi, %rbx
-; AVX-NEXT: adcq %r15, %rcx
+; AVX-NEXT: movq %rax, %r14
+; AVX-NEXT: movq %rdx, %rbx
+; AVX-NEXT: addq %r13, %rbx
+; AVX-NEXT: addq %rax, %rbx
+; AVX-NEXT: addq %r12, %r14
+; AVX-NEXT: adcq %rdi, %rbx
; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r14
-; AVX-NEXT: movq %rdx, %r15
+; AVX-NEXT: mulq %r15
+; AVX-NEXT: movq %rdx, %r12
; AVX-NEXT: movq %rax, %rdi
; AVX-NEXT: movq %r11, %rax
-; AVX-NEXT: mulq %r14
-; AVX-NEXT: movq %rdx, %r14
-; AVX-NEXT: movq %rax, %r12
-; AVX-NEXT: addq %r15, %r12
-; AVX-NEXT: adcq $0, %r14
+; AVX-NEXT: mulq %r15
+; AVX-NEXT: movq %rdx, %r15
+; AVX-NEXT: movq %rax, %r13
+; AVX-NEXT: addq %r12, %r13
+; AVX-NEXT: adcq $0, %r15
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r9
-; AVX-NEXT: movq %rdx, %r15
+; AVX-NEXT: movq %rdx, %r12
; AVX-NEXT: movq %rax, %r10
-; AVX-NEXT: addq %r12, %r10
-; AVX-NEXT: adcq %r14, %r15
+; AVX-NEXT: addq %r13, %r10
+; AVX-NEXT: adcq %r15, %r12
; AVX-NEXT: setb %al
-; AVX-NEXT: movzbl %al, %r14d
+; AVX-NEXT: movzbl %al, %r15d
; AVX-NEXT: movq %r11, %rax
; AVX-NEXT: mulq %r9
-; AVX-NEXT: addq %r15, %rax
-; AVX-NEXT: adcq %r14, %rdx
-; AVX-NEXT: addq %rbx, %rax
-; AVX-NEXT: adcq %rcx, %rdx
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX-NEXT: movq %r10, 8(%r15)
+; AVX-NEXT: addq %r12, %rax
+; AVX-NEXT: adcq %r15, %rdx
+; AVX-NEXT: addq %r14, %rax
+; AVX-NEXT: adcq %rbx, %rdx
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX-NEXT: movq %r10, 8(%r12)
; AVX-NEXT: sarq $63, %r10
; AVX-NEXT: xorq %r10, %rdx
; AVX-NEXT: xorq %rax, %r10
-; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: xorl %r15d, %r15d
; AVX-NEXT: orq %rdx, %r10
-; AVX-NEXT: setne %cl
-; AVX-NEXT: movq %r13, %r9
+; AVX-NEXT: setne %r15b
+; AVX-NEXT: movq %rcx, %r9
; AVX-NEXT: sarq $63, %r9
; AVX-NEXT: movq %rbp, %r11
; AVX-NEXT: imulq %r9, %r11
; AVX-NEXT: movq %rsi, %rax
; AVX-NEXT: mulq %r9
-; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: addq %rax, %r11
-; AVX-NEXT: addq %rdx, %r11
+; AVX-NEXT: movq %rdx, %r9
+; AVX-NEXT: movq %rax, %r10
+; AVX-NEXT: addq %rax, %r9
+; AVX-NEXT: addq %r11, %r9
; AVX-NEXT: movq %rbp, %rax
; AVX-NEXT: sarq $63, %rax
; AVX-NEXT: movq %rax, %r14
-; AVX-NEXT: imulq %r13, %r14
+; AVX-NEXT: imulq %rcx, %r14
; AVX-NEXT: mulq %r8
-; AVX-NEXT: movq %rax, %r10
-; AVX-NEXT: addq %rax, %r14
-; AVX-NEXT: addq %rdx, %r14
-; AVX-NEXT: addq %r9, %r10
-; AVX-NEXT: adcq %r11, %r14
+; AVX-NEXT: movq %rax, %r11
+; AVX-NEXT: movq %rdx, %rbx
+; AVX-NEXT: addq %r14, %rbx
+; AVX-NEXT: addq %rax, %rbx
+; AVX-NEXT: addq %r10, %r11
+; AVX-NEXT: adcq %r9, %rbx
; AVX-NEXT: movq %r8, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: movq %rdx, %r9
-; AVX-NEXT: movq %rax, %r11
-; AVX-NEXT: movq %r13, %rax
+; AVX-NEXT: movq %rax, %r10
+; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: movq %rax, %rbx
-; AVX-NEXT: addq %r9, %rbx
+; AVX-NEXT: movq %rax, %r14
+; AVX-NEXT: addq %r9, %r14
; AVX-NEXT: adcq $0, %rsi
; AVX-NEXT: movq %r8, %rax
; AVX-NEXT: mulq %rbp
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: addq %rbx, %r9
+; AVX-NEXT: addq %r14, %r9
; AVX-NEXT: adcq %rsi, %r8
; AVX-NEXT: setb %al
; AVX-NEXT: movzbl %al, %esi
-; AVX-NEXT: movq %r13, %rax
+; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rbp
; AVX-NEXT: addq %r8, %rax
; AVX-NEXT: adcq %rsi, %rdx
-; AVX-NEXT: addq %r10, %rax
-; AVX-NEXT: adcq %r14, %rdx
-; AVX-NEXT: movq %r9, 24(%r15)
+; AVX-NEXT: addq %r11, %rax
+; AVX-NEXT: adcq %rbx, %rdx
+; AVX-NEXT: movq %r9, 24(%r12)
; AVX-NEXT: sarq $63, %r9
; AVX-NEXT: xorq %r9, %rdx
; AVX-NEXT: xorq %rax, %r9
; AVX-NEXT: orq %rdx, %r9
; AVX-NEXT: setne %al
; AVX-NEXT: negl %eax
-; AVX-NEXT: negl %ecx
-; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: negl %r15d
+; AVX-NEXT: vmovd %r15d, %xmm0
; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %r11, 16(%r15)
-; AVX-NEXT: movq %rdi, (%r15)
+; AVX-NEXT: movq %r10, 16(%r12)
+; AVX-NEXT: movq %rdi, (%r12)
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
; AVX-NEXT: popq %r13
; AVX512F-NEXT: pushq %r13
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq %r9, %rbp
; AVX512F-NEXT: movq %rcx, %r11
; AVX512F-NEXT: movq %rdx, %r10
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX512F-NEXT: movq %rsi, %r9
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: movq %rbp, %r12
-; AVX512F-NEXT: imulq %rcx, %r12
-; AVX512F-NEXT: movq %r14, %rax
+; AVX512F-NEXT: movq %rsi, %rbx
+; AVX512F-NEXT: imulq %rcx, %rbx
+; AVX512F-NEXT: movq %r15, %rax
; AVX512F-NEXT: mulq %rcx
-; AVX512F-NEXT: movq %rax, %r15
-; AVX512F-NEXT: addq %rax, %r12
-; AVX512F-NEXT: addq %rdx, %r12
-; AVX512F-NEXT: movq %rbp, %rax
+; AVX512F-NEXT: movq %rdx, %rcx
+; AVX512F-NEXT: movq %rax, %r12
+; AVX512F-NEXT: addq %rax, %rcx
+; AVX512F-NEXT: addq %rbx, %rcx
+; AVX512F-NEXT: movq %rsi, %rax
; AVX512F-NEXT: sarq $63, %rax
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: imulq %r11, %rcx
+; AVX512F-NEXT: movq %rax, %r13
+; AVX512F-NEXT: imulq %r11, %r13
; AVX512F-NEXT: mulq %r10
-; AVX512F-NEXT: movq %rax, %rbx
-; AVX512F-NEXT: addq %rax, %rcx
-; AVX512F-NEXT: addq %rdx, %rcx
-; AVX512F-NEXT: addq %r15, %rbx
-; AVX512F-NEXT: adcq %r12, %rcx
+; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: movq %rdx, %rbx
+; AVX512F-NEXT: addq %r13, %rbx
+; AVX512F-NEXT: addq %rax, %rbx
+; AVX512F-NEXT: addq %r12, %r14
+; AVX512F-NEXT: adcq %rcx, %rbx
; AVX512F-NEXT: movq %r10, %rax
-; AVX512F-NEXT: mulq %r14
-; AVX512F-NEXT: movq %rdx, %r15
-; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512F-NEXT: mulq %r15
+; AVX512F-NEXT: movq %rdx, %r12
+; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %r14
-; AVX512F-NEXT: movq %rdx, %r14
-; AVX512F-NEXT: movq %rax, %r12
-; AVX512F-NEXT: addq %r15, %r12
-; AVX512F-NEXT: adcq $0, %r14
-; AVX512F-NEXT: movq %r10, %rax
-; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: mulq %r15
; AVX512F-NEXT: movq %rdx, %r15
+; AVX512F-NEXT: movq %rax, %r13
+; AVX512F-NEXT: addq %r12, %r13
+; AVX512F-NEXT: adcq $0, %r15
+; AVX512F-NEXT: movq %r10, %rax
+; AVX512F-NEXT: mulq %rsi
+; AVX512F-NEXT: movq %rdx, %r12
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: addq %r12, %r10
-; AVX512F-NEXT: adcq %r14, %r15
+; AVX512F-NEXT: addq %r13, %r10
+; AVX512F-NEXT: adcq %r15, %r12
; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %r14d
+; AVX512F-NEXT: movzbl %al, %r15d
; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %rbp
-; AVX512F-NEXT: addq %r15, %rax
-; AVX512F-NEXT: adcq %r14, %rdx
-; AVX512F-NEXT: addq %rbx, %rax
-; AVX512F-NEXT: adcq %rcx, %rdx
-; AVX512F-NEXT: movq %r10, 24(%r13)
+; AVX512F-NEXT: mulq %rsi
+; AVX512F-NEXT: addq %r12, %rax
+; AVX512F-NEXT: adcq %r15, %rdx
+; AVX512F-NEXT: addq %r14, %rax
+; AVX512F-NEXT: adcq %rbx, %rdx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512F-NEXT: movq %r10, 24(%r12)
; AVX512F-NEXT: sarq $63, %r10
; AVX512F-NEXT: xorq %r10, %rdx
; AVX512F-NEXT: xorq %rax, %r10
; AVX512F-NEXT: orq %rdx, %r10
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: movq %r9, %rbx
-; AVX512F-NEXT: imulq %rcx, %rbx
+; AVX512F-NEXT: movq %r9, %rsi
+; AVX512F-NEXT: sarq $63, %rsi
+; AVX512F-NEXT: movq %rbp, %rbx
+; AVX512F-NEXT: imulq %rsi, %rbx
; AVX512F-NEXT: movq %r8, %rax
-; AVX512F-NEXT: mulq %rcx
-; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: addq %rax, %rbx
-; AVX512F-NEXT: addq %rdx, %rbx
-; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: mulq %rsi
+; AVX512F-NEXT: movq %rdx, %r10
+; AVX512F-NEXT: movq %rax, %r11
+; AVX512F-NEXT: addq %rax, %r10
+; AVX512F-NEXT: addq %rbx, %r10
+; AVX512F-NEXT: movq %rbp, %rax
; AVX512F-NEXT: sarq $63, %rax
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: imulq %rsi, %rcx
+; AVX512F-NEXT: movq %rax, %rsi
+; AVX512F-NEXT: imulq %r9, %rsi
; AVX512F-NEXT: mulq %rdi
-; AVX512F-NEXT: movq %rax, %r11
-; AVX512F-NEXT: addq %rax, %rcx
-; AVX512F-NEXT: addq %rdx, %rcx
-; AVX512F-NEXT: addq %r10, %r11
-; AVX512F-NEXT: adcq %rbx, %rcx
+; AVX512F-NEXT: movq %rax, %rbx
+; AVX512F-NEXT: movq %rdx, %r14
+; AVX512F-NEXT: addq %rsi, %r14
+; AVX512F-NEXT: addq %rax, %r14
+; AVX512F-NEXT: addq %r11, %rbx
+; AVX512F-NEXT: adcq %r10, %r14
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: movq %rdx, %r10
-; AVX512F-NEXT: movq %rax, %rbx
-; AVX512F-NEXT: movq %rsi, %rax
+; AVX512F-NEXT: movq %rax, %r11
+; AVX512F-NEXT: movq %r9, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: addq %r10, %r14
+; AVX512F-NEXT: movq %rax, %r15
+; AVX512F-NEXT: addq %r10, %r15
; AVX512F-NEXT: adcq $0, %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: mulq %r9
+; AVX512F-NEXT: mulq %rbp
; AVX512F-NEXT: movq %rdx, %rdi
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: addq %r14, %r10
+; AVX512F-NEXT: addq %r15, %r10
; AVX512F-NEXT: adcq %r8, %rdi
; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %r8d
-; AVX512F-NEXT: movq %rsi, %rax
-; AVX512F-NEXT: mulq %r9
+; AVX512F-NEXT: movzbl %al, %esi
+; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: mulq %rbp
; AVX512F-NEXT: addq %rdi, %rax
-; AVX512F-NEXT: adcq %r8, %rdx
-; AVX512F-NEXT: addq %r11, %rax
-; AVX512F-NEXT: adcq %rcx, %rdx
-; AVX512F-NEXT: movq %r10, 8(%r13)
+; AVX512F-NEXT: adcq %rsi, %rdx
+; AVX512F-NEXT: addq %rbx, %rax
+; AVX512F-NEXT: adcq %r14, %rdx
+; AVX512F-NEXT: movq %r10, 8(%r12)
; AVX512F-NEXT: sarq $63, %r10
; AVX512F-NEXT: xorq %r10, %rdx
; AVX512F-NEXT: xorq %rax, %r10
; AVX512F-NEXT: korw %k0, %k1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT: movq %rax, 16(%r13)
-; AVX512F-NEXT: movq %rbx, (%r13)
+; AVX512F-NEXT: movq %rcx, 16(%r12)
+; AVX512F-NEXT: movq %r11, (%r12)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
; AVX512F-NEXT: popq %r13
; AVX512BW-NEXT: pushq %r13
; AVX512BW-NEXT: pushq %r12
; AVX512BW-NEXT: pushq %rbx
+; AVX512BW-NEXT: movq %r9, %rbp
; AVX512BW-NEXT: movq %rcx, %r11
; AVX512BW-NEXT: movq %rdx, %r10
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX512BW-NEXT: movq %rsi, %r9
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512BW-NEXT: sarq $63, %rcx
-; AVX512BW-NEXT: movq %rbp, %r12
-; AVX512BW-NEXT: imulq %rcx, %r12
-; AVX512BW-NEXT: movq %r14, %rax
+; AVX512BW-NEXT: movq %rsi, %rbx
+; AVX512BW-NEXT: imulq %rcx, %rbx
+; AVX512BW-NEXT: movq %r15, %rax
; AVX512BW-NEXT: mulq %rcx
-; AVX512BW-NEXT: movq %rax, %r15
-; AVX512BW-NEXT: addq %rax, %r12
-; AVX512BW-NEXT: addq %rdx, %r12
-; AVX512BW-NEXT: movq %rbp, %rax
+; AVX512BW-NEXT: movq %rdx, %rcx
+; AVX512BW-NEXT: movq %rax, %r12
+; AVX512BW-NEXT: addq %rax, %rcx
+; AVX512BW-NEXT: addq %rbx, %rcx
+; AVX512BW-NEXT: movq %rsi, %rax
; AVX512BW-NEXT: sarq $63, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: imulq %r11, %rcx
+; AVX512BW-NEXT: movq %rax, %r13
+; AVX512BW-NEXT: imulq %r11, %r13
; AVX512BW-NEXT: mulq %r10
-; AVX512BW-NEXT: movq %rax, %rbx
-; AVX512BW-NEXT: addq %rax, %rcx
-; AVX512BW-NEXT: addq %rdx, %rcx
-; AVX512BW-NEXT: addq %r15, %rbx
-; AVX512BW-NEXT: adcq %r12, %rcx
+; AVX512BW-NEXT: movq %rax, %r14
+; AVX512BW-NEXT: movq %rdx, %rbx
+; AVX512BW-NEXT: addq %r13, %rbx
+; AVX512BW-NEXT: addq %rax, %rbx
+; AVX512BW-NEXT: addq %r12, %r14
+; AVX512BW-NEXT: adcq %rcx, %rbx
; AVX512BW-NEXT: movq %r10, %rax
-; AVX512BW-NEXT: mulq %r14
-; AVX512BW-NEXT: movq %rdx, %r15
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: mulq %r15
+; AVX512BW-NEXT: movq %rdx, %r12
+; AVX512BW-NEXT: movq %rax, %rcx
; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %r14
-; AVX512BW-NEXT: movq %rdx, %r14
-; AVX512BW-NEXT: movq %rax, %r12
-; AVX512BW-NEXT: addq %r15, %r12
-; AVX512BW-NEXT: adcq $0, %r14
-; AVX512BW-NEXT: movq %r10, %rax
-; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: mulq %r15
; AVX512BW-NEXT: movq %rdx, %r15
+; AVX512BW-NEXT: movq %rax, %r13
+; AVX512BW-NEXT: addq %r12, %r13
+; AVX512BW-NEXT: adcq $0, %r15
+; AVX512BW-NEXT: movq %r10, %rax
+; AVX512BW-NEXT: mulq %rsi
+; AVX512BW-NEXT: movq %rdx, %r12
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: addq %r12, %r10
-; AVX512BW-NEXT: adcq %r14, %r15
+; AVX512BW-NEXT: addq %r13, %r10
+; AVX512BW-NEXT: adcq %r15, %r12
; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %r14d
+; AVX512BW-NEXT: movzbl %al, %r15d
; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %rbp
-; AVX512BW-NEXT: addq %r15, %rax
-; AVX512BW-NEXT: adcq %r14, %rdx
-; AVX512BW-NEXT: addq %rbx, %rax
-; AVX512BW-NEXT: adcq %rcx, %rdx
-; AVX512BW-NEXT: movq %r10, 24(%r13)
+; AVX512BW-NEXT: mulq %rsi
+; AVX512BW-NEXT: addq %r12, %rax
+; AVX512BW-NEXT: adcq %r15, %rdx
+; AVX512BW-NEXT: addq %r14, %rax
+; AVX512BW-NEXT: adcq %rbx, %rdx
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512BW-NEXT: movq %r10, 24(%r12)
; AVX512BW-NEXT: sarq $63, %r10
; AVX512BW-NEXT: xorq %r10, %rdx
; AVX512BW-NEXT: xorq %rax, %r10
; AVX512BW-NEXT: orq %rdx, %r10
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: kmovd %eax, %k0
-; AVX512BW-NEXT: movq %rsi, %rcx
-; AVX512BW-NEXT: sarq $63, %rcx
-; AVX512BW-NEXT: movq %r9, %rbx
-; AVX512BW-NEXT: imulq %rcx, %rbx
+; AVX512BW-NEXT: movq %r9, %rsi
+; AVX512BW-NEXT: sarq $63, %rsi
+; AVX512BW-NEXT: movq %rbp, %rbx
+; AVX512BW-NEXT: imulq %rsi, %rbx
; AVX512BW-NEXT: movq %r8, %rax
-; AVX512BW-NEXT: mulq %rcx
-; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: addq %rax, %rbx
-; AVX512BW-NEXT: addq %rdx, %rbx
-; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: mulq %rsi
+; AVX512BW-NEXT: movq %rdx, %r10
+; AVX512BW-NEXT: movq %rax, %r11
+; AVX512BW-NEXT: addq %rax, %r10
+; AVX512BW-NEXT: addq %rbx, %r10
+; AVX512BW-NEXT: movq %rbp, %rax
; AVX512BW-NEXT: sarq $63, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: imulq %rsi, %rcx
+; AVX512BW-NEXT: movq %rax, %rsi
+; AVX512BW-NEXT: imulq %r9, %rsi
; AVX512BW-NEXT: mulq %rdi
-; AVX512BW-NEXT: movq %rax, %r11
-; AVX512BW-NEXT: addq %rax, %rcx
-; AVX512BW-NEXT: addq %rdx, %rcx
-; AVX512BW-NEXT: addq %r10, %r11
-; AVX512BW-NEXT: adcq %rbx, %rcx
+; AVX512BW-NEXT: movq %rax, %rbx
+; AVX512BW-NEXT: movq %rdx, %r14
+; AVX512BW-NEXT: addq %rsi, %r14
+; AVX512BW-NEXT: addq %rax, %r14
+; AVX512BW-NEXT: addq %r11, %rbx
+; AVX512BW-NEXT: adcq %r10, %r14
; AVX512BW-NEXT: movq %rdi, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: movq %rdx, %r10
-; AVX512BW-NEXT: movq %rax, %rbx
-; AVX512BW-NEXT: movq %rsi, %rax
+; AVX512BW-NEXT: movq %rax, %r11
+; AVX512BW-NEXT: movq %r9, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: movq %rdx, %r8
-; AVX512BW-NEXT: movq %rax, %r14
-; AVX512BW-NEXT: addq %r10, %r14
+; AVX512BW-NEXT: movq %rax, %r15
+; AVX512BW-NEXT: addq %r10, %r15
; AVX512BW-NEXT: adcq $0, %r8
; AVX512BW-NEXT: movq %rdi, %rax
-; AVX512BW-NEXT: mulq %r9
+; AVX512BW-NEXT: mulq %rbp
; AVX512BW-NEXT: movq %rdx, %rdi
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: addq %r14, %r10
+; AVX512BW-NEXT: addq %r15, %r10
; AVX512BW-NEXT: adcq %r8, %rdi
; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %r8d
-; AVX512BW-NEXT: movq %rsi, %rax
-; AVX512BW-NEXT: mulq %r9
+; AVX512BW-NEXT: movzbl %al, %esi
+; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: mulq %rbp
; AVX512BW-NEXT: addq %rdi, %rax
-; AVX512BW-NEXT: adcq %r8, %rdx
-; AVX512BW-NEXT: addq %r11, %rax
-; AVX512BW-NEXT: adcq %rcx, %rdx
-; AVX512BW-NEXT: movq %r10, 8(%r13)
+; AVX512BW-NEXT: adcq %rsi, %rdx
+; AVX512BW-NEXT: addq %rbx, %rax
+; AVX512BW-NEXT: adcq %r14, %rdx
+; AVX512BW-NEXT: movq %r10, 8(%r12)
; AVX512BW-NEXT: sarq $63, %r10
; AVX512BW-NEXT: xorq %r10, %rdx
; AVX512BW-NEXT: xorq %rax, %r10
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: movq %rax, 16(%r13)
-; AVX512BW-NEXT: movq %rbx, (%r13)
+; AVX512BW-NEXT: movq %rcx, 16(%r12)
+; AVX512BW-NEXT: movq %r11, (%r12)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: popq %r12
; AVX512BW-NEXT: popq %r13
; SSE2-NEXT: mulq %rdi
; SSE2-NEXT: seto %r12b
; SSE2-NEXT: orb %r15b, %r12b
+; SSE2-NEXT: orb %bpl, %r12b
; SSE2-NEXT: leaq (%rsi,%rax), %r10
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: mulq %r8
; SSE2-NEXT: addq %r10, %rsi
; SSE2-NEXT: setb %r10b
; SSE2-NEXT: orb %r12b, %r10b
-; SSE2-NEXT: orb %bpl, %r10b
; SSE2-NEXT: testq %r9, %r9
; SSE2-NEXT: setne %al
; SSE2-NEXT: testq %r11, %r11
; SSE2-NEXT: mulq %rcx
; SSE2-NEXT: seto %r9b
; SSE2-NEXT: orb %r11b, %r9b
+; SSE2-NEXT: orb %bpl, %r9b
; SSE2-NEXT: addq %rax, %r8
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %r14
; SSE2-NEXT: addq %r8, %rdx
; SSE2-NEXT: setb %cl
; SSE2-NEXT: orb %r9b, %cl
-; SSE2-NEXT: orb %bpl, %cl
; SSE2-NEXT: movzbl %cl, %ecx
; SSE2-NEXT: negl %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: mulq %rdi
; SSSE3-NEXT: seto %r12b
; SSSE3-NEXT: orb %r15b, %r12b
+; SSSE3-NEXT: orb %bpl, %r12b
; SSSE3-NEXT: leaq (%rsi,%rax), %r10
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: mulq %r8
; SSSE3-NEXT: addq %r10, %rsi
; SSSE3-NEXT: setb %r10b
; SSSE3-NEXT: orb %r12b, %r10b
-; SSSE3-NEXT: orb %bpl, %r10b
; SSSE3-NEXT: testq %r9, %r9
; SSSE3-NEXT: setne %al
; SSSE3-NEXT: testq %r11, %r11
; SSSE3-NEXT: mulq %rcx
; SSSE3-NEXT: seto %r9b
; SSSE3-NEXT: orb %r11b, %r9b
+; SSSE3-NEXT: orb %bpl, %r9b
; SSSE3-NEXT: addq %rax, %r8
; SSSE3-NEXT: movq %rcx, %rax
; SSSE3-NEXT: mulq %r14
; SSSE3-NEXT: addq %r8, %rdx
; SSSE3-NEXT: setb %cl
; SSSE3-NEXT: orb %r9b, %cl
-; SSSE3-NEXT: orb %bpl, %cl
; SSSE3-NEXT: movzbl %cl, %ecx
; SSSE3-NEXT: negl %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSE41-NEXT: mulq %rdi
; SSE41-NEXT: seto %r12b
; SSE41-NEXT: orb %r15b, %r12b
+; SSE41-NEXT: orb %bpl, %r12b
; SSE41-NEXT: leaq (%rsi,%rax), %r10
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: mulq %r8
; SSE41-NEXT: addq %r10, %rsi
; SSE41-NEXT: setb %r10b
; SSE41-NEXT: orb %r12b, %r10b
-; SSE41-NEXT: orb %bpl, %r10b
; SSE41-NEXT: testq %r9, %r9
; SSE41-NEXT: setne %al
; SSE41-NEXT: testq %r11, %r11
; SSE41-NEXT: mulq %rcx
; SSE41-NEXT: seto %r9b
; SSE41-NEXT: orb %r11b, %r9b
+; SSE41-NEXT: orb %bpl, %r9b
; SSE41-NEXT: addq %rax, %r8
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %r14
; SSE41-NEXT: addq %r8, %rdx
; SSE41-NEXT: setb %cl
; SSE41-NEXT: orb %r9b, %cl
-; SSE41-NEXT: orb %bpl, %cl
; SSE41-NEXT: movzbl %cl, %ecx
; SSE41-NEXT: negl %ecx
; SSE41-NEXT: movzbl %r10b, %r8d
; AVX-NEXT: mulq %rdi
; AVX-NEXT: seto %r12b
; AVX-NEXT: orb %r15b, %r12b
+; AVX-NEXT: orb %bpl, %r12b
; AVX-NEXT: leaq (%rsi,%rax), %r10
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: mulq %r8
; AVX-NEXT: addq %r10, %rsi
; AVX-NEXT: setb %r10b
; AVX-NEXT: orb %r12b, %r10b
-; AVX-NEXT: orb %bpl, %r10b
; AVX-NEXT: testq %r9, %r9
; AVX-NEXT: setne %al
; AVX-NEXT: testq %r11, %r11
; AVX-NEXT: mulq %rcx
; AVX-NEXT: seto %r9b
; AVX-NEXT: orb %r11b, %r9b
+; AVX-NEXT: orb %bpl, %r9b
; AVX-NEXT: addq %rax, %r8
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %r14
; AVX-NEXT: addq %r8, %rdx
; AVX-NEXT: setb %cl
; AVX-NEXT: orb %r9b, %cl
-; AVX-NEXT: orb %bpl, %cl
; AVX-NEXT: movzbl %cl, %ecx
; AVX-NEXT: negl %ecx
; AVX-NEXT: movzbl %r10b, %r8d
; AVX512F-NEXT: mulq %rcx
; AVX512F-NEXT: seto %r12b
; AVX512F-NEXT: orb %r15b, %r12b
+; AVX512F-NEXT: orb %bpl, %r12b
; AVX512F-NEXT: addq %rax, %r11
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: mulq %r14
; AVX512F-NEXT: addq %r11, %rcx
; AVX512F-NEXT: setb %al
; AVX512F-NEXT: orb %r12b, %al
-; AVX512F-NEXT: orb %bpl, %al
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: testq %r9, %r9
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: mulq %rdi
; AVX512F-NEXT: seto %r9b
; AVX512F-NEXT: orb %bpl, %r9b
+; AVX512F-NEXT: orb %r11b, %r9b
; AVX512F-NEXT: addq %rax, %r10
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: addq %r10, %rdx
; AVX512F-NEXT: setb %dil
; AVX512F-NEXT: orb %r9b, %dil
-; AVX512F-NEXT: orb %r11b, %dil
; AVX512F-NEXT: andl $1, %edi
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: mulq %rcx
; AVX512BW-NEXT: seto %r12b
; AVX512BW-NEXT: orb %r15b, %r12b
+; AVX512BW-NEXT: orb %bpl, %r12b
; AVX512BW-NEXT: addq %rax, %r11
; AVX512BW-NEXT: movq %rcx, %rax
; AVX512BW-NEXT: mulq %r14
; AVX512BW-NEXT: addq %r11, %rcx
; AVX512BW-NEXT: setb %al
; AVX512BW-NEXT: orb %r12b, %al
-; AVX512BW-NEXT: orb %bpl, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: testq %r9, %r9
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: mulq %rdi
; AVX512BW-NEXT: seto %r9b
; AVX512BW-NEXT: orb %bpl, %r9b
+; AVX512BW-NEXT: orb %r11b, %r9b
; AVX512BW-NEXT: addq %rax, %r10
; AVX512BW-NEXT: movq %rdi, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: addq %r10, %rdx
; AVX512BW-NEXT: setb %dil
; AVX512BW-NEXT: orb %r9b, %dil
-; AVX512BW-NEXT: orb %r11b, %dil
; AVX512BW-NEXT: andl $1, %edi
; AVX512BW-NEXT: kmovw %edi, %k1
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $23, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm5, %xmm4
-; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm4, %xmm2
+; SSE2-NEXT: packssdw %xmm1, %xmm2
; SSE2-NEXT: paddw %xmm0, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; X86-SSE2-NEXT: pandn %xmm3, %xmm1
; X86-SSE2-NEXT: psrlw $1, %xmm3
; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pslld $23, %xmm4
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm5, %xmm4
-; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm4
-; X86-SSE2-NEXT: pslld $16, %xmm4
-; X86-SSE2-NEXT: psrad $16, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pslld $23, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE2-NEXT: paddd %xmm4, %xmm1
+; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; X86-SSE2-NEXT: pslld $16, %xmm1
+; X86-SSE2-NEXT: psrad $16, %xmm1
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
-; X86-SSE2-NEXT: paddd %xmm5, %xmm2
+; X86-SSE2-NEXT: paddd %xmm4, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT: pslld $16, %xmm2
; X86-SSE2-NEXT: psrad $16, %xmm2
-; X86-SSE2-NEXT: packssdw %xmm4, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm1, %xmm2
; X86-SSE2-NEXT: paddw %xmm0, %xmm0
; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
-; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: psllw $5, %xmm6
-; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: pandn %xmm1, %xmm7
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pandn %xmm4, %xmm7
-; SSE2-NEXT: psrlw $2, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm7
+; SSE2-NEXT: psrlw $2, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm2
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v16i8:
; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
; X86-SSE2-NEXT: pand %xmm5, %xmm6
; X86-SSE2-NEXT: psllw $5, %xmm6
-; X86-SSE2-NEXT: pxor %xmm3, %xmm3
; X86-SSE2-NEXT: pxor %xmm4, %xmm4
-; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm4, %xmm7
+; X86-SSE2-NEXT: pxor %xmm3, %xmm3
+; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
; X86-SSE2-NEXT: pandn %xmm1, %xmm7
; X86-SSE2-NEXT: psrlw $4, %xmm1
-; X86-SSE2-NEXT: pand %xmm1, %xmm4
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
-; X86-SSE2-NEXT: por %xmm7, %xmm4
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT: por %xmm7, %xmm3
; X86-SSE2-NEXT: paddb %xmm6, %xmm6
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
-; X86-SSE2-NEXT: pandn %xmm4, %xmm7
-; X86-SSE2-NEXT: psrlw $2, %xmm4
-; X86-SSE2-NEXT: pand %xmm1, %xmm4
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
-; X86-SSE2-NEXT: por %xmm7, %xmm4
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: psrlw $2, %xmm3
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT: por %xmm7, %xmm3
; X86-SSE2-NEXT: paddb %xmm6, %xmm6
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
-; X86-SSE2-NEXT: pandn %xmm4, %xmm6
-; X86-SSE2-NEXT: psrlw $1, %xmm4
-; X86-SSE2-NEXT: pand %xmm1, %xmm4
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
-; X86-SSE2-NEXT: por %xmm6, %xmm4
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: psrlw $1, %xmm3
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT: por %xmm6, %xmm3
; X86-SSE2-NEXT: pandn %xmm5, %xmm2
; X86-SSE2-NEXT: psllw $5, %xmm2
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm5, %xmm0
; X86-SSE2-NEXT: paddb %xmm2, %xmm2
-; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
-; X86-SSE2-NEXT: por %xmm4, %xmm1
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
-; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
ret <16 x i8> %res
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddw %xmm0, %xmm0
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v8i16:
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: paddw %xmm0, %xmm0
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm2, %xmm0
-; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
ret <8 x i16> %res
; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm11[1]
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535]
; SSE-NEXT: pandn %xmm13, %xmm11
+; SSE-NEXT: movdqa %xmm6, %xmm12
+; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: por %xmm11, %xmm12
; SSE-NEXT: psrld $16, %xmm9
-; SSE-NEXT: movdqa %xmm4, %xmm12
-; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
+; SSE-NEXT: movdqa %xmm4, %xmm11
+; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE-NEXT: movdqa %xmm0, %xmm13
-; SSE-NEXT: pandn %xmm12, %xmm13
-; SSE-NEXT: movdqa %xmm6, %xmm9
-; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: por %xmm11, %xmm13
-; SSE-NEXT: por %xmm9, %xmm13
+; SSE-NEXT: pandn %xmm11, %xmm13
+; SSE-NEXT: por %xmm12, %xmm13
; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,0]
; SSE-NEXT: pand %xmm9, %xmm13
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[3,3,3,3]
;
; AVX1-ONLY-LABEL: store_i8_stride5_vf16:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2
-; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3
+; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2
+; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3
+; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1
; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4
; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[6],zero,xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9]
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9],zero,xmm1[u]
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u]
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u]
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255>
; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[10,11],zero,zero,zero,xmm6[12,13],zero,zero,zero,xmm6[14,15],zero
-; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[10,11],zero,zero,zero,xmm7[12,13],zero,zero,zero,xmm7[14,15],zero,zero,zero
-; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm9[0,1],zero,zero,zero,xmm9[2,3],zero,zero,zero,xmm9[4,5],zero,zero
-; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1],zero,zero,zero,xmm11[2,3],zero,zero,zero,xmm11[4,5],zero,zero,zero,xmm11[6]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero
-; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm10
-; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6,7],zero,zero,zero,xmm9[8,9],zero,zero,zero,xmm9[10,11],zero,zero,zero
-; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6],zero,zero,zero,xmm1[9,8],zero,zero,zero,xmm1[11,10],zero,zero,zero,xmm1[13,12]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero
-; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2
-; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero
-; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2],zero,zero,zero,xmm3[5,4],zero,zero,zero,xmm3[7,6],zero,zero,zero,xmm3[9,8]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero
-; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9)
-; AVX1-ONLY-NEXT: vmovdqa %xmm1, 16(%r9)
-; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%r9)
+; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6
+; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm8[0,1],zero,zero,zero,xmm8[2,3],zero,zero,zero,xmm8[4,5],zero,zero
+; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,zero,zero,xmm10[2,3],zero,zero,zero,xmm10[4,5],zero,zero,zero,xmm10[6]
+; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero
+; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6,7],zero,zero,zero,xmm8[8,9],zero,zero,zero,xmm8[10,11],zero,zero,zero
+; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,zero,zero,xmm2[9,8],zero,zero,zero,xmm2[11,10],zero,zero,zero,xmm2[13,12]
+; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero
+; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero
+; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,xmm1[5,4],zero,zero,zero,xmm1[7,6],zero,zero,zero,xmm1[9,8]
+; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero
+; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%r9)
+; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%r9)
+; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%r9)
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0
-; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%r9)
; AVX1-ONLY-NEXT: retq
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero
; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
+; AVX2-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
-; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9)
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
+; AVX2-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%r9)
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero
; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9)
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpor %ymm2, %ymm6, %ymm2
+; AVX2-SLOW-NEXT: vpor %ymm6, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX2-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax)
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm6, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax)
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
-; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,3,2]
+; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE2: # %bb.0:
; SSE2-NEXT: psrlq $60, %xmm2
; SSE2-NEXT: psrlq $60, %xmm0
+; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: psrlq $60, %xmm3
; SSE2-NEXT: psrlq $60, %xmm1
; SSE2-NEXT: paddq %xmm3, %xmm1
-; SSE2-NEXT: paddq %xmm2, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE41: # %bb.0:
; SSE41-NEXT: psrlq $60, %xmm2
; SSE41-NEXT: psrlq $60, %xmm0
+; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: psrlq $60, %xmm3
; SSE41-NEXT: psrlq $60, %xmm1
; SSE41-NEXT: paddq %xmm3, %xmm1
-; SSE41-NEXT: paddq %xmm2, %xmm1
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddq %xmm1, %xmm0
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [1,1]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: paddq %xmm5, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: paddq %xmm7, %xmm3
-; SSE2-NEXT: paddq %xmm5, %xmm3
; SSE2-NEXT: paddq %xmm1, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: paddq %xmm6, %xmm2
-; SSE2-NEXT: paddq %xmm4, %xmm2
-; SSE2-NEXT: paddq %xmm3, %xmm2
; SSE2-NEXT: paddq %xmm0, %xmm2
+; SSE2-NEXT: paddq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [1,1]
; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: pand %xmm8, %xmm1
+; SSE41-NEXT: paddq %xmm5, %xmm1
; SSE41-NEXT: pand %xmm8, %xmm7
; SSE41-NEXT: pand %xmm8, %xmm3
; SSE41-NEXT: paddq %xmm7, %xmm3
-; SSE41-NEXT: paddq %xmm5, %xmm3
; SSE41-NEXT: paddq %xmm1, %xmm3
; SSE41-NEXT: pand %xmm8, %xmm4
; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm6
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: paddq %xmm6, %xmm2
-; SSE41-NEXT: paddq %xmm4, %xmm2
-; SSE41-NEXT: paddq %xmm3, %xmm2
; SSE41-NEXT: paddq %xmm0, %xmm2
+; SSE41-NEXT: paddq %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: movq %xmm0, %rax
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: pand %xmm4, %xmm1
; SSE41-NEXT: paddd %xmm3, %xmm1
-; SSE41-NEXT: paddd %xmm2, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddd %xmm1, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: paddd %xmm6, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: pand %xmm8, %xmm1
+; SSE41-NEXT: paddd %xmm5, %xmm1
; SSE41-NEXT: pand %xmm8, %xmm7
; SSE41-NEXT: pand %xmm8, %xmm3
; SSE41-NEXT: paddd %xmm7, %xmm3
-; SSE41-NEXT: paddd %xmm5, %xmm3
; SSE41-NEXT: paddd %xmm1, %xmm3
; SSE41-NEXT: pand %xmm8, %xmm4
; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: paddd %xmm4, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm6
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: paddd %xmm6, %xmm2
-; SSE41-NEXT: paddd %xmm4, %xmm2
-; SSE41-NEXT: paddd %xmm3, %xmm2
; SSE41-NEXT: paddd %xmm0, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm5
+; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-FAST-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm5
+; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm2
; SSE2-NEXT: paddq %xmm6, %xmm2
; SSE2-NEXT: psadbw %xmm1, %xmm4
-; SSE2-NEXT: paddq %xmm2, %xmm4
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE41-NEXT: psadbw %xmm1, %xmm2
; SSE41-NEXT: paddq %xmm6, %xmm2
; SSE41-NEXT: psadbw %xmm1, %xmm4
-; SSE41-NEXT: paddq %xmm2, %xmm4
; SSE41-NEXT: psadbw %xmm1, %xmm0
; SSE41-NEXT: paddq %xmm4, %xmm0
+; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE2-NEXT: paddq %xmm4, %xmm5
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: paddq %xmm1, %xmm0
-; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: pmovsxbq %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE41-NEXT: pmovsxbq %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq $48, %xmm3
-; SSE41-NEXT: pmovsxbq %xmm3, %xmm3
+; SSE41-NEXT: paddq %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlq $48, %xmm1
+; SSE41-NEXT: pmovsxbq %xmm1, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pmovsxbq %xmm0, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm0
-; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
+; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; SSE2-LABEL: test_v16i64_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: psrad $24, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
; SSE2-NEXT: movdqa %xmm0, %xmm8
; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; SSE2-NEXT: paddq %xmm5, %xmm8
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $24, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $24, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm11
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
-; SSE2-NEXT: paddq %xmm10, %xmm11
-; SSE2-NEXT: paddq %xmm5, %xmm11
-; SSE2-NEXT: paddq %xmm8, %xmm11
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
+; SSE2-NEXT: paddq %xmm9, %xmm10
+; SSE2-NEXT: paddq %xmm8, %xmm10
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
+; SSE2-NEXT: paddq %xmm2, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE2-NEXT: paddq %xmm4, %xmm6
-; SSE2-NEXT: paddq %xmm1, %xmm6
-; SSE2-NEXT: paddq %xmm11, %xmm6
; SSE2-NEXT: paddq %xmm0, %xmm6
+; SSE2-NEXT: paddq %xmm10, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE2-NEXT: paddq %xmm6, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmovsxbq %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq $48, %xmm3
-; SSE41-NEXT: pmovsxbq %xmm3, %xmm3
-; SSE41-NEXT: paddq %xmm2, %xmm3
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
+; SSE41-NEXT: pmovsxbq %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: pmovsxbq %xmm0, %xmm0
-; SSE41-NEXT: paddq %xmm0, %xmm3
; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmovsxbq %xmm1, %xmm0
-; SSE41-NEXT: paddq %xmm0, %xmm3
-; SSE41-NEXT: pmovsxbq %xmm4, %xmm0
-; SSE41-NEXT: pmovsxbq %xmm5, %xmm1
-; SSE41-NEXT: pmovsxbq %xmm6, %xmm4
-; SSE41-NEXT: paddq %xmm1, %xmm4
-; SSE41-NEXT: paddq %xmm0, %xmm4
-; SSE41-NEXT: paddq %xmm3, %xmm4
-; SSE41-NEXT: paddq %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE41-NEXT: pmovsxbq %xmm1, %xmm1
+; SSE41-NEXT: paddq %xmm0, %xmm1
+; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovsxbq %xmm2, %xmm0
+; SSE41-NEXT: psrlq $48, %xmm3
+; SSE41-NEXT: pmovsxbq %xmm3, %xmm2
+; SSE41-NEXT: paddq %xmm0, %xmm2
+; SSE41-NEXT: paddq %xmm1, %xmm2
+; SSE41-NEXT: pmovsxbq %xmm5, %xmm0
; SSE41-NEXT: paddq %xmm4, %xmm0
+; SSE41-NEXT: pmovsxbq %xmm6, %xmm1
+; SSE41-NEXT: pmovsxbq %xmm7, %xmm3
+; SSE41-NEXT: paddq %xmm1, %xmm3
+; SSE41-NEXT: paddq %xmm0, %xmm3
+; SSE41-NEXT: paddq %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[3,3,3,3]
-; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3]
-; AVX1-NEXT: vpmovsxwq %xmm7, %xmm7
-; AVX1-NEXT: vpaddq %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,3,3]
+; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,3,3]
+; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: psrad $24, %xmm3
+; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbd %xmm2, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; SSE41-NEXT: pmovsxbd %xmm3, %xmm3
+; SSE41-NEXT: paddd %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE41-NEXT: pmovsxbd %xmm1, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: paddd %xmm3, %xmm0
-; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; SSE2-NEXT: psrad $24, %xmm5
+; SSE2-NEXT: paddd %xmm3, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; SSE2-NEXT: psrad $24, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: psrad $24, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: psrad $24, %xmm7
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: paddd %xmm5, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: psrad $24, %xmm6
+; SSE2-NEXT: paddd %xmm3, %xmm6
+; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: psrad $24, %xmm3
+; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: paddd %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT: pmovsxbd %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE41-NEXT: pmovsxbd %xmm3, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
+; SSE41-NEXT: paddd %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; SSE41-NEXT: pmovsxbd %xmm2, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; SSE41-NEXT: pmovsxbd %xmm4, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; SSE41-NEXT: pmovsxbd %xmm5, %xmm5
-; SSE41-NEXT: paddd %xmm4, %xmm5
-; SSE41-NEXT: paddd %xmm2, %xmm5
-; SSE41-NEXT: paddd %xmm3, %xmm5
+; SSE41-NEXT: paddd %xmm2, %xmm4
+; SSE41-NEXT: paddd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxbd %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbd %xmm3, %xmm3
; SSE41-NEXT: paddd %xmm2, %xmm3
; SSE41-NEXT: pmovsxbd %xmm1, %xmm1
-; SSE41-NEXT: paddd %xmm3, %xmm1
-; SSE41-NEXT: paddd %xmm5, %xmm1
; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm3, %xmm0
+; SSE41-NEXT: paddd %xmm4, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
-; AVX1-SLOW-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; AVX1-SLOW-NEXT: vpmovsxbd %xmm5, %xmm5
-; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4
-; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vpmovsxbd %xmm4, %xmm4
+; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-SLOW-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
-; AVX1-FAST-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT: vpmovsxbd %xmm5, %xmm5
-; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4
-; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vpmovsxbd %xmm4, %xmm4
+; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-FAST-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: paddw %xmm2, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: paddw %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
; SSE41-NEXT: paddw %xmm2, %xmm3
; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm1
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
+; SSE41-NEXT: paddw %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
+; SSE2-NEXT: paddw %xmm4, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15]
; SSE2-NEXT: psraw $8, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
-; SSE2-NEXT: psraw $8, %xmm7
-; SSE2-NEXT: paddw %xmm6, %xmm7
-; SSE2-NEXT: paddw %xmm4, %xmm7
-; SSE2-NEXT: paddw %xmm5, %xmm7
+; SSE2-NEXT: paddw %xmm4, %xmm6
+; SSE2-NEXT: paddw %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: paddw %xmm3, %xmm1
; SSE2-NEXT: paddw %xmm2, %xmm1
-; SSE2-NEXT: paddw %xmm7, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw %xmm2, %xmm4
; SSE41-NEXT: pmovsxbw %xmm0, %xmm5
-; SSE41-NEXT: pmovsxbw %xmm3, %xmm6
-; SSE41-NEXT: pmovsxbw %xmm1, %xmm7
-; SSE41-NEXT: paddw %xmm6, %xmm7
-; SSE41-NEXT: paddw %xmm4, %xmm7
+; SSE41-NEXT: paddw %xmm4, %xmm5
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm4
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm6
+; SSE41-NEXT: paddw %xmm4, %xmm6
+; SSE41-NEXT: paddw %xmm5, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
+; SSE41-NEXT: paddw %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm1
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: paddw %xmm7, %xmm1
-; SSE41-NEXT: paddw %xmm5, %xmm1
+; SSE41-NEXT: paddw %xmm6, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm2
; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm3
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-SLOW-NEXT: vpmovsxbw %xmm4, %xmm5
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-SLOW-NEXT: vpmovsxbw %xmm6, %xmm7
-; AVX1-SLOW-NEXT: vpaddw %xmm5, %xmm7, %xmm5
-; AVX1-SLOW-NEXT: vpaddw %xmm5, %xmm2, %xmm2
+; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-SLOW-NEXT: vpmovsxbw %xmm3, %xmm4
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-SLOW-NEXT: vpmovsxbw %xmm5, %xmm6
+; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm6, %xmm4
+; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX1-SLOW-NEXT: vpmovsxbw %xmm4, %xmm4
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; AVX1-SLOW-NEXT: vpmovsxbw %xmm5, %xmm5
-; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm5, %xmm4
-; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; AVX1-SLOW-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm3, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm2
; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm3
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-FAST-NEXT: vpmovsxbw %xmm4, %xmm5
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-FAST-NEXT: vpmovsxbw %xmm6, %xmm7
-; AVX1-FAST-NEXT: vpaddw %xmm5, %xmm7, %xmm5
-; AVX1-FAST-NEXT: vpaddw %xmm5, %xmm2, %xmm2
+; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-FAST-NEXT: vpmovsxbw %xmm3, %xmm4
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-FAST-NEXT: vpmovsxbw %xmm5, %xmm6
+; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm6, %xmm4
+; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX1-FAST-NEXT: vpmovsxbw %xmm4, %xmm4
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; AVX1-FAST-NEXT: vpmovsxbw %xmm5, %xmm5
-; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm5, %xmm4
-; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; AVX1-FAST-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm3, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX2-NEXT: vpaddw %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpaddw %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pcmpgtb %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpgtb %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: paddb %xmm0, %xmm3
-; SSE2-NEXT: paddb %xmm5, %xmm3
; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: pcmpgtb %xmm2, %xmm5
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pcmpgtb %xmm0, %xmm2
+; SSE41-NEXT: paddb %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpgtb %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
; SSE41-NEXT: paddb %xmm0, %xmm3
-; SSE41-NEXT: paddb %xmm5, %xmm3
; SSE41-NEXT: paddb %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE41-NEXT: paddb %xmm3, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm4
+; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; SSE2-NEXT: pcmpgtb %xmm4, %xmm9
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
+; SSE2-NEXT: paddb %xmm9, %xmm4
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpgtb %xmm6, %xmm0
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm2, %xmm6
; SSE2-NEXT: paddb %xmm0, %xmm6
-; SSE2-NEXT: paddb %xmm9, %xmm6
+; SSE2-NEXT: paddb %xmm4, %xmm6
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpgtb %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtb %xmm3, %xmm5
-; SSE2-NEXT: paddb %xmm1, %xmm5
-; SSE2-NEXT: paddb %xmm0, %xmm5
-; SSE2-NEXT: paddb %xmm2, %xmm5
-; SSE2-NEXT: paddb %xmm6, %xmm5
-; SSE2-NEXT: paddb %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; SSE2-NEXT: paddb %xmm5, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
+; SSE2-NEXT: paddb %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psadbw %xmm8, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: pcmpgtb %xmm4, %xmm9
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pcmpgtb %xmm0, %xmm4
+; SSE41-NEXT: paddb %xmm9, %xmm4
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpgtb %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: pcmpgtb %xmm2, %xmm6
; SSE41-NEXT: paddb %xmm0, %xmm6
-; SSE41-NEXT: paddb %xmm9, %xmm6
+; SSE41-NEXT: paddb %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpgtb %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pcmpgtb %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pcmpgtb %xmm7, %xmm1
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtb %xmm3, %xmm5
-; SSE41-NEXT: paddb %xmm1, %xmm5
-; SSE41-NEXT: paddb %xmm0, %xmm5
-; SSE41-NEXT: paddb %xmm2, %xmm5
-; SSE41-NEXT: paddb %xmm6, %xmm5
-; SSE41-NEXT: paddb %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; SSE41-NEXT: paddb %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
+; SSE41-NEXT: paddb %xmm6, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: psadbw %xmm8, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm6
-; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm7
-; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm8
-; AVX1-NEXT: vpaddb %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpaddb %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm6
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm7
+; AVX1-NEXT: vpaddb %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpgtb %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpcmpgtb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtb %ymm3, %ymm4, %ymm2
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; SSE-NEXT: psadbw %xmm4, %xmm1
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: psadbw %xmm4, %xmm2
-; SSE-NEXT: paddq %xmm1, %xmm2
; SSE-NEXT: psadbw %xmm4, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; AVX1-NEXT: vpsadbw %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: paddq %xmm2, %xmm1
-; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: paddq %xmm0, %xmm1
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm6, %xmm2
-; SSE-NEXT: paddq %xmm7, %xmm3
-; SSE-NEXT: paddq %xmm5, %xmm3
-; SSE-NEXT: paddq %xmm1, %xmm3
-; SSE-NEXT: paddq %xmm4, %xmm2
-; SSE-NEXT: paddq %xmm3, %xmm2
-; SSE-NEXT: paddq %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: paddq %xmm7, %xmm3
+; SSE-NEXT: paddq %xmm5, %xmm1
+; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: paddq %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm3, %xmm1
-; SSE-NEXT: paddd %xmm2, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16i32:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; SSE-LABEL: test_v32i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm6, %xmm2
-; SSE-NEXT: paddd %xmm7, %xmm3
-; SSE-NEXT: paddd %xmm5, %xmm3
-; SSE-NEXT: paddd %xmm1, %xmm3
-; SSE-NEXT: paddd %xmm4, %xmm2
-; SSE-NEXT: paddd %xmm3, %xmm2
-; SSE-NEXT: paddd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm7, %xmm3
+; SSE-NEXT: paddd %xmm5, %xmm1
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; AVX1-SLOW-LABEL: test_v32i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm5
+; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-LABEL: test_v32i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm5
+; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm3, %xmm1
-; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm6, %xmm2
-; SSE-NEXT: paddw %xmm7, %xmm3
-; SSE-NEXT: paddw %xmm5, %xmm3
-; SSE-NEXT: paddw %xmm1, %xmm3
-; SSE-NEXT: paddw %xmm4, %xmm2
-; SSE-NEXT: paddw %xmm3, %xmm2
-; SSE-NEXT: paddw %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: paddw %xmm4, %xmm0
; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: paddw %xmm7, %xmm3
+; SSE-NEXT: paddw %xmm5, %xmm1
+; SSE-NEXT: paddw %xmm3, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-SLOW-LABEL: test_v64i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4
+; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm5
+; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-LABEL: test_v64i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4
+; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm5
+; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm5, %xmm4
; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm3, %xmm1
-; SSE-NEXT: paddb %xmm2, %xmm1
-; SSE-NEXT: paddb %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: paddb %xmm1, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: psadbw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: paddb %xmm0, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psadbw %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SSE-LABEL: test_v128i8:
; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm7, %xmm3
-; SSE-NEXT: paddb %xmm5, %xmm3
-; SSE-NEXT: paddb %xmm1, %xmm3
+; SSE-NEXT: paddb %xmm5, %xmm1
+; SSE-NEXT: paddb %xmm3, %xmm1
; SSE-NEXT: paddb %xmm6, %xmm2
-; SSE-NEXT: paddb %xmm4, %xmm2
-; SSE-NEXT: paddb %xmm3, %xmm2
-; SSE-NEXT: paddb %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: paddb %xmm4, %xmm0
; SSE-NEXT: paddb %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: psadbw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: paddb %xmm0, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psadbw %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; SSE-LABEL: trunc_v64i8_v64i1:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: psllw $7, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: cmpw $-1, %ax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41-LABEL: icmp0_v8i64_v8i1:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41-LABEL: icmp0_v16i32_v16i1:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41-LABEL: icmp0_v32i16_v32i1:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; SSE2-LABEL: icmp0_v64i8_v64i1:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm3, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE41-LABEL: icmp0_v64i8_v64i1:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm4, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: psubb %xmm4, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm4, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: psubb %xmm4, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm4, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: psubb %xmm4, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; SSE2-LABEL: icmp1_v64i8_v64i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE41-LABEL: icmp1_v64i8_v64i1:
; SSE41: # %bb.0:
; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm3, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: psubb %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; SSE-NEXT: pcmpeqb %xmm5, %xmm1
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pcmpeqb %xmm6, %xmm2
-; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: cmpw $-1, %ax
; SSE-NEXT: sete %al
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: cmpw $-1, %ax
; AVX1-NEXT: sete %al
; SSE-NEXT: pcmpeqb %xmm5, %xmm1
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pcmpeqb %xmm6, %xmm2
-; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: cmpw $-1, %ax
; SSE-NEXT: sete %al
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: cmpw $-1, %ax
; AVX1-NEXT: sete %al
; SSE-NEXT: pcmpeqb %xmm5, %xmm1
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pcmpeqb %xmm6, %xmm2
-; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: cmpw $-1, %ax
; SSE-NEXT: sete %al
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: cmpw $-1, %ax
; AVX1-NEXT: sete %al
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqb %xmm6, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pcmpeqb %xmm7, %xmm3
; SSE-NEXT: pcmpeqb %xmm5, %xmm1
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pmovmskb %xmm1, %eax
; SSE-NEXT: cmpw $-1, %ax
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: cmpw $-1, %ax
; AVX1-NEXT: sete %al
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpand %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: setne %al
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: testl %eax, %eax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
; SSE-LABEL: test_v32i32:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: testl %eax, %eax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: testl %eax, %eax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
; SSE-LABEL: test_v128i8:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: test_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; SSE-LABEL: test_v32i32:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; SSE-LABEL: test_v128i8:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: test_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; SSE41-LABEL: test_v16i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld %xmm3, %xmm1
-; SSE41-NEXT: pmulld %xmm2, %xmm1
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-LABEL: test_v32i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld %xmm6, %xmm2
-; SSE41-NEXT: pmulld %xmm7, %xmm3
-; SSE41-NEXT: pmulld %xmm5, %xmm3
-; SSE41-NEXT: pmulld %xmm1, %xmm3
-; SSE41-NEXT: pmulld %xmm4, %xmm2
-; SSE41-NEXT: pmulld %xmm3, %xmm2
-; SSE41-NEXT: pmulld %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE41-NEXT: pmulld %xmm4, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm0
+; SSE41-NEXT: pmulld %xmm7, %xmm3
+; SSE41-NEXT: pmulld %xmm5, %xmm1
+; SSE41-NEXT: pmulld %xmm3, %xmm1
+; SSE41-NEXT: pmulld %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pmullw %xmm3, %xmm1
-; SSE-NEXT: pmullw %xmm2, %xmm1
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pmullw %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: pmullw %xmm6, %xmm2
-; SSE-NEXT: pmullw %xmm7, %xmm3
-; SSE-NEXT: pmullw %xmm5, %xmm3
-; SSE-NEXT: pmullw %xmm1, %xmm3
-; SSE-NEXT: pmullw %xmm4, %xmm2
-; SSE-NEXT: pmullw %xmm3, %xmm2
-; SSE-NEXT: pmullw %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pmullw %xmm4, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm7, %xmm3
+; SSE-NEXT: pmullw %xmm5, %xmm1
+; SSE-NEXT: pmullw %xmm3, %xmm1
+; SSE-NEXT: pmullw %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm0
+; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; SSE2-NEXT: pmullw %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm3, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm4, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm2, %xmm3
-; SSE2-NEXT: pmullw %xmm1, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT: pmullw %xmm4, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: pmullw %xmm4, %xmm1
+; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm3, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT: pmullw %xmm3, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: pmullw %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; SSE2-NEXT: pmullw %xmm8, %xmm7
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm7, %xmm8
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm8, %xmm3
+; SSE2-NEXT: pmullw %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm7, %xmm5
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm5, %xmm7
-; SSE2-NEXT: pmullw %xmm3, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm7, %xmm1
+; SSE2-NEXT: pmullw %xmm5, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm6
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm6, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: pmullw %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm4, %xmm3
-; SSE2-NEXT: pmullw %xmm2, %xmm3
-; SSE2-NEXT: pmullw %xmm1, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: pmullw %xmm4, %xmm0
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT: pmullw %xmm8, %xmm7
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm7, %xmm3
; SSE41-NEXT: pmullw %xmm8, %xmm3
+; SSE41-NEXT: pmullw %xmm7, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm7, %xmm5
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT: pmullw %xmm7, %xmm1
; SSE41-NEXT: pmullw %xmm5, %xmm1
; SSE41-NEXT: pmullw %xmm3, %xmm1
-; SSE41-NEXT: pmullw %xmm7, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm3, %xmm6
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm6, %xmm2
; SSE41-NEXT: pmullw %xmm3, %xmm2
+; SSE41-NEXT: pmullw %xmm6, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm3, %xmm4
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT: pmullw %xmm3, %xmm0
; SSE41-NEXT: pmullw %xmm4, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: pmullw %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpmullw %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm2, %zmm1
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT: vpmullw %ymm2, %ymm4, %ymm2
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm4, %ymm2
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm3, %ymm1
-; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-LABEL: trunc_v64i8_v64i1:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: psllw $7, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: testl %eax, %eax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: pcmpeqb %xmm4, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pcmpeqb %xmm4, %xmm3
; SSE-NEXT: pcmpeqb %xmm4, %xmm1
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pmovmskb %xmm1, %eax
; SSE-NEXT: testl %eax, %eax
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: setne %al
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqb %xmm6, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pcmpeqb %xmm7, %xmm3
; SSE-NEXT: pcmpeqb %xmm5, %xmm1
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pmovmskb %xmm1, %eax
; SSE-NEXT: testl %eax, %eax
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: setne %al
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; SSE2-LABEL: test_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41-LABEL: test_v8i64:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; SSE2-LABEL: test_v16i64:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
; SSE41-LABEL: test_v16i64:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm7, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm3
-; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: por %xmm6, %xmm2
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: ptest %xmm2, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vptest %ymm0, %ymm0
; AVX1-NEXT: setne %al
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vptest %ymm0, %ymm0
; AVX2-NEXT: setne %al
; SSE2-LABEL: test_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
; SSE41-LABEL: test_v16i32:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; SSE2-LABEL: test_v32i32:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41-LABEL: test_v32i32:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm7, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm3
-; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: por %xmm6, %xmm2
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: ptest %xmm2, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vptest %ymm0, %ymm0
; AVX1-NEXT: sete %al
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vptest %ymm0, %ymm0
; AVX2-NEXT: sete %al
; SSE2-LABEL: test_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41-LABEL: test_v32i16:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; SSE2-LABEL: test_v64i16:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
; SSE41-LABEL: test_v64i16:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm7, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm3
-; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: por %xmm6, %xmm2
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: ptest %xmm2, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vptest %ymm0, %ymm0
; AVX1-NEXT: setne %al
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vptest %ymm0, %ymm0
; AVX2-NEXT: setne %al
; SSE2-LABEL: test_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
; SSE41-LABEL: test_v64i8:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; SSE2-LABEL: test_v128i8:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
; SSE41-LABEL: test_v128i8:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm7, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm3
-; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: por %xmm6, %xmm2
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: ptest %xmm2, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vptest %ymm0, %ymm0
; AVX1-NEXT: sete %al
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vptest %ymm0, %ymm0
; AVX2-NEXT: sete %al
; SSE2-LABEL: mask_v128i8:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: psllw $7, %xmm2
-; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psllw $7, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE41-LABEL: mask_v128i8:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm7, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm3
-; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: por %xmm6, %xmm2
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX1-LABEL: mask_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
; AVX1-NEXT: sete %al
; AVX2-LABEL: mask_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673]
; AVX2-NEXT: vptest %ymm1, %ymm0
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: movd %xmm0, %ecx
+; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movd %xmm0, %edx
-; SSE2-NEXT: orl %ecx, %edx
-; SSE2-NEXT: orl %eax, %edx
-; SSE2-NEXT: testb $1, %dl
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: orl %ecx, %eax
+; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je .LBB27_2
; SSE2-NEXT: # %bb.1:
; SSE2-NEXT: xorl %eax, %eax
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pextrd $1, %xmm1, %eax
; SSE41-NEXT: movd %xmm1, %ecx
-; SSE41-NEXT: pextrd $2, %xmm1, %edx
-; SSE41-NEXT: orl %eax, %edx
-; SSE41-NEXT: orl %ecx, %edx
-; SSE41-NEXT: testb $1, %dl
+; SSE41-NEXT: orl %eax, %ecx
+; SSE41-NEXT: pextrd $2, %xmm1, %eax
+; SSE41-NEXT: orl %ecx, %eax
+; SSE41-NEXT: testb $1, %al
; SSE41-NEXT: je .LBB27_2
; SSE41-NEXT: # %bb.1:
; SSE41-NEXT: xorl %eax, %eax
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: vpextrd $2, %xmm0, %edx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: orl %ecx, %edx
-; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: orl %eax, %ecx
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: orl %ecx, %eax
+; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB27_2
; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: xorl %eax, %eax
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrd $1, %xmm0, %eax
; AVX2-NEXT: vmovd %xmm0, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm0, %edx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: orl %ecx, %edx
-; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: orl %eax, %ecx
+; AVX2-NEXT: vpextrd $2, %xmm0, %eax
+; AVX2-NEXT: orl %ecx, %eax
+; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB27_2
; AVX2-NEXT: # %bb.1:
; AVX2-NEXT: xorl %eax, %eax
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; SSE-LABEL: test_v32i32:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; SSE-LABEL: test_v128i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: test_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v16i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxsd %xmm3, %xmm1
-; SSE4-NEXT: pmaxsd %xmm2, %xmm1
-; SSE4-NEXT: pmaxsd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT: pmaxsd %xmm2, %xmm0
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
-; SSE4-NEXT: movd %xmm1, %eax
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT: pmaxsd %xmm1, %xmm0
+; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-LABEL: test_v32i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxsd %xmm6, %xmm2
-; SSE4-NEXT: pmaxsd %xmm7, %xmm3
-; SSE4-NEXT: pmaxsd %xmm5, %xmm3
-; SSE4-NEXT: pmaxsd %xmm1, %xmm3
-; SSE4-NEXT: pmaxsd %xmm4, %xmm2
-; SSE4-NEXT: pmaxsd %xmm3, %xmm2
-; SSE4-NEXT: pmaxsd %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT: pmaxsd %xmm4, %xmm0
; SSE4-NEXT: pmaxsd %xmm2, %xmm0
+; SSE4-NEXT: pmaxsd %xmm7, %xmm3
+; SSE4-NEXT: pmaxsd %xmm5, %xmm1
+; SSE4-NEXT: pmaxsd %xmm3, %xmm1
+; SSE4-NEXT: pmaxsd %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT: pmaxsd %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
; SSE4-NEXT: movd %xmm1, %eax
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpmaxsd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsd %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; SSE2-LABEL: test_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pmaxsw %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pmaxsw %xmm2, %xmm0
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_v32i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxsw %xmm3, %xmm1
-; SSE4-NEXT: pmaxsw %xmm2, %xmm1
-; SSE4-NEXT: pmaxsw %xmm0, %xmm1
-; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE4-NEXT: phminposuw %xmm1, %xmm0
+; SSE4-NEXT: pmaxsw %xmm2, %xmm0
+; SSE4-NEXT: pmaxsw %xmm1, %xmm0
+; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: phminposuw %xmm0, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: xorl $32767, %eax # imm = 0x7FFF
; SSE4-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; SSE2-LABEL: test_v64i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaxsw %xmm6, %xmm2
-; SSE2-NEXT: pmaxsw %xmm7, %xmm3
-; SSE2-NEXT: pmaxsw %xmm5, %xmm3
-; SSE2-NEXT: pmaxsw %xmm1, %xmm3
-; SSE2-NEXT: pmaxsw %xmm4, %xmm2
-; SSE2-NEXT: pmaxsw %xmm3, %xmm2
-; SSE2-NEXT: pmaxsw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT: pmaxsw %xmm4, %xmm0
; SSE2-NEXT: pmaxsw %xmm2, %xmm0
+; SSE2-NEXT: pmaxsw %xmm7, %xmm3
+; SSE2-NEXT: pmaxsw %xmm5, %xmm1
+; SSE2-NEXT: pmaxsw %xmm3, %xmm1
+; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pmaxsw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE4-LABEL: test_v64i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxsw %xmm7, %xmm3
-; SSE4-NEXT: pmaxsw %xmm5, %xmm3
-; SSE4-NEXT: pmaxsw %xmm1, %xmm3
+; SSE4-NEXT: pmaxsw %xmm5, %xmm1
+; SSE4-NEXT: pmaxsw %xmm3, %xmm1
; SSE4-NEXT: pmaxsw %xmm6, %xmm2
-; SSE4-NEXT: pmaxsw %xmm4, %xmm2
-; SSE4-NEXT: pmaxsw %xmm3, %xmm2
-; SSE4-NEXT: pmaxsw %xmm0, %xmm2
-; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT: phminposuw %xmm2, %xmm0
+; SSE4-NEXT: pmaxsw %xmm4, %xmm0
+; SSE4-NEXT: pmaxsw %xmm2, %xmm0
+; SSE4-NEXT: pmaxsw %xmm1, %xmm0
+; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: phminposuw %xmm0, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: xorl $32767, %eax # imm = 0x7FFF
; SSE4-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpmaxsw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxsw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxsw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v64i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxsb %xmm3, %xmm1
-; SSE4-NEXT: pmaxsb %xmm2, %xmm1
-; SSE4-NEXT: pmaxsb %xmm0, %xmm1
-; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE4-NEXT: movdqa %xmm1, %xmm0
-; SSE4-NEXT: psrlw $8, %xmm0
-; SSE4-NEXT: pminub %xmm1, %xmm0
-; SSE4-NEXT: phminposuw %xmm0, %xmm0
+; SSE4-NEXT: pmaxsb %xmm2, %xmm0
+; SSE4-NEXT: pmaxsb %xmm1, %xmm0
+; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm1
+; SSE4-NEXT: psrlw $8, %xmm1
+; SSE4-NEXT: pminub %xmm0, %xmm1
+; SSE4-NEXT: phminposuw %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: xorb $127, %al
; SSE4-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxsb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v128i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxsb %xmm7, %xmm3
-; SSE4-NEXT: pmaxsb %xmm5, %xmm3
-; SSE4-NEXT: pmaxsb %xmm1, %xmm3
+; SSE4-NEXT: pmaxsb %xmm5, %xmm1
+; SSE4-NEXT: pmaxsb %xmm3, %xmm1
; SSE4-NEXT: pmaxsb %xmm6, %xmm2
-; SSE4-NEXT: pmaxsb %xmm4, %xmm2
-; SSE4-NEXT: pmaxsb %xmm3, %xmm2
-; SSE4-NEXT: pmaxsb %xmm0, %xmm2
-; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT: movdqa %xmm2, %xmm0
-; SSE4-NEXT: psrlw $8, %xmm0
-; SSE4-NEXT: pminub %xmm2, %xmm0
-; SSE4-NEXT: phminposuw %xmm0, %xmm0
+; SSE4-NEXT: pmaxsb %xmm4, %xmm0
+; SSE4-NEXT: pmaxsb %xmm2, %xmm0
+; SSE4-NEXT: pmaxsb %xmm1, %xmm0
+; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm1
+; SSE4-NEXT: psrlw $8, %xmm1
+; SSE4-NEXT: pminub %xmm0, %xmm1
+; SSE4-NEXT: phminposuw %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: xorb $127, %al
; SSE4-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpmaxsb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxsb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxsb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v16i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pminsd %xmm3, %xmm1
-; SSE4-NEXT: pminsd %xmm2, %xmm1
-; SSE4-NEXT: pminsd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT: pminsd %xmm2, %xmm0
; SSE4-NEXT: pminsd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pminsd %xmm0, %xmm1
-; SSE4-NEXT: movd %xmm1, %eax
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT: pminsd %xmm1, %xmm0
+; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-LABEL: test_v32i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pminsd %xmm6, %xmm2
-; SSE4-NEXT: pminsd %xmm7, %xmm3
-; SSE4-NEXT: pminsd %xmm5, %xmm3
-; SSE4-NEXT: pminsd %xmm1, %xmm3
-; SSE4-NEXT: pminsd %xmm4, %xmm2
-; SSE4-NEXT: pminsd %xmm3, %xmm2
-; SSE4-NEXT: pminsd %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT: pminsd %xmm4, %xmm0
; SSE4-NEXT: pminsd %xmm2, %xmm0
+; SSE4-NEXT: pminsd %xmm7, %xmm3
+; SSE4-NEXT: pminsd %xmm5, %xmm1
+; SSE4-NEXT: pminsd %xmm3, %xmm1
+; SSE4-NEXT: pminsd %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT: pminsd %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pminsd %xmm0, %xmm1
; SSE4-NEXT: movd %xmm1, %eax
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpminsd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpminsd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpminsd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminsd %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminsd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; SSE2-LABEL: test_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pminsw %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pminsw %xmm2, %xmm0
; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pminsw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_v32i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pminsw %xmm3, %xmm1
-; SSE4-NEXT: pminsw %xmm2, %xmm1
-; SSE4-NEXT: pminsw %xmm0, %xmm1
-; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE4-NEXT: phminposuw %xmm1, %xmm0
+; SSE4-NEXT: pminsw %xmm2, %xmm0
+; SSE4-NEXT: pminsw %xmm1, %xmm0
+; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: phminposuw %xmm0, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE4-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; SSE2-LABEL: test_v64i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pminsw %xmm6, %xmm2
-; SSE2-NEXT: pminsw %xmm7, %xmm3
-; SSE2-NEXT: pminsw %xmm5, %xmm3
-; SSE2-NEXT: pminsw %xmm1, %xmm3
-; SSE2-NEXT: pminsw %xmm4, %xmm2
-; SSE2-NEXT: pminsw %xmm3, %xmm2
-; SSE2-NEXT: pminsw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT: pminsw %xmm4, %xmm0
; SSE2-NEXT: pminsw %xmm2, %xmm0
+; SSE2-NEXT: pminsw %xmm7, %xmm3
+; SSE2-NEXT: pminsw %xmm5, %xmm1
+; SSE2-NEXT: pminsw %xmm3, %xmm1
+; SSE2-NEXT: pminsw %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pminsw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: pminsw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE4-LABEL: test_v64i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pminsw %xmm7, %xmm3
-; SSE4-NEXT: pminsw %xmm5, %xmm3
-; SSE4-NEXT: pminsw %xmm1, %xmm3
+; SSE4-NEXT: pminsw %xmm5, %xmm1
+; SSE4-NEXT: pminsw %xmm3, %xmm1
; SSE4-NEXT: pminsw %xmm6, %xmm2
-; SSE4-NEXT: pminsw %xmm4, %xmm2
-; SSE4-NEXT: pminsw %xmm3, %xmm2
-; SSE4-NEXT: pminsw %xmm0, %xmm2
-; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT: phminposuw %xmm2, %xmm0
+; SSE4-NEXT: pminsw %xmm4, %xmm0
+; SSE4-NEXT: pminsw %xmm2, %xmm0
+; SSE4-NEXT: pminsw %xmm1, %xmm0
+; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: phminposuw %xmm0, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE4-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpminsw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminsw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminsw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminsw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminsw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v64i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pminsb %xmm3, %xmm1
-; SSE4-NEXT: pminsb %xmm2, %xmm1
-; SSE4-NEXT: pminsb %xmm0, %xmm1
-; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE4-NEXT: movdqa %xmm1, %xmm0
-; SSE4-NEXT: psrlw $8, %xmm0
-; SSE4-NEXT: pminub %xmm1, %xmm0
-; SSE4-NEXT: phminposuw %xmm0, %xmm0
+; SSE4-NEXT: pminsb %xmm2, %xmm0
+; SSE4-NEXT: pminsb %xmm1, %xmm0
+; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm1
+; SSE4-NEXT: psrlw $8, %xmm1
+; SSE4-NEXT: pminub %xmm0, %xmm1
+; SSE4-NEXT: phminposuw %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: addb $-128, %al
; SSE4-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminsb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v128i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pminsb %xmm7, %xmm3
-; SSE4-NEXT: pminsb %xmm5, %xmm3
-; SSE4-NEXT: pminsb %xmm1, %xmm3
+; SSE4-NEXT: pminsb %xmm5, %xmm1
+; SSE4-NEXT: pminsb %xmm3, %xmm1
; SSE4-NEXT: pminsb %xmm6, %xmm2
-; SSE4-NEXT: pminsb %xmm4, %xmm2
-; SSE4-NEXT: pminsb %xmm3, %xmm2
-; SSE4-NEXT: pminsb %xmm0, %xmm2
-; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT: movdqa %xmm2, %xmm0
-; SSE4-NEXT: psrlw $8, %xmm0
-; SSE4-NEXT: pminub %xmm2, %xmm0
-; SSE4-NEXT: phminposuw %xmm0, %xmm0
+; SSE4-NEXT: pminsb %xmm4, %xmm0
+; SSE4-NEXT: pminsb %xmm2, %xmm0
+; SSE4-NEXT: pminsb %xmm1, %xmm0
+; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm1
+; SSE4-NEXT: psrlw $8, %xmm1
+; SSE4-NEXT: pminub %xmm0, %xmm1
+; SSE4-NEXT: phminposuw %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: addb $-128, %al
; SSE4-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpminsb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminsb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminsb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminsb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminsb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v16i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxud %xmm3, %xmm1
-; SSE4-NEXT: pmaxud %xmm2, %xmm1
-; SSE4-NEXT: pmaxud %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT: pmaxud %xmm2, %xmm0
; SSE4-NEXT: pmaxud %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmaxud %xmm0, %xmm1
-; SSE4-NEXT: movd %xmm1, %eax
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT: pmaxud %xmm1, %xmm0
+; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-LABEL: test_v32i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxud %xmm6, %xmm2
-; SSE4-NEXT: pmaxud %xmm7, %xmm3
-; SSE4-NEXT: pmaxud %xmm5, %xmm3
-; SSE4-NEXT: pmaxud %xmm1, %xmm3
-; SSE4-NEXT: pmaxud %xmm4, %xmm2
-; SSE4-NEXT: pmaxud %xmm3, %xmm2
-; SSE4-NEXT: pmaxud %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT: pmaxud %xmm4, %xmm0
; SSE4-NEXT: pmaxud %xmm2, %xmm0
+; SSE4-NEXT: pmaxud %xmm7, %xmm3
+; SSE4-NEXT: pmaxud %xmm5, %xmm1
+; SSE4-NEXT: pmaxud %xmm3, %xmm1
+; SSE4-NEXT: pmaxud %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT: pmaxud %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmaxud %xmm0, %xmm1
; SSE4-NEXT: movd %xmm1, %eax
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpmaxud %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmaxud %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxud %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v32i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxuw %xmm3, %xmm1
-; SSE4-NEXT: pmaxuw %xmm2, %xmm1
-; SSE4-NEXT: pmaxuw %xmm0, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm1, %xmm0
-; SSE4-NEXT: phminposuw %xmm0, %xmm0
+; SSE4-NEXT: pmaxuw %xmm2, %xmm0
+; SSE4-NEXT: pmaxuw %xmm1, %xmm0
+; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT: pxor %xmm0, %xmm1
+; SSE4-NEXT: phminposuw %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: notl %eax
; SSE4-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; SSE4-LABEL: test_v64i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxuw %xmm7, %xmm3
-; SSE4-NEXT: pmaxuw %xmm5, %xmm3
-; SSE4-NEXT: pmaxuw %xmm1, %xmm3
+; SSE4-NEXT: pmaxuw %xmm5, %xmm1
+; SSE4-NEXT: pmaxuw %xmm3, %xmm1
; SSE4-NEXT: pmaxuw %xmm6, %xmm2
-; SSE4-NEXT: pmaxuw %xmm4, %xmm2
-; SSE4-NEXT: pmaxuw %xmm3, %xmm2
-; SSE4-NEXT: pmaxuw %xmm0, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm2, %xmm0
-; SSE4-NEXT: phminposuw %xmm0, %xmm0
+; SSE4-NEXT: pmaxuw %xmm4, %xmm0
+; SSE4-NEXT: pmaxuw %xmm2, %xmm0
+; SSE4-NEXT: pmaxuw %xmm1, %xmm0
+; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT: pxor %xmm0, %xmm1
+; SSE4-NEXT: phminposuw %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: notl %eax
; SSE4-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpmaxuw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxuw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; SSE2-LABEL: test_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaxub %xmm3, %xmm1
-; SSE2-NEXT: pmaxub %xmm2, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pmaxub %xmm2, %xmm0
; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: pmaxub %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pmaxub %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_v64i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxub %xmm3, %xmm1
-; SSE4-NEXT: pmaxub %xmm2, %xmm1
-; SSE4-NEXT: pmaxub %xmm0, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm0, %xmm1
-; SSE4-NEXT: psrlw $8, %xmm1
-; SSE4-NEXT: pminub %xmm0, %xmm1
-; SSE4-NEXT: phminposuw %xmm1, %xmm0
+; SSE4-NEXT: pmaxub %xmm2, %xmm0
+; SSE4-NEXT: pmaxub %xmm1, %xmm0
+; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT: pxor %xmm0, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: psrlw $8, %xmm0
+; SSE4-NEXT: pminub %xmm1, %xmm0
+; SSE4-NEXT: phminposuw %xmm0, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: notb %al
; SSE4-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; SSE2-LABEL: test_v128i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaxub %xmm6, %xmm2
-; SSE2-NEXT: pmaxub %xmm7, %xmm3
-; SSE2-NEXT: pmaxub %xmm5, %xmm3
-; SSE2-NEXT: pmaxub %xmm1, %xmm3
-; SSE2-NEXT: pmaxub %xmm4, %xmm2
-; SSE2-NEXT: pmaxub %xmm3, %xmm2
-; SSE2-NEXT: pmaxub %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT: pmaxub %xmm4, %xmm0
; SSE2-NEXT: pmaxub %xmm2, %xmm0
+; SSE2-NEXT: pmaxub %xmm7, %xmm3
+; SSE2-NEXT: pmaxub %xmm5, %xmm1
+; SSE2-NEXT: pmaxub %xmm3, %xmm1
+; SSE2-NEXT: pmaxub %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pmaxub %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: pmaxub %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE4-LABEL: test_v128i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxub %xmm7, %xmm3
-; SSE4-NEXT: pmaxub %xmm5, %xmm3
-; SSE4-NEXT: pmaxub %xmm1, %xmm3
+; SSE4-NEXT: pmaxub %xmm5, %xmm1
+; SSE4-NEXT: pmaxub %xmm3, %xmm1
; SSE4-NEXT: pmaxub %xmm6, %xmm2
-; SSE4-NEXT: pmaxub %xmm4, %xmm2
-; SSE4-NEXT: pmaxub %xmm3, %xmm2
-; SSE4-NEXT: pmaxub %xmm0, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm2, %xmm0
-; SSE4-NEXT: movdqa %xmm0, %xmm1
-; SSE4-NEXT: psrlw $8, %xmm1
-; SSE4-NEXT: pminub %xmm0, %xmm1
-; SSE4-NEXT: phminposuw %xmm1, %xmm0
+; SSE4-NEXT: pmaxub %xmm4, %xmm0
+; SSE4-NEXT: pmaxub %xmm2, %xmm0
+; SSE4-NEXT: pmaxub %xmm1, %xmm0
+; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT: pxor %xmm0, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: psrlw $8, %xmm0
+; SSE4-NEXT: pminub %xmm1, %xmm0
+; SSE4-NEXT: phminposuw %xmm0, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: notb %al
; SSE4-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpmaxub %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxub %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxub %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxub %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v16i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pminud %xmm3, %xmm1
-; SSE4-NEXT: pminud %xmm2, %xmm1
-; SSE4-NEXT: pminud %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT: pminud %xmm2, %xmm0
; SSE4-NEXT: pminud %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pminud %xmm0, %xmm1
-; SSE4-NEXT: movd %xmm1, %eax
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT: pminud %xmm1, %xmm0
+; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-LABEL: test_v32i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pminud %xmm6, %xmm2
-; SSE4-NEXT: pminud %xmm7, %xmm3
-; SSE4-NEXT: pminud %xmm5, %xmm3
-; SSE4-NEXT: pminud %xmm1, %xmm3
-; SSE4-NEXT: pminud %xmm4, %xmm2
-; SSE4-NEXT: pminud %xmm3, %xmm2
-; SSE4-NEXT: pminud %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT: pminud %xmm4, %xmm0
; SSE4-NEXT: pminud %xmm2, %xmm0
+; SSE4-NEXT: pminud %xmm7, %xmm3
+; SSE4-NEXT: pminud %xmm5, %xmm1
+; SSE4-NEXT: pminud %xmm3, %xmm1
+; SSE4-NEXT: pminud %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT: pminud %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pminud %xmm0, %xmm1
; SSE4-NEXT: movd %xmm1, %eax
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpminud %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpminud %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminud %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
; SSE4-LABEL: test_v32i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pminuw %xmm3, %xmm1
-; SSE4-NEXT: pminuw %xmm2, %xmm1
-; SSE4-NEXT: pminuw %xmm0, %xmm1
-; SSE4-NEXT: phminposuw %xmm1, %xmm0
+; SSE4-NEXT: pminuw %xmm2, %xmm0
+; SSE4-NEXT: pminuw %xmm1, %xmm0
+; SSE4-NEXT: phminposuw %xmm0, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: # kill: def $ax killed $ax killed $eax
; SSE4-NEXT: retq
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; SSE4-LABEL: test_v64i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pminuw %xmm7, %xmm3
-; SSE4-NEXT: pminuw %xmm5, %xmm3
-; SSE4-NEXT: pminuw %xmm1, %xmm3
+; SSE4-NEXT: pminuw %xmm5, %xmm1
+; SSE4-NEXT: pminuw %xmm3, %xmm1
; SSE4-NEXT: pminuw %xmm6, %xmm2
-; SSE4-NEXT: pminuw %xmm4, %xmm2
-; SSE4-NEXT: pminuw %xmm3, %xmm2
-; SSE4-NEXT: pminuw %xmm0, %xmm2
-; SSE4-NEXT: phminposuw %xmm2, %xmm0
+; SSE4-NEXT: pminuw %xmm4, %xmm0
+; SSE4-NEXT: pminuw %xmm2, %xmm0
+; SSE4-NEXT: pminuw %xmm1, %xmm0
+; SSE4-NEXT: phminposuw %xmm0, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: # kill: def $ax killed $ax killed $eax
; SSE4-NEXT: retq
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpminuw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminuw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminuw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; SSE2-LABEL: test_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pminub %xmm3, %xmm1
-; SSE2-NEXT: pminub %xmm2, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pminub %xmm2, %xmm0
; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: pminub %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pminub %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_v64i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pminub %xmm3, %xmm1
-; SSE4-NEXT: pminub %xmm2, %xmm1
-; SSE4-NEXT: pminub %xmm0, %xmm1
-; SSE4-NEXT: movdqa %xmm1, %xmm0
-; SSE4-NEXT: psrlw $8, %xmm0
+; SSE4-NEXT: pminub %xmm2, %xmm0
; SSE4-NEXT: pminub %xmm1, %xmm0
-; SSE4-NEXT: phminposuw %xmm0, %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm1
+; SSE4-NEXT: psrlw $8, %xmm1
+; SSE4-NEXT: pminub %xmm0, %xmm1
+; SSE4-NEXT: phminposuw %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: # kill: def $al killed $al killed $eax
; SSE4-NEXT: retq
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; SSE2-LABEL: test_v128i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pminub %xmm6, %xmm2
-; SSE2-NEXT: pminub %xmm7, %xmm3
-; SSE2-NEXT: pminub %xmm5, %xmm3
-; SSE2-NEXT: pminub %xmm1, %xmm3
-; SSE2-NEXT: pminub %xmm4, %xmm2
-; SSE2-NEXT: pminub %xmm3, %xmm2
-; SSE2-NEXT: pminub %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT: pminub %xmm4, %xmm0
; SSE2-NEXT: pminub %xmm2, %xmm0
+; SSE2-NEXT: pminub %xmm7, %xmm3
+; SSE2-NEXT: pminub %xmm5, %xmm1
+; SSE2-NEXT: pminub %xmm3, %xmm1
+; SSE2-NEXT: pminub %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pminub %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: pminub %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE4-LABEL: test_v128i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pminub %xmm7, %xmm3
-; SSE4-NEXT: pminub %xmm5, %xmm3
-; SSE4-NEXT: pminub %xmm1, %xmm3
+; SSE4-NEXT: pminub %xmm5, %xmm1
+; SSE4-NEXT: pminub %xmm3, %xmm1
; SSE4-NEXT: pminub %xmm6, %xmm2
-; SSE4-NEXT: pminub %xmm4, %xmm2
-; SSE4-NEXT: pminub %xmm3, %xmm2
-; SSE4-NEXT: pminub %xmm0, %xmm2
-; SSE4-NEXT: movdqa %xmm2, %xmm0
-; SSE4-NEXT: psrlw $8, %xmm0
+; SSE4-NEXT: pminub %xmm4, %xmm0
; SSE4-NEXT: pminub %xmm2, %xmm0
-; SSE4-NEXT: phminposuw %xmm0, %xmm0
+; SSE4-NEXT: pminub %xmm1, %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm1
+; SSE4-NEXT: psrlw $8, %xmm1
+; SSE4-NEXT: pminub %xmm0, %xmm1
+; SSE4-NEXT: phminposuw %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: # kill: def $al killed $al killed $eax
; SSE4-NEXT: retq
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpminub %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminub %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminub %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphminposuw %xmm0, %xmm0
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminub %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; SSE-LABEL: trunc_v64i8_v64i1:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: psllw $7, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: xorb %ah, %al
; SSE-NEXT: setnp %al
; SSE-NEXT: retq
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpxor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: pcmpeqb %xmm4, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pcmpeqb %xmm4, %xmm3
; SSE-NEXT: pcmpeqb %xmm4, %xmm1
; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: pmovmskb %xmm1, %eax
; SSE-NEXT: xorb %ah, %al
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: xorb %ah, %al
; AVX1-NEXT: setnp %al
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpxor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqb %xmm6, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pcmpeqb %xmm7, %xmm3
; SSE-NEXT: pcmpeqb %xmm5, %xmm1
; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: pmovmskb %xmm1, %eax
; SSE-NEXT: xorb %ah, %al
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: xorb %ah, %al
; AVX1-NEXT: setnp %al
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpxor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT: vpxor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm7, %xmm3
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; SSE-LABEL: test_v32i32:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm7, %xmm3
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm7, %xmm3
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; SSE-LABEL: test_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; SSE-LABEL: test_v128i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm7, %xmm3
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: test_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
;
; AVX1-LABEL: trunc_and_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
;
; AVX1-LABEL: trunc_and_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
-; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm6, %ymm8, %ymm6
+; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm3
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm8, %ymm3
-; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
;
; AVX2-LABEL: trunc_and_v16i64_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255]
-; AVX2-NEXT: vpand %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpand %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT: vpand %ymm5, %ymm8, %ymm3
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm4, %ymm8, %ymm3
-; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_and_v16i32_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm3, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: pand %xmm4, %xmm8
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm5, %xmm0
-; SSE-NEXT: packuswb %xmm6, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm6, %xmm2
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
;
; AVX2-LABEL: trunc_and_v16i32_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_and_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm3, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v16i16_v16i8:
; SSE2-LABEL: trunc_packus_v4i64_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
; SSE2-NEXT: movdqa %xmm7, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
; SSE2-NEXT: pandn %xmm2, %xmm5
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-LABEL: trunc_packus_v4i64_v4i8_store:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
; SSE2-NEXT: movdqa %xmm7, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: packuswb %xmm3, %xmm5
-; SSE2-NEXT: packuswb %xmm5, %xmm5
-; SSE2-NEXT: packuswb %xmm5, %xmm5
-; SSE2-NEXT: movd %xmm5, (%rdi)
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm1
+; SSE2-NEXT: movd %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i64_v4i8_store:
; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
; X86-NEXT: kmovw %k0, %edi
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: kmovw %k1, %eax
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
-; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
-; X86-NEXT: kmovw %k0, %edx
-; X86-NEXT: addl %eax, %edx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 # 2-byte Reload
+; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 # 2-byte Reload
+; X86-NEXT: kmovw %k2, %edi
; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: movw %dx, (%esi)
+; X86-NEXT: kmovw %k1, %ecx
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: movw %ax, (%esi)
; X86-NEXT: leal -8(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; X64-NEXT: kmovw %k0, %edi
; X64-NEXT: kmovw %k1, %r8d
-; X64-NEXT: addl %ecx, %edx
-; X64-NEXT: addl %r8d, %eax
-; X64-NEXT: addl %esi, %eax
-; X64-NEXT: addl %edx, %eax
; X64-NEXT: addl %edi, %eax
-; X64-NEXT: movw %ax, (%rbx)
+; X64-NEXT: addl %ecx, %edx
+; X64-NEXT: addl %eax, %edx
+; X64-NEXT: addl %r8d, %edx
+; X64-NEXT: addl %esi, %edx
+; X64-NEXT: movw %dx, (%rbx)
; X64-NEXT: leaq -8(%rbp), %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %rbp
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; WIN64-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; WIN64-NEXT: movsbl %cl, %ecx
-; WIN64-NEXT: movswl %dx, %eax
+; WIN64-NEXT: movsbl %cl, %eax
+; WIN64-NEXT: movswl %dx, %ecx
+; WIN64-NEXT: addl %eax, %ecx
; WIN64-NEXT: movzbl %r8b, %edx
-; WIN64-NEXT: addl %eax, %edx
; WIN64-NEXT: movzwl %r9w, %eax
; WIN64-NEXT: addl %edx, %eax
+; WIN64-NEXT: addl %ecx, %eax
; WIN64-NEXT: addl %r11d, %eax
; WIN64-NEXT: addl %r10d, %eax
-; WIN64-NEXT: addl %ecx, %eax
; WIN64-NEXT: retq
;
; WIN32-MSVC-LABEL: manyargs:
; WIN32-MSVC: # %bb.0: # %entry
-; WIN32-MSVC-NEXT: pushl %edi
; WIN32-MSVC-NEXT: pushl %esi
; WIN32-MSVC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; WIN32-MSVC-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; WIN32-MSVC-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; WIN32-MSVC-NEXT: movzbl {{[0-9]+}}(%esp), %esi
-; WIN32-MSVC-NEXT: movswl {{[0-9]+}}(%esp), %edi
-; WIN32-MSVC-NEXT: addl %esi, %edi
-; WIN32-MSVC-NEXT: addl %edx, %edi
-; WIN32-MSVC-NEXT: addl %ecx, %edi
-; WIN32-MSVC-NEXT: addl %eax, %edi
+; WIN32-MSVC-NEXT: addl %eax, %ecx
+; WIN32-MSVC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; WIN32-MSVC-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; WIN32-MSVC-NEXT: addl %eax, %edx
+; WIN32-MSVC-NEXT: movswl {{[0-9]+}}(%esp), %esi
; WIN32-MSVC-NEXT: movsbl {{[0-9]+}}(%esp), %eax
-; WIN32-MSVC-NEXT: addl %edi, %eax
+; WIN32-MSVC-NEXT: addl %esi, %eax
+; WIN32-MSVC-NEXT: addl %edx, %eax
+; WIN32-MSVC-NEXT: addl %ecx, %eax
; WIN32-MSVC-NEXT: popl %esi
-; WIN32-MSVC-NEXT: popl %edi
; WIN32-MSVC-NEXT: retl
;
; WIN32-GNU-LABEL: manyargs:
; WIN32-GNU: # %bb.0: # %entry
-; WIN32-GNU-NEXT: pushl %edi
-; WIN32-GNU-NEXT: .cfi_def_cfa_offset 8
; WIN32-GNU-NEXT: pushl %esi
-; WIN32-GNU-NEXT: .cfi_def_cfa_offset 12
-; WIN32-GNU-NEXT: .cfi_offset %esi, -12
-; WIN32-GNU-NEXT: .cfi_offset %edi, -8
+; WIN32-GNU-NEXT: .cfi_def_cfa_offset 8
+; WIN32-GNU-NEXT: .cfi_offset %esi, -8
; WIN32-GNU-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; WIN32-GNU-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; WIN32-GNU-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; WIN32-GNU-NEXT: movzbl {{[0-9]+}}(%esp), %esi
-; WIN32-GNU-NEXT: movswl {{[0-9]+}}(%esp), %edi
-; WIN32-GNU-NEXT: addl %esi, %edi
-; WIN32-GNU-NEXT: addl %edx, %edi
-; WIN32-GNU-NEXT: addl %ecx, %edi
-; WIN32-GNU-NEXT: addl %eax, %edi
+; WIN32-GNU-NEXT: addl %eax, %ecx
+; WIN32-GNU-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; WIN32-GNU-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; WIN32-GNU-NEXT: addl %eax, %edx
+; WIN32-GNU-NEXT: movswl {{[0-9]+}}(%esp), %esi
; WIN32-GNU-NEXT: movsbl {{[0-9]+}}(%esp), %eax
-; WIN32-GNU-NEXT: addl %edi, %eax
+; WIN32-GNU-NEXT: addl %esi, %eax
+; WIN32-GNU-NEXT: addl %edx, %eax
+; WIN32-GNU-NEXT: addl %ecx, %eax
; WIN32-GNU-NEXT: popl %esi
-; WIN32-GNU-NEXT: popl %edi
; WIN32-GNU-NEXT: retl
entry:
%aa = sext i8 %a to i32
define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
; DARWIN-LABEL: test_sse:
; DARWIN: ## %bb.0:
-; DARWIN-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; DARWIN-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; DARWIN-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; DARWIN-NEXT: vpaddd %xmm3, %xmm2, %xmm1
; DARWIN-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; DARWIN-NEXT: retl
;
; LINUX-LABEL: test_sse:
; LINUX: # %bb.0:
; LINUX-NEXT: subl $12, %esp
-; LINUX-NEXT: vpaddd {{[0-9]+}}(%esp), %xmm2, %xmm2
-; LINUX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; LINUX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; LINUX-NEXT: vpaddd {{[0-9]+}}(%esp), %xmm2, %xmm1
; LINUX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; LINUX-NEXT: addl $12, %esp
; LINUX-NEXT: retl
define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) nounwind {
; DARWIN-LABEL: test_avx:
; DARWIN: ## %bb.0:
-; DARWIN-NEXT: vpaddd %ymm3, %ymm2, %ymm2
-; DARWIN-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; DARWIN-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; DARWIN-NEXT: vpaddd %ymm3, %ymm2, %ymm1
; DARWIN-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; DARWIN-NEXT: retl
;
; LINUX-NEXT: movl %esp, %ebp
; LINUX-NEXT: andl $-32, %esp
; LINUX-NEXT: subl $32, %esp
-; LINUX-NEXT: vpaddd 8(%ebp), %ymm2, %ymm2
-; LINUX-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; LINUX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; LINUX-NEXT: vpaddd 8(%ebp), %ymm2, %ymm1
; LINUX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; LINUX-NEXT: movl %ebp, %esp
; LINUX-NEXT: popl %ebp
define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) nounwind {
; DARWIN-LABEL: test_avx512:
; DARWIN: ## %bb.0:
-; DARWIN-NEXT: vpaddd %zmm3, %zmm2, %zmm2
-; DARWIN-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; DARWIN-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; DARWIN-NEXT: vpaddd %zmm3, %zmm2, %zmm1
; DARWIN-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; DARWIN-NEXT: retl
;
; LINUX-NEXT: movl %esp, %ebp
; LINUX-NEXT: andl $-64, %esp
; LINUX-NEXT: subl $64, %esp
-; LINUX-NEXT: vpaddd 8(%ebp), %zmm2, %zmm2
-; LINUX-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; LINUX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; LINUX-NEXT: vpaddd 8(%ebp), %zmm2, %zmm1
; LINUX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; LINUX-NEXT: movl %ebp, %esp
; LINUX-NEXT: popl %ebp
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
-; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2OR512-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; AVX2OR512-NEXT: vpaddq %ymm4, %ymm2, %ymm2
; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2OR512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2OR512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3:
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%wide.vec = load <24 x i8>, ptr %ptr
define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqu (%rdi), %xmm8
+; AVX1-NEXT: vmovdqu (%rdi), %xmm9
; AVX1-NEXT: vmovups 16(%rdi), %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqu 48(%rdi), %xmm10
; AVX1-NEXT: vmovdqu 64(%rdi), %xmm3
; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4
-; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5
+; AVX1-NEXT: vmovdqu 96(%rdi), %xmm6
; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2
; AVX1-NEXT: vmovdqu 144(%rdi), %xmm12
; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
-; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm6
+; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm5
; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7
-; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm9
+; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm8
; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm11
; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm13, %xmm5, %xmm5
+; AVX1-NEXT: vpshufb %xmm13, %xmm6, %xmm6
; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm15
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpor %xmm5, %xmm15, %xmm0
+; AVX1-NEXT: vpor %xmm6, %xmm15, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm12
; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm13, %xmm8, %xmm8
+; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm9
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15
-; AVX1-NEXT: vpor %xmm8, %xmm15, %xmm5
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm8
+; AVX1-NEXT: vpor %xmm9, %xmm15, %xmm6
+; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm9
; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm10
-; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm10
-; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm8
-; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm5
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vmovdqu 32(%rdi), %xmm8
+; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm10
+; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm9
+; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm6
+; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm6
+; AVX1-NEXT: vmovdqu 32(%rdi), %xmm9
; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm3
-; AVX1-NEXT: vpshufb %xmm14, %xmm8, %xmm12
+; AVX1-NEXT: vpshufb %xmm14, %xmm9, %xmm12
; AVX1-NEXT: vpor %xmm3, %xmm12, %xmm3
; AVX1-NEXT: vmovdqu 176(%rdi), %xmm12
; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm1
; AVX1-NEXT: vpor %xmm13, %xmm14, %xmm14
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm13
-; AVX1-NEXT: vpor %xmm6, %xmm13, %xmm13
-; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm13
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm14
; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm14
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm8, %xmm7
-; AVX1-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm7
+; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm11, %xmm0
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm11[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm8, %xmm10, %xmm10
; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4
-; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5
-; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm8
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm10
+; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm9
+; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm9
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5
-; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm8
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm10
+; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm12
+; AVX1-NEXT: vpor %xmm12, %xmm10, %xmm10
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5
-; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm8
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm11
+; AVX1-NEXT: vpor %xmm11, %xmm8, %xmm8
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm3
+; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm3
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm9, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm3
+; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm3
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm10, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm6, %xmm13, %xmm3
+; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm3
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: movl %edi, %edx
; CHECK-NEXT: callq bar@PLT
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: addl %ecx, %edx
+; CHECK-NEXT: addl %eax, %edx
; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-NEXT: cvtsi2ss %edx, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: subl $12, %esp
+; WIN32-NEXT: subl $8, %esp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl %edx, %ebx
; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %ebx, %edi
-; WIN32-NEXT: imull %ecx, %edi
+; WIN32-NEXT: movl %edi, %esi
+; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %eax, %edi
-; WIN32-NEXT: addl %edx, %edi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %edx, %ebp
; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: imull %ebp, %ecx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: addl %edx, %ecx
-; WIN32-NEXT: addl %esi, %eax
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %edi, %ecx
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: mull %esi
-; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: addl %eax, %ebp
+; WIN32-NEXT: addl %esi, %ebp
+; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: imull %ebx, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: mull %ebx
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: addl %eax, %esi
+; WIN32-NEXT: addl %ecx, %eax
+; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT: adcl %ebp, %esi
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ebp
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %esi
+; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %ebx, %esi
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: addl %ebp, %ecx
; WIN32-NEXT: adcl $0, %edi
-; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: movl %ebx, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl %esi, %ebx
-; WIN32-NEXT: adcl %edi, %ebp
-; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %ecx, %ebp
+; WIN32-NEXT: adcl %edi, %ebx
+; WIN32-NEXT: setb %cl
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %ebp, %eax
-; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl %ebx, %eax
+; WIN32-NEXT: movzbl %cl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: movl %ebx, %ecx
+; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: adcl %esi, %edx
+; WIN32-NEXT: movl %ebp, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %ebx, 4(%eax)
+; WIN32-NEXT: movl %ebp, 4(%eax)
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
-; WIN32-NEXT: addl $12, %esp
+; WIN32-NEXT: addl $8, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: testl %esi, %esi
; WIN32-NEXT: setne %dl
; WIN32-NEXT: testl %eax, %eax
-; WIN32-NEXT: setne %bl
-; WIN32-NEXT: andb %dl, %bl
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: setne %cl
+; WIN32-NEXT: andb %dl, %cl
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: seto %bh
+; WIN32-NEXT: seto %bl
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %ecx, %edx
+; WIN32-NEXT: mull %ebp
; WIN32-NEXT: seto %ch
-; WIN32-NEXT: orb %bh, %ch
+; WIN32-NEXT: orb %bl, %ch
+; WIN32-NEXT: orb %cl, %ch
; WIN32-NEXT: leal (%edi,%eax), %esi
-; WIN32-NEXT: movl %edx, %eax
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: addl %esi, %edx
; WIN32-NEXT: setb %cl
; WIN32-NEXT: orb %ch, %cl
-; WIN32-NEXT: orb %bl, %cl
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: movl %eax, (%esi)
; WIN32-NEXT: movl %edx, 4(%esi)
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: pushl %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %ecx, %edx
-; WIN32-NEXT: movl %ecx, %edi
-; WIN32-NEXT: sarl $31, %edx
-; WIN32-NEXT: movl %esi, %ecx
-; WIN32-NEXT: imull %edx, %ecx
-; WIN32-NEXT: mull %edx
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: addl %edx, %ecx
-; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: sarl $31, %eax
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: imull %edi, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: sarl $31, %ecx
+; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: imull %ecx, %edi
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: movl %eax, %ecx
; WIN32-NEXT: addl %eax, %esi
-; WIN32-NEXT: addl %edx, %esi
-; WIN32-NEXT: addl %ebp, %eax
+; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: imull %ebp, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: addl %edi, %ebx
+; WIN32-NEXT: addl %eax, %ebx
+; WIN32-NEXT: addl %ecx, %eax
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %ecx, %esi
-; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: adcl %esi, %ebx
+; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl %edx, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %ebp, %ecx
-; WIN32-NEXT: adcl $0, %ebx
-; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: addl %esi, %edi
+; WIN32-NEXT: adcl $0, %ecx
+; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %ecx, %ebp
-; WIN32-NEXT: adcl %ebx, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: adcl %ecx, %ebp
; WIN32-NEXT: setb %cl
-; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %edi, %eax
+; WIN32-NEXT: addl %ebp, %eax
; WIN32-NEXT: movzbl %cl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: sarl $31, %ebp
-; WIN32-NEXT: xorl %ebp, %edx
-; WIN32-NEXT: xorl %eax, %ebp
+; WIN32-NEXT: adcl %ebx, %edx
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: xorl %esi, %edx
+; WIN32-NEXT: xorl %eax, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: orl %edx, %ebp
+; WIN32-NEXT: orl %edx, %esi
; WIN32-NEXT: jne LBB12_2
; WIN32-NEXT: # %bb.1:
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
; WIN32-NEXT: LBB12_2:
-; WIN32-NEXT: movl %ebx, %edx
+; WIN32-NEXT: movl %edi, %edx
; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: seto %bh
; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; WIN32-NEXT: orb %bl, %bh
; WIN32-NEXT: addl %eax, %edi
; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: addl %edi, %edx
; WIN32-NEXT: setb %al
; WIN32-NEXT: orb %bh, %al
-; WIN32-NEXT: orb %bl, %al
; WIN32-NEXT: testb %al, %al
; WIN32-NEXT: jne LBB14_2
; WIN32-NEXT: # %bb.1:
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: subl $8, %esp
+; WIN32-NEXT: pushl %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %edx, %edi
+; WIN32-NEXT: movl %edx, %ebp
; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %ebx, %esi
+; WIN32-NEXT: movl %edi, %esi
; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %eax, %esi
-; WIN32-NEXT: addl %edx, %esi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %edx, %ebx
; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: imull %edi, %ecx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT: mull %ebx
-; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: addl %edx, %ecx
-; WIN32-NEXT: addl %ebp, %eax
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %esi, %ecx
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: mull %esi
-; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %esi
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %edi, %ebp
-; WIN32-NEXT: adcl $0, %esi
-; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: addl %eax, %ebx
+; WIN32-NEXT: addl %esi, %ebx
+; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: imull %ebp, %edi
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: addl %eax, %esi
+; WIN32-NEXT: addl %ecx, %eax
+; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT: adcl %ebx, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: addl %ebp, %edi
-; WIN32-NEXT: adcl %esi, %ebx
-; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: addl %ebx, %ecx
+; WIN32-NEXT: adcl $0, %ebp
+; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: movl %edx, %edi
+; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: addl %ecx, %ebx
+; WIN32-NEXT: adcl %ebp, %edi
+; WIN32-NEXT: setb %cl
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %ebx, %eax
-; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl %edi, %eax
+; WIN32-NEXT: movzbl %cl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: sarl $31, %edi
-; WIN32-NEXT: xorl %edi, %edx
-; WIN32-NEXT: xorl %eax, %edi
-; WIN32-NEXT: orl %edx, %edi
+; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: adcl %esi, %edx
+; WIN32-NEXT: sarl $31, %ebx
+; WIN32-NEXT: xorl %ebx, %edx
+; WIN32-NEXT: xorl %eax, %ebx
+; WIN32-NEXT: orl %edx, %ebx
; WIN32-NEXT: jne LBB18_1
; WIN32-NEXT: # %bb.3: # %continue
; WIN32-NEXT: movb $1, %al
; WIN32-NEXT: LBB18_2: # %overflow
-; WIN32-NEXT: addl $8, %esp
+; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: testl %esi, %esi
; WIN32-NEXT: setne %dl
; WIN32-NEXT: testl %eax, %eax
-; WIN32-NEXT: setne %bl
-; WIN32-NEXT: andb %dl, %bl
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: setne %cl
+; WIN32-NEXT: andb %dl, %cl
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: seto %bh
+; WIN32-NEXT: seto %bl
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %ecx, %edx
-; WIN32-NEXT: seto %cl
-; WIN32-NEXT: orb %bh, %cl
-; WIN32-NEXT: leal (%edi,%eax), %esi
-; WIN32-NEXT: movl %edx, %eax
; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: seto %ch
+; WIN32-NEXT: orb %bl, %ch
+; WIN32-NEXT: orb %cl, %ch
+; WIN32-NEXT: leal (%edi,%eax), %esi
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: addl %esi, %edx
; WIN32-NEXT: setb %al
-; WIN32-NEXT: orb %cl, %al
-; WIN32-NEXT: orb %bl, %al
+; WIN32-NEXT: orb %ch, %al
; WIN32-NEXT: subb $1, %al
; WIN32-NEXT: je LBB22_1
; WIN32-NEXT: # %bb.3: # %continue
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: subl $20, %esp
+; WIN32-NEXT: subl $16, %esp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl (%eax), %edx
-; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill
-; WIN32-NEXT: movl 4(%eax), %ebx
+; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: movl 4(%eax), %esi
+; WIN32-NEXT: movl %esi, (%esp) # 4-byte Spill
; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: movl %ecx, %edi
; WIN32-NEXT: sarl $31, %eax
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: imull %ebx, %esi
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: imull %esi, %ecx
; WIN32-NEXT: mull %edx
; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %eax, %esi
-; WIN32-NEXT: addl %edx, %esi
-; WIN32-NEXT: movl %ebx, %edi
-; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: sarl $31, %edi
-; WIN32-NEXT: imull %edi, %ecx
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: addl %ecx, %ebx
+; WIN32-NEXT: movl %esi, %ecx
+; WIN32-NEXT: sarl $31, %ecx
+; WIN32-NEXT: movl %edi, %esi
+; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %edi
-; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: addl %edx, %ecx
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %edi
+; WIN32-NEXT: addl %eax, %edi
+; WIN32-NEXT: addl %esi, %edi
+; WIN32-NEXT: addl %ebp, %ebx
; WIN32-NEXT: addl %eax, %ebp
; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %esi, %ecx
-; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload
+; WIN32-NEXT: adcl %ebx, %edi
+; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: mull %edi
-; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; WIN32-NEXT: adcl $0, %ebp
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: movl (%esp), %eax # 4-byte Reload
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: addl %ebp, %ecx
+; WIN32-NEXT: adcl $0, %ebx
; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl %edi, %ebx
-; WIN32-NEXT: adcl %ebp, %esi
-; WIN32-NEXT: setb (%esp) # 1-byte Folded Spill
-; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %ecx, %ebp
+; WIN32-NEXT: adcl %ebx, %esi
+; WIN32-NEXT: setb %cl
+; WIN32-NEXT: movl (%esp), %eax # 4-byte Reload
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: addl %esi, %eax
-; WIN32-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload
-; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: movzbl %cl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: movl %ebx, %ecx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: adcl %edi, %edx
+; WIN32-NEXT: movl %ebp, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %ebx, 4(%eax)
+; WIN32-NEXT: movl %ebp, 4(%eax)
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
-; WIN32-NEXT: addl $20, %esp
+; WIN32-NEXT: addl $16, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx
; WIN32-NEXT: subl $16, %esp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl (%eax), %ebx
-; WIN32-NEXT: movl 4(%eax), %ebp
-; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: movl (%eax), %ebp
+; WIN32-NEXT: movl 4(%eax), %ebx
+; WIN32-NEXT: movl %ebx, (%esp) # 4-byte Spill
; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %ebp, %edi
-; WIN32-NEXT: imull %ecx, %edi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %eax, %edi
-; WIN32-NEXT: addl %edx, %edi
+; WIN32-NEXT: movl %ebx, %esi
+; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: sarl $31, %eax
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: movl %eax, %edi
; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: addl %edx, %ecx
-; WIN32-NEXT: addl %esi, %eax
+; WIN32-NEXT: addl %esi, %ecx
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: addl %ebx, %esi
+; WIN32-NEXT: addl %eax, %esi
+; WIN32-NEXT: addl %edi, %eax
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %edi, %ecx
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %ebx
-; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: adcl %ecx, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebx
+; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %ebp, %esi
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; WIN32-NEXT: adcl $0, %edi
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: mull (%esp) # 4-byte Folded Reload
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %ecx, %ebp
+; WIN32-NEXT: adcl %edi, %ebx
+; WIN32-NEXT: setb %cl
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl %esi, %ebx
-; WIN32-NEXT: adcl %edi, %ebp
-; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT: addl %ebp, %eax
-; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: mull (%esp) # 4-byte Folded Reload
+; WIN32-NEXT: addl %ebx, %eax
+; WIN32-NEXT: movzbl %cl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: movl %ebx, %ecx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: adcl %esi, %edx
+; WIN32-NEXT: movl %ebp, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %ebx, 4(%eax)
+; WIN32-NEXT: movl %ebp, 4(%eax)
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: pushl %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl (%eax), %ecx
+; WIN32-NEXT: movl (%eax), %esi
; WIN32-NEXT: movl 4(%eax), %eax
-; WIN32-NEXT: testl %esi, %esi
+; WIN32-NEXT: testl %ebx, %ebx
; WIN32-NEXT: setne %dl
; WIN32-NEXT: testl %eax, %eax
-; WIN32-NEXT: setne %bl
-; WIN32-NEXT: andb %dl, %bl
+; WIN32-NEXT: setne %cl
+; WIN32-NEXT: andb %dl, %cl
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT: seto %ch
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: mull %esi
+; WIN32-NEXT: seto %bl
+; WIN32-NEXT: orb %ch, %bl
+; WIN32-NEXT: orb %cl, %bl
+; WIN32-NEXT: leal (%edi,%eax), %ecx
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: seto %bh
-; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT: leal (%edi,%eax), %esi
-; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: addl %esi, %edx
+; WIN32-NEXT: addl %ecx, %edx
; WIN32-NEXT: setb %cl
-; WIN32-NEXT: orb %bh, %cl
; WIN32-NEXT: orb %bl, %cl
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: movl %eax, (%esi)
; WIN32-NEXT: movl %edx, 4(%esi)
; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: movl (%edx), %ecx
+; WIN32-NEXT: movl (%edx), %ebp
; WIN32-NEXT: movl 4(%edx), %esi
; WIN32-NEXT: testl %eax, %eax
; WIN32-NEXT: setne %dl
; WIN32-NEXT: testl %esi, %esi
-; WIN32-NEXT: setne %bl
-; WIN32-NEXT: andb %dl, %bl
-; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: setne %cl
+; WIN32-NEXT: andb %dl, %cl
+; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT: seto %bl
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: seto %bh
-; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: seto %ch
+; WIN32-NEXT: orb %bl, %ch
+; WIN32-NEXT: orb %cl, %ch
; WIN32-NEXT: leal (%edi,%eax), %esi
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: mull %ebp
; WIN32-NEXT: addl %esi, %edx
; WIN32-NEXT: setb %cl
-; WIN32-NEXT: orb %bh, %cl
-; WIN32-NEXT: orb %bl, %cl
+; WIN32-NEXT: orb %ch, %cl
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: movl %eax, (%esi)
; WIN32-NEXT: movl %edx, 4(%esi)
; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx