From ac9209751ad7f06c42b6ac80cf9c71b3c4bd238d Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 11 Oct 2022 16:30:40 -0700
Subject: [PATCH] Revert "[DAGCombiner] Fold (mul (sra X, BW-1), Y) -> (neg
 (and (sra X, BW-1), Y))"

This reverts commit 0148df8157f05ecf3b1064508e6f012aefb87dad.

Getting lit test failures on AMDGPU but I can't reproduce them so far.
Reverting to investigate.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp      |   29 -
 .../AArch64/umulo-128-legalisation-lowering.ll     |   24 +-
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll              |  161 +-
 llvm/test/CodeGen/PowerPC/pr45448.ll               |    3 +-
 llvm/test/CodeGen/RISCV/mul.ll                     |   10 +-
 llvm/test/CodeGen/RISCV/xaluo.ll                   |  272 ++-
 llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll        |  190 +-
 llvm/test/CodeGen/X86/extmul128.ll                 |   35 +-
 llvm/test/CodeGen/X86/muloti.ll                    |   46 +-
 llvm/test/CodeGen/X86/smul_fix_sat.ll              |   60 +-
 .../CodeGen/X86/smulo-128-legalisation-lowering.ll | 1875 ++++++++++----------
 llvm/test/CodeGen/X86/vec_smulo.ll                 |  674 ++++---
 llvm/test/CodeGen/X86/xmulo.ll                     |  281 ++-
 13 files changed, 1735 insertions(+), 1925 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 131364e..195238e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3939,30 +3939,6 @@ SDValue DAGCombiner::visitMULFIX(SDNode *N) {
   return SDValue();
 }
 
-// Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))
-static SDValue foldSraMulToAndNeg(SDNode *N, SDValue N0, SDValue N1,
-                                  SelectionDAG &DAG) {
-  if (N0.getOpcode() != ISD::SRA)
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // TODO: Use computeNumSignBits() == BitWidth?
-  unsigned BitWidth = VT.getScalarSizeInBits();
-  ConstantSDNode *ShiftAmt = isConstOrConstSplat(N0.getOperand(1));
-  if (!ShiftAmt || ShiftAmt->getAPIntValue() != (BitWidth - 1))
-    return SDValue();
-
-  // If optimizing for minsize, we don't want to increase the number of
-  // instructions.
- if (DAG.getMachineFunction().getFunction().hasMinSize()) - return SDValue(); - - SDLoc dl(N); - SDValue And = DAG.getNode(ISD::AND, dl, VT, N0, N1); - return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), And); -} - SDValue DAGCombiner::visitMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4173,11 +4149,6 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { } } - if (SDValue V = foldSraMulToAndNeg(N, N0, N1, DAG)) - return V; - if (SDValue V = foldSraMulToAndNeg(N, N1, N0, DAG)) - return V; - // reassociate mul if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags())) return RMUL; diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index c01ec69..e955014 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -39,24 +39,21 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align ; AARCH: // %bb.0: // %Entry ; AARCH-NEXT: asr x9, x1, #63 ; AARCH-NEXT: asr x10, x3, #63 -; AARCH-NEXT: and x11, x9, x2 -; AARCH-NEXT: and x14, x10, x1 -; AARCH-NEXT: umulh x12, x2, x9 -; AARCH-NEXT: and x9, x9, x3 -; AARCH-NEXT: umulh x13, x10, x0 -; AARCH-NEXT: and x10, x10, x0 -; AARCH-NEXT: sub x12, x12, x11 -; AARCH-NEXT: neg x11, x11 -; AARCH-NEXT: sub x13, x13, x14 -; AARCH-NEXT: sub x9, x12, x9 -; AARCH-NEXT: sub x12, x13, x10 -; AARCH-NEXT: neg x10, x10 ; AARCH-NEXT: umulh x14, x0, x2 +; AARCH-NEXT: mov x8, x1 +; AARCH-NEXT: mul x11, x2, x9 +; AARCH-NEXT: str wzr, [x4] +; AARCH-NEXT: umulh x12, x10, x0 +; AARCH-NEXT: umulh x13, x2, x9 +; AARCH-NEXT: madd x12, x10, x1, x12 +; AARCH-NEXT: add x13, x13, x11 +; AARCH-NEXT: mul x10, x10, x0 +; AARCH-NEXT: madd x9, x3, x9, x13 +; AARCH-NEXT: add x12, x12, x10 ; AARCH-NEXT: adds x10, x10, x11 ; AARCH-NEXT: mul x11, x1, x2 ; AARCH-NEXT: adc x9, x12, x9 ; AARCH-NEXT: umulh x13, x1, x2 -; AARCH-NEXT: mov x8, x1 ; AARCH-NEXT: mul x12, x0, x3 ; AARCH-NEXT: adds x11, x11, x14 ; AARCH-NEXT: umulh x14, x0, x3 @@ -76,7 +73,6 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align ; AARCH-NEXT: eor x9, x9, x11 ; AARCH-NEXT: eor x10, x10, x11 ; AARCH-NEXT: orr x9, x10, x9 -; AARCH-NEXT: str wzr, [x4] ; AARCH-NEXT: cmp x9, #0 ; AARCH-NEXT: cset w9, ne ; AARCH-NEXT: tbz x8, #63, .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index bac0255..f806149 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -159,28 +159,24 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0 -; CI-NEXT: v_ashrrev_i32_e32 v11, 31, v0 +; CI-NEXT: v_ashrrev_i32_e32 v13, 31, v0 ; CI-NEXT: v_mov_b32_e32 v8, 0 -; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v1, v[7:8] -; CI-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; CI-NEXT: v_and_b32_e32 v14, v11, v1 -; CI-NEXT: v_mov_b32_e32 v1, v10 +; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8] +; CI-NEXT: v_ashrrev_i32_e32 v14, 31, v1 +; CI-NEXT: v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0 +; CI-NEXT: v_mov_b32_e32 v7, v10 ; CI-NEXT: v_mov_b32_e32 v10, v8 -; CI-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v12, v[9:10] -; CI-NEXT: v_and_b32_e32 v13, v11, v12 -; CI-NEXT: v_sub_i32_e32 v9, vcc, 0, v14 -; CI-NEXT: v_subb_u32_e32 v10, vcc, 0, v13, vcc -; 
CI-NEXT: v_mad_i64_i32 v[9:10], s[4:5], v12, v0, v[9:10] -; CI-NEXT: v_mov_b32_e32 v0, v8 -; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc -; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v12, v[0:1] -; CI-NEXT: v_add_i32_e32 v8, vcc, v0, v9 -; CI-NEXT: v_addc_u32_e32 v9, vcc, v1, v10, vcc -; CI-NEXT: v_mov_b32_e32 v1, v7 +; CI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10] +; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12] +; CI-NEXT: v_add_i32_e32 v9, vcc, v7, v9 +; CI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc +; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10] +; CI-NEXT: v_add_i32_e32 v7, vcc, v9, v0 +; CI-NEXT: v_addc_u32_e32 v9, vcc, v10, v1, vcc +; CI-NEXT: v_mov_b32_e32 v1, v8 ; CI-NEXT: v_add_i32_e32 v0, vcc, v6, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CI-NEXT: v_addc_u32_e32 v2, vcc, v8, v4, vcc +; CI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc ; CI-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -188,64 +184,60 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v0 +; SI-NEXT: v_mul_lo_u32 v11, v6, v1 +; SI-NEXT: v_mul_hi_u32 v12, v0, v1 ; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; SI-NEXT: v_and_b32_e32 v9, v7, v0 -; SI-NEXT: v_and_b32_e32 v10, v6, v1 -; SI-NEXT: v_mul_lo_u32 v13, v6, v1 -; SI-NEXT: v_mul_hi_u32 v14, v0, v1 -; SI-NEXT: v_and_b32_e32 v8, v7, v6 -; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; SI-NEXT: v_mul_hi_u32 v10, v6, v7 -; SI-NEXT: v_mul_i32_i24_e32 v11, v6, v7 -; SI-NEXT: v_mul_hi_u32 v6, v6, v1 -; SI-NEXT: v_mul_hi_u32 v12, v0, v7 -; SI-NEXT: v_mul_lo_u32 v7, v0, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v8, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; SI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; SI-NEXT: v_mul_hi_u32 v14, v6, v1 +; SI-NEXT: v_mul_lo_u32 v13, v0, v7 +; SI-NEXT: v_mul_hi_u32 v10, v0, v7 +; SI-NEXT: v_add_i32_e32 v12, vcc, v11, v12 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_mul_hi_u32 v8, v6, v7 +; SI-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_mul_i32_i24_e32 v9, v6, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; SI-NEXT: v_mul_hi_i32 v6, v1, v6 +; SI-NEXT: v_mul_hi_i32 v7, v7, v0 +; SI-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v11 ; SI-NEXT: v_mul_lo_u32 v0, v0, v1 -; SI-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 -; SI-NEXT: v_subb_u32_e32 v8, vcc, v10, v8, vcc +; SI-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, v9, v10 +; SI-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; SI-NEXT: v_addc_u32_e32 v1, vcc, v7, v3, vcc -; SI-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc -; SI-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc +; SI-NEXT: v_addc_u32_e32 v1, vcc, v12, v3, vcc +; SI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc +; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: mad_i64_i32_sextops_i32_i128: ; GFX9: ; %bb.0: 
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v1, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v1 -; GFX9-NEXT: v_and_b32_e32 v6, v14, v1 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, v9 -; GFX9-NEXT: v_and_b32_e32 v7, v14, v15 -; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v6 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v1, v[10:11] -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_mov_b32_e32 v10, v13 -; GFX9-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-NEXT: v_mad_i64_i32 v[6:7], s[4:5], v15, v0, v[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v15, v[12:13] -; GFX9-NEXT: v_mov_b32_e32 v12, v1 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v15, v[10:11] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v2 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9] +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11] +; GFX9-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9] +; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0 +; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13] +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: mad_i64_i32_sextops_i32_i128: @@ -254,30 +246,27 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_ashrrev_i32_e32 v16, 31, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v17, 31, v1 +; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v16, v1, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8] ; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8 -; GFX11-NEXT: v_and_b32_e32 v8, v16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v17, v[9:10] -; GFX11-NEXT: v_and_b32_e32 v9, v16, v17 -; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, 0, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v1, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10] +; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0 +; GFX11-NEXT: v_mov_b32_e32 v8, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_i64_i32 v[14:15], null, v17, v0, v[8:9] -; GFX11-NEXT: v_add_co_u32 v12, s0, v7, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v11 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, 0, s0 +; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10] +; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v17, v[12:13] -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v15, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8] +; GFX11-NEXT: v_mov_b32_e32 v7, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll index c3337c7..0f8014d 100644 --- a/llvm/test/CodeGen/PowerPC/pr45448.ll +++ b/llvm/test/CodeGen/PowerPC/pr45448.ll @@ -25,8 +25,7 @@ define hidden void @julia_tryparse_internal_45896() #0 { ; CHECK-NEXT: rldic r5, r5, 4, 32 ; CHECK-NEXT: crnot 4*cr5+lt, eq ; CHECK-NEXT: mulhdu r3, r3, r5 -; CHECK-NEXT: and r6, r4, r5 -; CHECK-NEXT: sub r6, r3, r6 +; CHECK-NEXT: maddld r6, r4, r5, r3 ; CHECK-NEXT: cmpld cr1, r6, r3 ; CHECK-NEXT: mulhdu. 
r3, r4, r5 ; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_10 diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index 986e7994..3923c43 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1480,18 +1480,18 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind { ; RV32IM-NEXT: add a5, a6, a2 ; RV32IM-NEXT: mul a7, a1, a3 ; RV32IM-NEXT: add t0, a7, a5 -; RV32IM-NEXT: and t1, a4, a0 -; RV32IM-NEXT: sub a2, t0, t1 +; RV32IM-NEXT: mul t1, a4, a0 +; RV32IM-NEXT: add a2, t0, t1 ; RV32IM-NEXT: sltu t2, a2, t0 ; RV32IM-NEXT: sltu a7, t0, a7 ; RV32IM-NEXT: sltu a5, a5, a6 ; RV32IM-NEXT: mulhu a3, a1, a3 ; RV32IM-NEXT: add a3, a3, a5 ; RV32IM-NEXT: add a3, a3, a7 -; RV32IM-NEXT: and a1, a4, a1 +; RV32IM-NEXT: mul a1, a4, a1 ; RV32IM-NEXT: mulhu a0, a4, a0 -; RV32IM-NEXT: sub a0, a0, a1 -; RV32IM-NEXT: sub a0, a0, t1 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: add a0, a0, t1 ; RV32IM-NEXT: add a0, a3, a0 ; RV32IM-NEXT: add a1, a0, t2 ; RV32IM-NEXT: mv a0, a2 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index f3391b2..f6963fd 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -961,10 +961,8 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: .cfi_offset s1, -8 -; RV32-NEXT: .cfi_offset s2, -12 ; RV32-NEXT: mulhu a5, a0, a2 ; RV32-NEXT: mul a6, a1, a2 ; RV32-NEXT: add a5, a6, a5 @@ -980,34 +978,33 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV32-NEXT: mul t0, a1, a3 ; RV32-NEXT: add t1, t0, a7 ; RV32-NEXT: srai t2, a1, 31 -; RV32-NEXT: and t3, t2, a2 +; RV32-NEXT: mul t3, a2, t2 ; RV32-NEXT: srai t4, a3, 31 -; RV32-NEXT: and t5, t4, a0 -; RV32-NEXT: neg t6, t5 -; RV32-NEXT: sub s0, t6, t3 -; RV32-NEXT: add s1, t1, s0 -; RV32-NEXT: sltu s2, s1, t1 +; RV32-NEXT: mul t5, t4, a0 +; RV32-NEXT: add t6, t5, t3 +; RV32-NEXT: add s0, t1, t6 +; RV32-NEXT: sltu s1, s0, t1 ; RV32-NEXT: sltu t0, t1, t0 ; RV32-NEXT: sltu a6, a7, a6 ; RV32-NEXT: mulhu a7, a1, a3 ; RV32-NEXT: add a6, a7, a6 ; RV32-NEXT: add a6, a6, t0 ; RV32-NEXT: mulhu a7, a2, t2 -; RV32-NEXT: sub a7, a7, t3 -; RV32-NEXT: and a3, t2, a3 -; RV32-NEXT: sub a3, a7, a3 -; RV32-NEXT: and a1, t4, a1 +; RV32-NEXT: add a7, a7, t3 +; RV32-NEXT: mul a3, a3, t2 +; RV32-NEXT: add a3, a7, a3 +; RV32-NEXT: mul a1, t4, a1 ; RV32-NEXT: mulhu a7, t4, a0 -; RV32-NEXT: sub a1, a7, a1 -; RV32-NEXT: sub a1, a1, t5 +; RV32-NEXT: add a1, a7, a1 +; RV32-NEXT: add a1, a1, t5 ; RV32-NEXT: add a1, a1, a3 -; RV32-NEXT: sltu a3, s0, t6 +; RV32-NEXT: sltu a3, t6, t5 ; RV32-NEXT: add a1, a1, a3 ; RV32-NEXT: add a1, a6, a1 -; RV32-NEXT: add a1, a1, s2 +; RV32-NEXT: add a1, a1, s1 ; RV32-NEXT: srai a3, a5, 31 ; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: xor a3, s1, a3 +; RV32-NEXT: xor a3, s0, a3 ; RV32-NEXT: or a1, a3, a1 ; RV32-NEXT: snez a1, a1 ; RV32-NEXT: mul a0, a0, a2 @@ -1016,7 +1013,6 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -1036,10 +1032,8 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 
12(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill -; RV32ZBA-NEXT: sw s2, 4(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 ; RV32ZBA-NEXT: .cfi_offset s1, -8 -; RV32ZBA-NEXT: .cfi_offset s2, -12 ; RV32ZBA-NEXT: mulhu a5, a0, a2 ; RV32ZBA-NEXT: mul a6, a1, a2 ; RV32ZBA-NEXT: add a5, a6, a5 @@ -1055,34 +1049,33 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV32ZBA-NEXT: mul t0, a1, a3 ; RV32ZBA-NEXT: add t1, t0, a7 ; RV32ZBA-NEXT: srai t2, a1, 31 -; RV32ZBA-NEXT: and t3, t2, a2 +; RV32ZBA-NEXT: mul t3, a2, t2 ; RV32ZBA-NEXT: srai t4, a3, 31 -; RV32ZBA-NEXT: and t5, t4, a0 -; RV32ZBA-NEXT: neg t6, t5 -; RV32ZBA-NEXT: sub s0, t6, t3 -; RV32ZBA-NEXT: add s1, t1, s0 -; RV32ZBA-NEXT: sltu s2, s1, t1 +; RV32ZBA-NEXT: mul t5, t4, a0 +; RV32ZBA-NEXT: add t6, t5, t3 +; RV32ZBA-NEXT: add s0, t1, t6 +; RV32ZBA-NEXT: sltu s1, s0, t1 ; RV32ZBA-NEXT: sltu t0, t1, t0 ; RV32ZBA-NEXT: sltu a6, a7, a6 ; RV32ZBA-NEXT: mulhu a7, a1, a3 ; RV32ZBA-NEXT: add a6, a7, a6 ; RV32ZBA-NEXT: add a6, a6, t0 ; RV32ZBA-NEXT: mulhu a7, a2, t2 -; RV32ZBA-NEXT: sub a7, a7, t3 -; RV32ZBA-NEXT: and a3, t2, a3 -; RV32ZBA-NEXT: sub a3, a7, a3 -; RV32ZBA-NEXT: and a1, t4, a1 +; RV32ZBA-NEXT: add a7, a7, t3 +; RV32ZBA-NEXT: mul a3, a3, t2 +; RV32ZBA-NEXT: add a3, a7, a3 +; RV32ZBA-NEXT: mul a1, t4, a1 ; RV32ZBA-NEXT: mulhu a7, t4, a0 -; RV32ZBA-NEXT: sub a1, a7, a1 -; RV32ZBA-NEXT: sub a1, a1, t5 +; RV32ZBA-NEXT: add a1, a7, a1 +; RV32ZBA-NEXT: add a1, a1, t5 ; RV32ZBA-NEXT: add a1, a1, a3 -; RV32ZBA-NEXT: sltu a3, s0, t6 +; RV32ZBA-NEXT: sltu a3, t6, t5 ; RV32ZBA-NEXT: add a1, a1, a3 ; RV32ZBA-NEXT: add a1, a6, a1 -; RV32ZBA-NEXT: add a1, a1, s2 +; RV32ZBA-NEXT: add a1, a1, s1 ; RV32ZBA-NEXT: srai a3, a5, 31 ; RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: xor a3, s1, a3 +; RV32ZBA-NEXT: xor a3, s0, a3 ; RV32ZBA-NEXT: or a1, a3, a1 ; RV32ZBA-NEXT: snez a1, a1 ; RV32ZBA-NEXT: mul a0, a0, a2 @@ -1091,7 +1084,6 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV32ZBA-NEXT: mv a0, a1 ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload -; RV32ZBA-NEXT: lw s2, 4(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -1123,8 +1115,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) { ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: srai a1, a1, 31 -; RV32-NEXT: andi a6, a1, 13 -; RV32-NEXT: sub a6, a5, a6 +; RV32-NEXT: mul a6, a1, a3 +; RV32-NEXT: add a6, a5, a6 ; RV32-NEXT: srai a7, a4, 31 ; RV32-NEXT: xor t0, a6, a7 ; RV32-NEXT: sltu a5, a6, a5 @@ -1160,8 +1152,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) { ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: srai a1, a1, 31 -; RV32ZBA-NEXT: andi a6, a1, 13 -; RV32ZBA-NEXT: sub a6, a5, a6 +; RV32ZBA-NEXT: mul a6, a1, a3 +; RV32ZBA-NEXT: add a6, a5, a6 ; RV32ZBA-NEXT: srai a7, a4, 31 ; RV32ZBA-NEXT: xor t0, a6, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 @@ -2360,9 +2352,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 -; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: add a4, a5, a4 @@ -2378,34 +2368,33 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: mul a7, a1, a3 ; RV32-NEXT: add t0, a7, a6 ; RV32-NEXT: srai 
t1, a1, 31 -; RV32-NEXT: and t2, t1, a2 +; RV32-NEXT: mul t2, a2, t1 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: and t4, t3, a0 -; RV32-NEXT: neg t5, t4 -; RV32-NEXT: sub t6, t5, t2 -; RV32-NEXT: add s0, t0, t6 -; RV32-NEXT: sltu s1, s0, t0 +; RV32-NEXT: mul t4, t3, a0 +; RV32-NEXT: add t5, t4, t2 +; RV32-NEXT: add t6, t0, t5 +; RV32-NEXT: sltu s0, t6, t0 ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a6, a2, t1 -; RV32-NEXT: sub a6, a6, t2 -; RV32-NEXT: and a7, t1, a3 -; RV32-NEXT: sub a6, a6, a7 -; RV32-NEXT: and a7, t3, a1 +; RV32-NEXT: add a6, a6, t2 +; RV32-NEXT: mul a7, a3, t1 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: mul a7, t3, a1 ; RV32-NEXT: mulhu t0, t3, a0 -; RV32-NEXT: sub a7, t0, a7 -; RV32-NEXT: sub a7, a7, t4 +; RV32-NEXT: add a7, t0, a7 +; RV32-NEXT: add a7, a7, t4 ; RV32-NEXT: add a6, a7, a6 -; RV32-NEXT: sltu a7, t6, t5 +; RV32-NEXT: sltu a7, t5, t4 ; RV32-NEXT: add a6, a6, a7 ; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: add a5, a5, s1 +; RV32-NEXT: add a5, a5, s0 ; RV32-NEXT: srai a4, a4, 31 ; RV32-NEXT: xor a5, a5, a4 -; RV32-NEXT: xor a4, s0, a4 +; RV32-NEXT: xor a4, t6, a4 ; RV32-NEXT: or a4, a4, a5 ; RV32-NEXT: bnez a4, .LBB46_2 ; RV32-NEXT: # %bb.1: # %entry @@ -2413,7 +2402,6 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: mv a1, a3 ; RV32-NEXT: .LBB46_2: # %entry ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -2433,9 +2421,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: addi sp, sp, -16 ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 -; RV32ZBA-NEXT: .cfi_offset s1, -8 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: add a4, a5, a4 @@ -2451,34 +2437,33 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: mul a7, a1, a3 ; RV32ZBA-NEXT: add t0, a7, a6 ; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: and t2, t1, a2 +; RV32ZBA-NEXT: mul t2, a2, t1 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: and t4, t3, a0 -; RV32ZBA-NEXT: neg t5, t4 -; RV32ZBA-NEXT: sub t6, t5, t2 -; RV32ZBA-NEXT: add s0, t0, t6 -; RV32ZBA-NEXT: sltu s1, s0, t0 +; RV32ZBA-NEXT: mul t4, t3, a0 +; RV32ZBA-NEXT: add t5, t4, t2 +; RV32ZBA-NEXT: add t6, t0, t5 +; RV32ZBA-NEXT: sltu s0, t6, t0 ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a6, a2, t1 -; RV32ZBA-NEXT: sub a6, a6, t2 -; RV32ZBA-NEXT: and a7, t1, a3 -; RV32ZBA-NEXT: sub a6, a6, a7 -; RV32ZBA-NEXT: and a7, t3, a1 +; RV32ZBA-NEXT: add a6, a6, t2 +; RV32ZBA-NEXT: mul a7, a3, t1 +; RV32ZBA-NEXT: add a6, a6, a7 +; RV32ZBA-NEXT: mul a7, t3, a1 ; RV32ZBA-NEXT: mulhu t0, t3, a0 -; RV32ZBA-NEXT: sub a7, t0, a7 -; RV32ZBA-NEXT: sub a7, a7, t4 +; RV32ZBA-NEXT: add a7, t0, a7 +; RV32ZBA-NEXT: add a7, a7, t4 ; RV32ZBA-NEXT: add a6, a7, a6 -; RV32ZBA-NEXT: sltu a7, t6, t5 +; RV32ZBA-NEXT: sltu a7, t5, t4 ; RV32ZBA-NEXT: add a6, a6, a7 ; RV32ZBA-NEXT: add a5, a5, a6 -; RV32ZBA-NEXT: add a5, a5, s1 +; RV32ZBA-NEXT: add a5, a5, s0 ; RV32ZBA-NEXT: srai a4, a4, 31 ; RV32ZBA-NEXT: xor a5, a5, a4 -; RV32ZBA-NEXT: xor a4, s0, a4 +; RV32ZBA-NEXT: xor a4, t6, a4 ; RV32ZBA-NEXT: or a4, a4, a5 ; 
RV32ZBA-NEXT: bnez a4, .LBB46_2 ; RV32ZBA-NEXT: # %bb.1: # %entry @@ -2486,7 +2471,6 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: mv a1, a3 ; RV32ZBA-NEXT: .LBB46_2: # %entry ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -2513,9 +2497,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 -; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: add a4, a5, a4 @@ -2531,38 +2513,36 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: mul a7, a1, a3 ; RV32-NEXT: add t0, a7, a6 ; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: and t2, t1, a2 +; RV32-NEXT: mul t2, a2, t1 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: and t4, t3, a0 -; RV32-NEXT: neg t5, t4 -; RV32-NEXT: sub t6, t5, t2 -; RV32-NEXT: add s0, t0, t6 -; RV32-NEXT: sltu s1, s0, t0 +; RV32-NEXT: mul t4, t3, a0 +; RV32-NEXT: add t5, t4, t2 +; RV32-NEXT: add t6, t0, t5 +; RV32-NEXT: sltu s0, t6, t0 ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a2, a2, t1 -; RV32-NEXT: sub a2, a2, t2 -; RV32-NEXT: and a3, t1, a3 -; RV32-NEXT: sub a2, a2, a3 -; RV32-NEXT: and a1, t3, a1 +; RV32-NEXT: add a2, a2, t2 +; RV32-NEXT: mul a3, a3, t1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: mul a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 -; RV32-NEXT: sub a0, a0, a1 -; RV32-NEXT: sub a0, a0, t4 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a0, a0, t4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: sltu a1, t6, t5 +; RV32-NEXT: sltu a1, t5, t4 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a0, a0, s1 +; RV32-NEXT: add a0, a0, s0 ; RV32-NEXT: srai a1, a4, 31 ; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: xor a1, s0, a1 +; RV32-NEXT: xor a1, t6, a1 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -2580,9 +2560,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: addi sp, sp, -16 ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 -; RV32ZBA-NEXT: .cfi_offset s1, -8 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: add a4, a5, a4 @@ -2598,38 +2576,36 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: mul a7, a1, a3 ; RV32ZBA-NEXT: add t0, a7, a6 ; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: and t2, t1, a2 +; RV32ZBA-NEXT: mul t2, a2, t1 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: and t4, t3, a0 -; RV32ZBA-NEXT: neg t5, t4 -; RV32ZBA-NEXT: sub t6, t5, t2 -; RV32ZBA-NEXT: add s0, t0, t6 -; RV32ZBA-NEXT: sltu s1, s0, t0 +; RV32ZBA-NEXT: mul t4, t3, a0 +; RV32ZBA-NEXT: add t5, t4, t2 +; RV32ZBA-NEXT: add t6, t0, t5 +; RV32ZBA-NEXT: sltu s0, t6, t0 ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a2, a2, t1 -; RV32ZBA-NEXT: sub a2, a2, t2 -; RV32ZBA-NEXT: and a3, t1, a3 -; RV32ZBA-NEXT: sub a2, a2, a3 -; 
RV32ZBA-NEXT: and a1, t3, a1 +; RV32ZBA-NEXT: add a2, a2, t2 +; RV32ZBA-NEXT: mul a3, a3, t1 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: mul a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 -; RV32ZBA-NEXT: sub a0, a0, a1 -; RV32ZBA-NEXT: sub a0, a0, t4 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: add a0, a0, t4 ; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: sltu a1, t6, t5 +; RV32ZBA-NEXT: sltu a1, t5, t4 ; RV32ZBA-NEXT: add a0, a0, a1 ; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: add a0, a0, s1 +; RV32ZBA-NEXT: add a0, a0, s0 ; RV32ZBA-NEXT: srai a1, a4, 31 ; RV32ZBA-NEXT: xor a0, a0, a1 -; RV32ZBA-NEXT: xor a1, s0, a1 +; RV32ZBA-NEXT: xor a1, t6, a1 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: seqz a0, a0 ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -3477,9 +3453,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 -; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: add a4, a5, a4 @@ -3495,34 +3469,33 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: mul a7, a1, a3 ; RV32-NEXT: add t0, a7, a6 ; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: and t2, t1, a2 +; RV32-NEXT: mul t2, a2, t1 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: and t4, t3, a0 -; RV32-NEXT: neg t5, t4 -; RV32-NEXT: sub t6, t5, t2 -; RV32-NEXT: add s0, t0, t6 -; RV32-NEXT: sltu s1, s0, t0 +; RV32-NEXT: mul t4, t3, a0 +; RV32-NEXT: add t5, t4, t2 +; RV32-NEXT: add t6, t0, t5 +; RV32-NEXT: sltu s0, t6, t0 ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a2, a2, t1 -; RV32-NEXT: sub a2, a2, t2 -; RV32-NEXT: and a3, t1, a3 -; RV32-NEXT: sub a2, a2, a3 -; RV32-NEXT: and a1, t3, a1 +; RV32-NEXT: add a2, a2, t2 +; RV32-NEXT: mul a3, a3, t1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: mul a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 -; RV32-NEXT: sub a0, a0, a1 -; RV32-NEXT: sub a0, a0, t4 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a0, a0, t4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: sltu a1, t6, t5 +; RV32-NEXT: sltu a1, t5, t4 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a0, a0, s1 +; RV32-NEXT: add a0, a0, s0 ; RV32-NEXT: srai a1, a4, 31 ; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: xor a1, s0, a1 +; RV32-NEXT: xor a1, t6, a1 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: beqz a0, .LBB61_2 ; RV32-NEXT: # %bb.1: # %overflow @@ -3532,7 +3505,6 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB61_3: # %overflow ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -3554,9 +3526,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: addi sp, sp, -16 ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 -; RV32ZBA-NEXT: .cfi_offset s1, -8 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: add a4, a5, a4 @@ -3572,34 +3542,33 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: mul a7, a1, a3 ; 
RV32ZBA-NEXT: add t0, a7, a6 ; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: and t2, t1, a2 +; RV32ZBA-NEXT: mul t2, a2, t1 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: and t4, t3, a0 -; RV32ZBA-NEXT: neg t5, t4 -; RV32ZBA-NEXT: sub t6, t5, t2 -; RV32ZBA-NEXT: add s0, t0, t6 -; RV32ZBA-NEXT: sltu s1, s0, t0 +; RV32ZBA-NEXT: mul t4, t3, a0 +; RV32ZBA-NEXT: add t5, t4, t2 +; RV32ZBA-NEXT: add t6, t0, t5 +; RV32ZBA-NEXT: sltu s0, t6, t0 ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a2, a2, t1 -; RV32ZBA-NEXT: sub a2, a2, t2 -; RV32ZBA-NEXT: and a3, t1, a3 -; RV32ZBA-NEXT: sub a2, a2, a3 -; RV32ZBA-NEXT: and a1, t3, a1 +; RV32ZBA-NEXT: add a2, a2, t2 +; RV32ZBA-NEXT: mul a3, a3, t1 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: mul a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 -; RV32ZBA-NEXT: sub a0, a0, a1 -; RV32ZBA-NEXT: sub a0, a0, t4 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: add a0, a0, t4 ; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: sltu a1, t6, t5 +; RV32ZBA-NEXT: sltu a1, t5, t4 ; RV32ZBA-NEXT: add a0, a0, a1 ; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: add a0, a0, s1 +; RV32ZBA-NEXT: add a0, a0, s0 ; RV32ZBA-NEXT: srai a1, a4, 31 ; RV32ZBA-NEXT: xor a0, a0, a1 -; RV32ZBA-NEXT: xor a1, s0, a1 +; RV32ZBA-NEXT: xor a1, t6, a1 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: beqz a0, .LBB61_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow @@ -3609,7 +3578,6 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: li a0, 1 ; RV32ZBA-NEXT: .LBB61_3: # %overflow ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -3657,8 +3625,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV32-NEXT: add a6, a4, a6 ; RV32-NEXT: sub t1, a6, a1 ; RV32-NEXT: srai t2, a1, 31 -; RV32-NEXT: andi t3, t2, -13 -; RV32-NEXT: sub t3, a5, t3 +; RV32-NEXT: mul t3, t2, a2 +; RV32-NEXT: sub t3, t3, a0 ; RV32-NEXT: add t4, t1, t3 ; RV32-NEXT: sltu t5, t4, t1 ; RV32-NEXT: neg t6, a1 @@ -3719,8 +3687,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV32ZBA-NEXT: add a6, a4, a6 ; RV32ZBA-NEXT: sub t1, a6, a1 ; RV32ZBA-NEXT: srai t2, a1, 31 -; RV32ZBA-NEXT: andi t3, t2, -13 -; RV32ZBA-NEXT: sub t3, a5, t3 +; RV32ZBA-NEXT: mul t3, t2, a2 +; RV32ZBA-NEXT: sub t3, t3, a0 ; RV32ZBA-NEXT: add t4, t1, t3 ; RV32ZBA-NEXT: sltu t5, t4, t1 ; RV32ZBA-NEXT: neg t6, a1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll index 9cb0ec4..217caee 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll @@ -38,23 +38,22 @@ entry: define arm_aapcs_vfpcc <2 x i64> @sext32_0246_ext0(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0246_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r2, r4, r3, r0 +; CHECK-NEXT: umull r2, r5, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: and.w r2, r1, r0, asr #31 -; CHECK-NEXT: sub.w r2, r12, r2 -; CHECK-NEXT: and.w r1, r0, r1, asr #31 -; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: and.w r2, r3, r0, asr #31 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: and.w r0, r0, r3, asr #31 -; CHECK-NEXT: subs r0, r2, r0 
+; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: mla r4, r1, r2, r12 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: mla r2, r3, r2, r5 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: mla r1, r1, r0, r4 +; CHECK-NEXT: mla r0, r3, r0, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -68,23 +67,22 @@ entry: define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_0246(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_0246: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: asrs r4, r0, #31 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull lr, r12, r0, r1 -; CHECK-NEXT: umull r2, r4, r0, r3 +; CHECK-NEXT: umull r2, r5, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: and.w r2, r0, r1, asr #31 -; CHECK-NEXT: sub.w r2, r12, r2 -; CHECK-NEXT: and.w r1, r1, r0, asr #31 -; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: and.w r2, r0, r3, asr #31 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: and.w r0, r3, r0, asr #31 -; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: mla r2, r0, r2, r12 +; CHECK-NEXT: mla r1, r4, r1, r2 +; CHECK-NEXT: asrs r2, r3, #31 +; CHECK-NEXT: mla r0, r0, r2, r5 +; CHECK-NEXT: mla r0, r4, r3, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -132,24 +130,23 @@ entry: define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_1357_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vrev64.32 q1, q0 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r2, r4, r3, r0 +; CHECK-NEXT: umull r2, r5, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: and.w r2, r1, r0, asr #31 -; CHECK-NEXT: sub.w r2, r12, r2 -; CHECK-NEXT: and.w r1, r0, r1, asr #31 -; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: and.w r2, r3, r0, asr #31 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: and.w r0, r0, r3, asr #31 -; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: mla r4, r1, r2, r12 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: mla r2, r3, r2, r5 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: mla r1, r1, r0, r4 +; CHECK-NEXT: mla r0, r3, r0, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -163,24 +160,23 @@ entry: define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_1357: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vrev64.32 q1, q0 +; CHECK-NEXT: asrs r4, r0, #31 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r0, r1 -; CHECK-NEXT: umull r2, r4, r0, r3 +; CHECK-NEXT: umull r2, r5, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; 
CHECK-NEXT: and.w r2, r0, r1, asr #31 -; CHECK-NEXT: sub.w r2, r12, r2 -; CHECK-NEXT: and.w r1, r1, r0, asr #31 -; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: and.w r2, r0, r3, asr #31 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: and.w r0, r3, r0, asr #31 -; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: mla r2, r0, r2, r12 +; CHECK-NEXT: mla r1, r4, r1, r2 +; CHECK-NEXT: asrs r2, r3, #31 +; CHECK-NEXT: mla r0, r0, r2, r5 +; CHECK-NEXT: mla r0, r4, r3, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -234,39 +230,36 @@ entry: define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0213_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: umull r2, r5, r3, r0 ; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r2, r4, r3, r0 ; CHECK-NEXT: vmov q1[2], q1[0], r2, lr -; CHECK-NEXT: and.w r2, r1, r0, asr #31 -; CHECK-NEXT: sub.w r2, r12, r2 -; CHECK-NEXT: and.w r1, r0, r1, asr #31 -; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: and.w r2, r3, r0, asr #31 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: and.w r3, r0, r3, asr #31 -; CHECK-NEXT: subs r2, r2, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: mla r4, r1, r2, r12 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: mla r5, r3, r2, r5 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: mla r1, r1, r0, r4 +; CHECK-NEXT: mla r3, r3, r0, r5 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: and.w r2, r1, r0, asr #31 -; CHECK-NEXT: umull r3, r4, r1, r0 -; CHECK-NEXT: and.w r1, r0, r1, asr #31 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sub.w r12, r2, r1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: umull r4, r1, r2, r0 +; CHECK-NEXT: umull r3, r5, r1, r0 +; CHECK-NEXT: mla r5, r1, r2, r5 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: mla r12, r1, r0, r5 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: umull r4, r1, r5, r0 +; CHECK-NEXT: mla r1, r5, r2, r1 +; CHECK-NEXT: asrs r2, r5, #31 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: and.w r3, r2, r0, asr #31 -; CHECK-NEXT: and.w r0, r0, r2, asr #31 -; CHECK-NEXT: subs r1, r1, r3 -; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: mla r0, r2, r0, r1 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> %out1 = sext <4 x i32> %shuf1 to <4 x i64> @@ -280,39 +273,36 @@ entry: define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: asrs r4, r0, #31 +; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: umull r2, r5, r0, r3 ; CHECK-NEXT: umull lr, r12, r0, r1 -; CHECK-NEXT: umull r2, r4, r0, r3 ; CHECK-NEXT: vmov 
q1[2], q1[0], r2, lr -; CHECK-NEXT: and.w r2, r0, r1, asr #31 -; CHECK-NEXT: sub.w r2, r12, r2 -; CHECK-NEXT: and.w r1, r1, r0, asr #31 -; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: and.w r2, r0, r3, asr #31 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: and.w r3, r3, r0, asr #31 -; CHECK-NEXT: subs r2, r2, r3 +; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: mla r2, r0, r2, r12 +; CHECK-NEXT: mla r1, r4, r1, r2 +; CHECK-NEXT: asrs r2, r3, #31 +; CHECK-NEXT: mla r2, r0, r2, r5 +; CHECK-NEXT: mla r2, r4, r3, r2 ; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: umull r3, r4, r0, r1 -; CHECK-NEXT: and.w r2, r0, r1, asr #31 -; CHECK-NEXT: and.w r1, r1, r0, asr #31 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sub.w r12, r2, r1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: umull r4, r1, r0, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: and.w r3, r0, r2, asr #31 -; CHECK-NEXT: and.w r0, r2, r0, asr #31 -; CHECK-NEXT: subs r1, r1, r3 -; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: umull r2, r3, r0, r1 +; CHECK-NEXT: asrs r5, r1, #31 +; CHECK-NEXT: mla r3, r0, r5, r3 +; CHECK-NEXT: mla r12, r4, r1, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: umull r5, r1, r0, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 +; CHECK-NEXT: asrs r2, r3, #31 +; CHECK-NEXT: mla r0, r0, r2, r1 +; CHECK-NEXT: mla r0, r4, r3, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> %out1 = sext <4 x i32> %shuf1 to <4 x i64> diff --git a/llvm/test/CodeGen/X86/extmul128.ll b/llvm/test/CodeGen/X86/extmul128.ll index a2d8211..a7f2959 100644 --- a/llvm/test/CodeGen/X86/extmul128.ll +++ b/llvm/test/CodeGen/X86/extmul128.ll @@ -29,37 +29,6 @@ define i128 @i64_zext_sext_i128(i64 %a, i64 %b) { ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %rsi ; CHECK-NEXT: sarq $63, %rsi -; CHECK-NEXT: andq %rdi, %rsi -; CHECK-NEXT: subq %rsi, %rdx -; CHECK-NEXT: retq - %aa = zext i64 %a to i128 - %bb = sext i64 %b to i128 - %cc = mul i128 %aa, %bb - ret i128 %cc -} - -define i128 @i64_sext_zext_i128(i64 %a, i64 %b) { -; CHECK-LABEL: i64_sext_zext_i128: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: sarq $63, %rcx -; CHECK-NEXT: mulq %rsi -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: subq %rcx, %rdx -; CHECK-NEXT: retq - %aa = sext i64 %a to i128 - %bb = zext i64 %b to i128 - %cc = mul i128 %aa, %bb - ret i128 %cc -} - -define i128 @i64_zext_sext_i128_minsize(i64 %a, i64 %b) minsize { -; CHECK-LABEL: i64_zext_sext_i128_minsize: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: mulq %rsi -; CHECK-NEXT: sarq $63, %rsi ; CHECK-NEXT: imulq %rdi, %rsi ; CHECK-NEXT: addq %rsi, %rdx ; CHECK-NEXT: retq @@ -69,8 +38,8 @@ define i128 @i64_zext_sext_i128_minsize(i64 %a, i64 %b) minsize { ret i128 %cc } -define i128 @i64_sext_zext_i128_minsize(i64 %a, i64 %b) minsize { -; CHECK-LABEL: i64_sext_zext_i128_minsize: +define i128 @i64_sext_zext_i128(i64 %a, i64 %b) { +; CHECK-LABEL: i64_sext_zext_i128: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq %rdi, %rcx diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll index 3733306..9a6cf0b0 100644 --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -7,39 +7,34 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp { ; CHECK-LABEL: x: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: pushq 
%r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movq %rdx, %r11 ; CHECK-NEXT: movq %rdi, %r9 -; CHECK-NEXT: movq %rsi, %rdi -; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: movq %rdi, %r10 -; CHECK-NEXT: andq %rdx, %r10 +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: sarq $63, %rbx +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: imulq %rbx, %rdi ; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: mulq %rdi +; CHECK-NEXT: mulq %rbx ; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: subq %r10, %rbx -; CHECK-NEXT: andq %rcx, %rdi -; CHECK-NEXT: subq %rdi, %rbx -; CHECK-NEXT: movq %rcx, %r14 -; CHECK-NEXT: sarq $63, %r14 -; CHECK-NEXT: movq %r14, %r15 -; CHECK-NEXT: andq %rsi, %r15 -; CHECK-NEXT: movq %r14, %rax +; CHECK-NEXT: addq %rdi, %rdx +; CHECK-NEXT: imulq %rcx, %rbx +; CHECK-NEXT: addq %rdx, %rbx +; CHECK-NEXT: movq %rcx, %rdi +; CHECK-NEXT: sarq $63, %rdi +; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: imulq %rsi, %r14 +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %r9 ; CHECK-NEXT: movq %rax, %r10 -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: subq %r15, %rdi -; CHECK-NEXT: andq %r9, %r14 -; CHECK-NEXT: subq %r14, %rdi +; CHECK-NEXT: addq %r14, %rdx +; CHECK-NEXT: imulq %r9, %rdi +; CHECK-NEXT: addq %rdx, %rdi ; CHECK-NEXT: addq %r8, %r10 ; CHECK-NEXT: adcq %rbx, %rdi ; CHECK-NEXT: movq %r9, %rax @@ -77,7 +72,6 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: movq %r9, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: retq ; CHECK-NEXT: LBB0_1: ## %overflow ; CHECK-NEXT: ud2 diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll index 07debb1..996601ed 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -369,8 +369,8 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $12, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 28 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 @@ -378,54 +378,52 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi -; X86-NEXT: subl %ebx, %esi -; X86-NEXT: andl %ebp, %edi -; X86-NEXT: subl %edi, %esi +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: imull %ebx, %edi +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: addl %edi, %edx ; X86-NEXT: movl %ebp, %edi +; X86-NEXT: imull %ebp, %ebx +; X86-NEXT: addl %edx, %ebx ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %edi, %ebp -; X86-NEXT: andl %ecx, %ebp +; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: 
movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: subl %ebp, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl %edx, %edi -; X86-NEXT: subl %edi, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: imull %esi, %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: setb %bl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: movzbl %bl, %esi ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: adcl %edi, %edx ; X86-NEXT: movl %ebp, %edi ; X86-NEXT: sarl $31, %edi ; X86-NEXT: xorl %edi, %edx @@ -436,11 +434,11 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF ; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %ecx -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovel (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovel %ebp, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %esi, %edx -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index 6631c6c..367ca66 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -9,44 +9,39 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 ; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %r12 -; X64-NEXT: .cfi_def_cfa_offset 32 ; X64-NEXT: pushq %rbx -; X64-NEXT: .cfi_def_cfa_offset 40 -; X64-NEXT: .cfi_offset %rbx, -40 -; X64-NEXT: .cfi_offset %r12, -32 +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: .cfi_offset %rbx, -32 ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rdi, %r10 -; X64-NEXT: movq %rsi, %r9 -; X64-NEXT: sarq $63, %r9 -; X64-NEXT: movq %r9, %r11 -; X64-NEXT: andq %rdx, %r11 +; X64-NEXT: movq %rsi, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: imulq %r14, %rdi ; X64-NEXT: movq %rdx, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: subq %r11, %r14 -; X64-NEXT: andq %rcx, %r9 -; X64-NEXT: subq %r9, %r14 -; X64-NEXT: movq 
%rcx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %r15, %r12 -; X64-NEXT: andq %rsi, %r12 -; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: imulq %rcx, %r14 +; X64-NEXT: addq %rdx, %r14 +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: sarq $63, %rdi +; X64-NEXT: movq %rdi, %r15 +; X64-NEXT: imulq %rsi, %r15 +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: subq %r12, %r9 -; X64-NEXT: andq %r10, %r15 -; X64-NEXT: subq %r15, %r9 -; X64-NEXT: addq %rdi, %r11 -; X64-NEXT: adcq %r14, %r9 +; X64-NEXT: addq %r15, %rdx +; X64-NEXT: imulq %r10, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: addq %r9, %r11 +; X64-NEXT: adcq %r14, %rdi ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rbx @@ -66,16 +61,15 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X64-NEXT: addq %r14, %rax ; X64-NEXT: adcq %rbx, %rdx ; X64-NEXT: addq %r11, %rax -; X64-NEXT: adcq %r9, %rdx +; X64-NEXT: adcq %rdi, %rdx ; X64-NEXT: movq %r10, 8(%r8) ; X64-NEXT: sarq $63, %r10 ; X64-NEXT: xorq %r10, %rdx ; X64-NEXT: xorq %rax, %r10 ; X64-NEXT: orq %rdx, %r10 ; X64-NEXT: setne %al -; X64-NEXT: movq %rdi, (%r8) +; X64-NEXT: movq %r9, (%r8) ; X64-NEXT: popq %rbx -; X64-NEXT: popq %r12 ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 ; X64-NEXT: retq @@ -90,8 +84,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $60, %esp -; X86-NEXT: .cfi_def_cfa_offset 80 +; X86-NEXT: subl $56, %esp +; X86-NEXT: .cfi_def_cfa_offset 76 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 @@ -105,229 +99,226 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebp ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl 
%edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %esi, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 
4-byte Folded Reload +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl %edi, %esi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %edx +; X86-NEXT: imull %esi, %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: subl %esi, %edi -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: imull %esi, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl 
%edx, %ebx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: subl %esi, %ebp +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: subl %eax, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl %esi, (%esp) ## 4-byte Folded Spill +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: setb %cl -; X86-NEXT: addl %eax, %edi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: setb %bl +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl (%esp), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: mull %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: subl %ecx, %ebx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %eax, %ebx -; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull %ebx, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl %eax, %esi -; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: subl %esi, %ecx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl %edi, %eax -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull %ebx, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: addl %ecx, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded 
Spill -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: setb %bl +; X86-NEXT: addl %eax, %esi +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: xorl %ecx, %ebp -; X86-NEXT: xorl %esi, %ecx -; X86-NEXT: orl %ebp, %ecx -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %edi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload @@ -335,7 +326,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: setne %al -; X86-NEXT: addl $60, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -369,239 +360,234 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: .cfi_offset %r14, -32 ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rsi, %r10 -; X64-NEXT: movq %rdi, %r11 +; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rsi, %r15 ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq 
%rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rsi, %r10 +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rbx -; X64-NEXT: adcq %rsi, %r12 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r10, %r14 +; X64-NEXT: adcq %rcx, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %edi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %r9, %rcx -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r12, %rsi -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movzbl %al, %ecx ; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r8, %rdi +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r12, %rbx +; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r8, %rcx ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %r8, %r13 ; X64-NEXT: adcq $0, %r12 -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r9, %rsi +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r12, %rdi -; X64-NEXT: setb %r9b -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: adcq %r12, %r10 +; X64-NEXT: setb %cl +; X64-NEXT: movq %r15, %r9 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rdi, %r8 -; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: addq %r10, %r8 +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %rbp -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload -; X64-NEXT: adcq %rbx, %rbp -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r15, %r12 +; X64-NEXT: adcq %r14, %rbp +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: adcq $0, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r10, %rcx -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %rsi +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rdi, %r10 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r10, %r9 ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: 
mulq %r12 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r10, %r15 +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, %rdi ; X64-NEXT: adcq %r13, %r11 -; X64-NEXT: setb %r10b -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: setb %cl +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %r11, %r13 -; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %r8, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rbp, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %r10 +; X64-NEXT: addq %r8, %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %rbp, %rdi +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %rsi, %r13 -; X64-NEXT: adcq %r12, %rdi -; X64-NEXT: setb %r11b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload +; X64-NEXT: setb %cl +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r8, %rax -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: adcq %rsi, %r10 -; X64-NEXT: setb %cl -; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: adcq %rdi, %r9 +; X64-NEXT: setb %r8b ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r10, %rbx -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r9, %r14 +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rbp -; X64-NEXT: addq %r13, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rdi, %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movzbl %r11b, %eax -; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq %r13, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r10, %rsi +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %r14 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq %rbx, %r13 +; X64-NEXT: movq %rbx, %r10 ; X64-NEXT: sarq $63, %r13 ; X64-NEXT: movq %r13, %rcx -; X64-NEXT: andq %r9, %rcx +; X64-NEXT: imulq %r12, %rcx ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: subq %rcx, %r10 -; X64-NEXT: andq %r13, %r14 -; X64-NEXT: subq %r14, %r10 -; X64-NEXT: movq %r13, %rsi 
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload -; X64-NEXT: andq %r14, %rsi +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %r13, %r15 +; X64-NEXT: addq %rdx, %r15 +; X64-NEXT: movq %r13, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %rdi, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: subq %rsi, %rcx -; X64-NEXT: andq %r13, %rdi -; X64-NEXT: subq %rdi, %rcx -; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: imulq %r13, %rsi +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: addq %rax, %r8 -; X64-NEXT: adcq %r10, %rcx -; X64-NEXT: movq %r14, %rax +; X64-NEXT: adcq %r15, %rsi +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r11, %r15 +; X64-NEXT: addq %r9, %r15 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: addq %rsi, %r15 -; X64-NEXT: adcq %r11, %r13 -; X64-NEXT: setb %sil +; X64-NEXT: addq %rcx, %r15 +; X64-NEXT: adcq %r9, %r13 +; X64-NEXT: setb %cl ; X64-NEXT: addq %rax, %r13 -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: adcq %rdx, %rsi +; X64-NEXT: movzbl %cl, %r9d +; X64-NEXT: adcq %rdx, %r9 ; X64-NEXT: addq %r8, %r13 -; X64-NEXT: adcq %rcx, %rsi -; X64-NEXT: sarq $63, %r9 -; X64-NEXT: movq %r9, %r8 +; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: sarq $63, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: andq %rax, %r8 -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: imulq %r12, %r8 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: subq %r8, %r14 -; X64-NEXT: movq %r9, %rax +; X64-NEXT: addq %rdx, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload -; X64-NEXT: andq %rdi, %rax -; X64-NEXT: subq %rax, %r14 -; X64-NEXT: movq %r9, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: andq %rax, %r12 -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: subq %r12, %r8 +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: imulq %r12, %rbx +; X64-NEXT: addq %r8, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: andq %r9, %rax -; X64-NEXT: subq %rax, %r8 -; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: adcq %r14, %r8 -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: addq %r11, %r14 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: imulq %r12, %rcx +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %r12, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: adcq %rbx, %r10 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: addq %r11, %rbx ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: addq %rax, %r14 +; X64-NEXT: addq %rax, %rbx ; X64-NEXT: adcq %rdx, %r11 -; X64-NEXT: setb %r9b +; X64-NEXT: setb %cl ; X64-NEXT: addq %rax, %r11 -; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rdx, %rax -; X64-NEXT: addq %r10, %r11 -; X64-NEXT: adcq %r8, %rax -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload -; X64-NEXT: adcq %r15, %r14 +; X64-NEXT: addq %r8, %r11 +; X64-NEXT: adcq 
%r10, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload +; X64-NEXT: adcq %r15, %rbx ; X64-NEXT: adcq %r13, %r11 -; X64-NEXT: adcq %rsi, %rax -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload -; X64-NEXT: adcq %rbx, %r11 +; X64-NEXT: adcq %r9, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload +; X64-NEXT: adcq %r14, %r11 ; X64-NEXT: adcq %rbp, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload -; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: sarq $63, %rdx -; X64-NEXT: xorq %rdx, %rax -; X64-NEXT: xorq %rdx, %r14 -; X64-NEXT: orq %rax, %r14 -; X64-NEXT: xorq %rdx, %r11 -; X64-NEXT: xorq %rcx, %rdx -; X64-NEXT: orq %r11, %rdx -; X64-NEXT: orq %r14, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: xorq %rcx, %rax +; X64-NEXT: xorq %rcx, %rbx +; X64-NEXT: orq %rax, %rbx +; X64-NEXT: xorq %rcx, %r11 +; X64-NEXT: xorq %rsi, %rcx +; X64-NEXT: orq %r11, %rcx +; X64-NEXT: orq %rbx, %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq %rsi, 24(%rax) +; X64-NEXT: movq %rdx, 24(%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload ; X64-NEXT: movq %rcx, (%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload @@ -627,399 +613,400 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $152, %esp -; X86-NEXT: .cfi_def_cfa_offset 172 +; X86-NEXT: subl $156, %esp +; X86-NEXT: .cfi_def_cfa_offset 176 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb %cl +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movzbl 
%bl, %eax +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: adcl %ebp, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: 
mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ebx ## 4-byte Folded Reload ; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull 
%ebp +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; 
X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %edi, %eax -; X86-NEXT: movzbl %bl, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl (%esp), %edx ## 4-byte Reload -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload +; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; 
X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: mull %ebp +; 
X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill 
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: addl %edi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: adcl %ebp, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: adcl $0, %edx @@ -1032,9 +1019,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: adcl $0, %esi @@ -1047,13 +1034,41 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx @@ -1062,117 +1077,89 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl 
%ebx, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: adcl %ebx, %esi ; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload @@ -1188,25 +1175,25 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, %ebp -; 
X86-NEXT: addl %ecx, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: addl %eax, %ebx ; X86-NEXT: movzbl %cl, %eax @@ -1214,75 +1201,76 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: setb %cl -; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: setb %al +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %al, %edx +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: setb %al -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded 
Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl %edx, %ecx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %eax, %esi -; X86-NEXT: movl %edi, %ebx +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull %edi, %esi +; X86-NEXT: addl %ecx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl %eax, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull %edi, %ecx ; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: subl %ebx, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %ebp, %esi @@ -1292,266 +1280,263 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: addl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movzbl %bl, %ebx -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl %edi, %edx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: subl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl %edi, %edx -; X86-NEXT: subl %edx, %ecx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: addl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl %edi, %edx +; X86-NEXT: imull %edi, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl (%esp), %edx ## 4-byte Reload ; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: subl %edx, %esi -; X86-NEXT: andl {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: addl %edx, %esi +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edx ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: setb %cl -; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl (%esp), %esi ## 4-byte Reload -; X86-NEXT: addl %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte 
Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edx, %edi +; X86-NEXT: adcl %edx, %esi ; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %edi -; X86-NEXT: movzbl %bl, %ebp -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %eax, %esi +; X86-NEXT: movzbl %bl, %ebx +; X86-NEXT: adcl %edx, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %edi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %edi, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl %edi, %edx -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: setb %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: adcl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi -; 
X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: imull %ebp, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl %edx, %eax -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: imull %ebp, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl %edx, %ecx +; X86-NEXT: imull %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %dl ; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movzbl %dl, %ecx -; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: movzbl %dl, %eax +; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: andl %ebp, %esi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: subl %esi, %ebx -; X86-NEXT: andl %ebp, %ecx -; X86-NEXT: subl %ecx, %ebx -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), 
%esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: subl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: subl %eax, %ebp -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: setb %cl -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: imull %ebp, %eax +; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: setb %bl +; X86-NEXT: addl %eax, %esi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %edx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded 
Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %edi, %esi ; X86-NEXT: orl %edx, %esi +; X86-NEXT: xorl %edi, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: orl %ebx, %ecx -; X86-NEXT: orl %edx, %ecx +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %edi, %eax +; X86-NEXT: orl %edx, 
%eax +; X86-NEXT: xorl %edi, %ebp +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: orl %ebp, %edi +; X86-NEXT: orl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 28(%eax) +; X86-NEXT: movl %ebx, 28(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload @@ -1567,7 +1552,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, 24(%eax) ; X86-NEXT: setne %al -; X86-NEXT: addl $152, %esp +; X86-NEXT: addl $156, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 641663d..dbec8675 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -3297,33 +3297,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: movq %r8, %r14 -; SSE2-NEXT: movq %rcx, %rbp ; SSE2-NEXT: movq %rdx, %r8 ; SSE2-NEXT: movq %rsi, %r11 ; SSE2-NEXT: movq %rdi, %r10 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movq %r11, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movq %rbx, %r15 -; SSE2-NEXT: andq %r14, %r15 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSE2-NEXT: movq %r11, %r12 +; SSE2-NEXT: sarq $63, %r12 +; SSE2-NEXT: movq %r14, %rbx +; SSE2-NEXT: imulq %r12, %rbx ; SSE2-NEXT: movq %r14, %rax -; SSE2-NEXT: mulq %rbx +; SSE2-NEXT: mulq %r12 ; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: movq %rdx, %r12 -; SSE2-NEXT: subq %r15, %r12 -; SSE2-NEXT: andq %r9, %rbx -; SSE2-NEXT: subq %rbx, %r12 -; SSE2-NEXT: movq %r9, %r13 -; SSE2-NEXT: sarq $63, %r13 -; SSE2-NEXT: movq %r13, %rcx -; SSE2-NEXT: andq %r11, %rcx -; SSE2-NEXT: movq %r13, %rax +; SSE2-NEXT: addq %rbx, %rdx +; SSE2-NEXT: imulq %r9, %r12 +; SSE2-NEXT: addq %rdx, %r12 +; SSE2-NEXT: movq %r9, %rbx +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: movq %rbx, %r13 +; SSE2-NEXT: imulq %r11, %r13 +; SSE2-NEXT: movq %rbx, %rax ; SSE2-NEXT: mulq %r10 ; SSE2-NEXT: movq %rax, %r15 -; SSE2-NEXT: movq %rdx, %rbx -; SSE2-NEXT: subq %rcx, %rbx -; SSE2-NEXT: andq %r10, %r13 -; SSE2-NEXT: subq %r13, %rbx +; SSE2-NEXT: addq %r13, %rdx +; SSE2-NEXT: imulq %r10, %rbx +; SSE2-NEXT: addq %rdx, %rbx ; SSE2-NEXT: addq %rdi, %r15 ; SSE2-NEXT: adcq %r12, %rbx ; SSE2-NEXT: movq %r10, %rax @@ -3343,11 +3341,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: addq %r13, %r10 ; SSE2-NEXT: adcq %r14, %r12 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %ecx +; SSE2-NEXT: movzbl %al, %r14d ; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r9 ; SSE2-NEXT: addq %r12, %rax -; SSE2-NEXT: adcq %rcx, %rdx +; SSE2-NEXT: adcq %r14, %rdx ; SSE2-NEXT: addq %r15, %rax ; SSE2-NEXT: adcq %rbx, %rdx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 @@ -3358,56 +3356,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: xorl %r15d, %r15d ; SSE2-NEXT: orq %rdx, %r10 ; SSE2-NEXT: setne %r15b -; SSE2-NEXT: movq %rbp, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movq %rcx, %r11 -; SSE2-NEXT: andq %rsi, %r11 +; SSE2-NEXT: movq %rcx, %rbx +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: movq %rsi, %r10 +; SSE2-NEXT: imulq %rbx, %r10 ; SSE2-NEXT: movq %rsi, %rax -; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: mulq %rbx 
; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: movq %rdx, %r10 -; SSE2-NEXT: subq %r11, %r10 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: andq %rax, %rcx -; SSE2-NEXT: subq %rcx, %r10 -; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: movq %rax, %r13 -; SSE2-NEXT: sarq $63, %r11 -; SSE2-NEXT: movq %r11, %rcx -; SSE2-NEXT: andq %rbp, %rcx -; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: addq %r10, %rdx +; SSE2-NEXT: imulq %rbp, %rbx +; SSE2-NEXT: addq %rdx, %rbx +; SSE2-NEXT: movq %rbp, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: movq %r10, %r14 +; SSE2-NEXT: imulq %rcx, %r14 +; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r8 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: movq %rdx, %r14 -; SSE2-NEXT: subq %rcx, %r14 -; SSE2-NEXT: andq %r8, %r11 -; SSE2-NEXT: subq %r11, %r14 -; SSE2-NEXT: addq %r9, %rbx -; SSE2-NEXT: adcq %r10, %r14 +; SSE2-NEXT: movq %rax, %r11 +; SSE2-NEXT: addq %r14, %rdx +; SSE2-NEXT: imulq %r8, %r10 +; SSE2-NEXT: addq %rdx, %r10 +; SSE2-NEXT: addq %r9, %r11 +; SSE2-NEXT: adcq %rbx, %r10 ; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: movq %rdx, %r9 -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: movq %rbp, %rax +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: addq %r9, %r11 +; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: addq %r9, %r14 ; SSE2-NEXT: adcq $0, %rsi ; SSE2-NEXT: movq %r8, %rax -; SSE2-NEXT: mulq %r13 +; SSE2-NEXT: mulq %rbp ; SSE2-NEXT: movq %rdx, %r8 ; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: addq %r11, %r9 +; SSE2-NEXT: addq %r14, %r9 ; SSE2-NEXT: adcq %rsi, %r8 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %ecx -; SSE2-NEXT: movq %rbp, %rax -; SSE2-NEXT: mulq %r13 +; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: mulq %rbp ; SSE2-NEXT: addq %r8, %rax -; SSE2-NEXT: adcq %rcx, %rdx -; SSE2-NEXT: addq %rbx, %rax -; SSE2-NEXT: adcq %r14, %rdx +; SSE2-NEXT: adcq %rsi, %rdx +; SSE2-NEXT: addq %r11, %rax +; SSE2-NEXT: adcq %r10, %rdx ; SSE2-NEXT: movq %r9, 24(%r12) ; SSE2-NEXT: sarq $63, %r9 ; SSE2-NEXT: xorq %r9, %rdx @@ -3420,7 +3414,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: negl %r15d ; SSE2-NEXT: movd %r15d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %r10, 16(%r12) +; SSE2-NEXT: movq %rbx, 16(%r12) ; SSE2-NEXT: movq %rdi, (%r12) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 @@ -3439,33 +3433,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx ; SSSE3-NEXT: movq %r8, %r14 -; SSSE3-NEXT: movq %rcx, %rbp ; SSSE3-NEXT: movq %rdx, %r8 ; SSSE3-NEXT: movq %rsi, %r11 ; SSSE3-NEXT: movq %rdi, %r10 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSSE3-NEXT: movq %r11, %rbx -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movq %rbx, %r15 -; SSSE3-NEXT: andq %r14, %r15 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSSE3-NEXT: movq %r11, %r12 +; SSSE3-NEXT: sarq $63, %r12 +; SSSE3-NEXT: movq %r14, %rbx +; SSSE3-NEXT: imulq %r12, %rbx ; SSSE3-NEXT: movq %r14, %rax -; SSSE3-NEXT: mulq %rbx +; SSSE3-NEXT: mulq %r12 ; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: movq %rdx, %r12 -; SSSE3-NEXT: subq %r15, %r12 -; SSSE3-NEXT: andq %r9, %rbx -; SSSE3-NEXT: subq %rbx, %r12 -; SSSE3-NEXT: movq %r9, %r13 -; SSSE3-NEXT: sarq $63, %r13 -; SSSE3-NEXT: movq %r13, %rcx -; SSSE3-NEXT: andq %r11, %rcx -; SSSE3-NEXT: movq %r13, %rax +; SSSE3-NEXT: addq %rbx, %rdx +; 
SSSE3-NEXT: imulq %r9, %r12 +; SSSE3-NEXT: addq %rdx, %r12 +; SSSE3-NEXT: movq %r9, %rbx +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: movq %rbx, %r13 +; SSSE3-NEXT: imulq %r11, %r13 +; SSSE3-NEXT: movq %rbx, %rax ; SSSE3-NEXT: mulq %r10 ; SSSE3-NEXT: movq %rax, %r15 -; SSSE3-NEXT: movq %rdx, %rbx -; SSSE3-NEXT: subq %rcx, %rbx -; SSSE3-NEXT: andq %r10, %r13 -; SSSE3-NEXT: subq %r13, %rbx +; SSSE3-NEXT: addq %r13, %rdx +; SSSE3-NEXT: imulq %r10, %rbx +; SSSE3-NEXT: addq %rdx, %rbx ; SSSE3-NEXT: addq %rdi, %r15 ; SSSE3-NEXT: adcq %r12, %rbx ; SSSE3-NEXT: movq %r10, %rax @@ -3485,11 +3477,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: addq %r13, %r10 ; SSSE3-NEXT: adcq %r14, %r12 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %ecx +; SSSE3-NEXT: movzbl %al, %r14d ; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r9 ; SSSE3-NEXT: addq %r12, %rax -; SSSE3-NEXT: adcq %rcx, %rdx +; SSSE3-NEXT: adcq %r14, %rdx ; SSSE3-NEXT: addq %r15, %rax ; SSSE3-NEXT: adcq %rbx, %rdx ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12 @@ -3500,56 +3492,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: xorl %r15d, %r15d ; SSSE3-NEXT: orq %rdx, %r10 ; SSSE3-NEXT: setne %r15b -; SSSE3-NEXT: movq %rbp, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movq %rcx, %r11 -; SSSE3-NEXT: andq %rsi, %r11 +; SSSE3-NEXT: movq %rcx, %rbx +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: movq %rsi, %r10 +; SSSE3-NEXT: imulq %rbx, %r10 ; SSSE3-NEXT: movq %rsi, %rax -; SSSE3-NEXT: mulq %rcx +; SSSE3-NEXT: mulq %rbx ; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: movq %rdx, %r10 -; SSSE3-NEXT: subq %r11, %r10 -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSSE3-NEXT: andq %rax, %rcx -; SSSE3-NEXT: subq %rcx, %r10 -; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: movq %rax, %r13 -; SSSE3-NEXT: sarq $63, %r11 -; SSSE3-NEXT: movq %r11, %rcx -; SSSE3-NEXT: andq %rbp, %rcx -; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: addq %r10, %rdx +; SSSE3-NEXT: imulq %rbp, %rbx +; SSSE3-NEXT: addq %rdx, %rbx +; SSSE3-NEXT: movq %rbp, %r10 +; SSSE3-NEXT: sarq $63, %r10 +; SSSE3-NEXT: movq %r10, %r14 +; SSSE3-NEXT: imulq %rcx, %r14 +; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r8 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: movq %rdx, %r14 -; SSSE3-NEXT: subq %rcx, %r14 -; SSSE3-NEXT: andq %r8, %r11 -; SSSE3-NEXT: subq %r11, %r14 -; SSSE3-NEXT: addq %r9, %rbx -; SSSE3-NEXT: adcq %r10, %r14 +; SSSE3-NEXT: movq %rax, %r11 +; SSSE3-NEXT: addq %r14, %rdx +; SSSE3-NEXT: imulq %r8, %r10 +; SSSE3-NEXT: addq %rdx, %r10 +; SSSE3-NEXT: addq %r9, %r11 +; SSSE3-NEXT: adcq %rbx, %r10 ; SSSE3-NEXT: movq %r8, %rax ; SSSE3-NEXT: mulq %rsi ; SSSE3-NEXT: movq %rdx, %r9 -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: movq %rbp, %rax +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: movq %rcx, %rax ; SSSE3-NEXT: mulq %rsi ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: addq %r9, %r11 +; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: addq %r9, %r14 ; SSSE3-NEXT: adcq $0, %rsi ; SSSE3-NEXT: movq %r8, %rax -; SSSE3-NEXT: mulq %r13 +; SSSE3-NEXT: mulq %rbp ; SSSE3-NEXT: movq %rdx, %r8 ; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: addq %r11, %r9 +; SSSE3-NEXT: addq %r14, %r9 ; SSSE3-NEXT: adcq %rsi, %r8 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %ecx -; SSSE3-NEXT: movq %rbp, %rax -; SSSE3-NEXT: mulq %r13 +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movq %rcx, %rax +; SSSE3-NEXT: mulq %rbp ; SSSE3-NEXT: addq %r8, %rax -; SSSE3-NEXT: adcq %rcx, %rdx -; 
SSSE3-NEXT: addq %rbx, %rax -; SSSE3-NEXT: adcq %r14, %rdx +; SSSE3-NEXT: adcq %rsi, %rdx +; SSSE3-NEXT: addq %r11, %rax +; SSSE3-NEXT: adcq %r10, %rdx ; SSSE3-NEXT: movq %r9, 24(%r12) ; SSSE3-NEXT: sarq $63, %r9 ; SSSE3-NEXT: xorq %r9, %rdx @@ -3562,7 +3550,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: negl %r15d ; SSSE3-NEXT: movd %r15d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %r10, 16(%r12) +; SSSE3-NEXT: movq %rbx, 16(%r12) ; SSSE3-NEXT: movq %rdi, (%r12) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 @@ -3581,33 +3569,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx ; SSE41-NEXT: movq %r8, %r14 -; SSE41-NEXT: movq %rcx, %rbp ; SSE41-NEXT: movq %rdx, %r8 ; SSE41-NEXT: movq %rsi, %r11 ; SSE41-NEXT: movq %rdi, %r10 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE41-NEXT: movq %r11, %rbx -; SSE41-NEXT: sarq $63, %rbx -; SSE41-NEXT: movq %rbx, %r15 -; SSE41-NEXT: andq %r14, %r15 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSE41-NEXT: movq %r11, %r12 +; SSE41-NEXT: sarq $63, %r12 +; SSE41-NEXT: movq %r14, %rbx +; SSE41-NEXT: imulq %r12, %rbx ; SSE41-NEXT: movq %r14, %rax -; SSE41-NEXT: mulq %rbx +; SSE41-NEXT: mulq %r12 ; SSE41-NEXT: movq %rax, %rdi -; SSE41-NEXT: movq %rdx, %r12 -; SSE41-NEXT: subq %r15, %r12 -; SSE41-NEXT: andq %r9, %rbx -; SSE41-NEXT: subq %rbx, %r12 -; SSE41-NEXT: movq %r9, %r13 -; SSE41-NEXT: sarq $63, %r13 -; SSE41-NEXT: movq %r13, %rcx -; SSE41-NEXT: andq %r11, %rcx -; SSE41-NEXT: movq %r13, %rax +; SSE41-NEXT: addq %rbx, %rdx +; SSE41-NEXT: imulq %r9, %r12 +; SSE41-NEXT: addq %rdx, %r12 +; SSE41-NEXT: movq %r9, %rbx +; SSE41-NEXT: sarq $63, %rbx +; SSE41-NEXT: movq %rbx, %r13 +; SSE41-NEXT: imulq %r11, %r13 +; SSE41-NEXT: movq %rbx, %rax ; SSE41-NEXT: mulq %r10 ; SSE41-NEXT: movq %rax, %r15 -; SSE41-NEXT: movq %rdx, %rbx -; SSE41-NEXT: subq %rcx, %rbx -; SSE41-NEXT: andq %r10, %r13 -; SSE41-NEXT: subq %r13, %rbx +; SSE41-NEXT: addq %r13, %rdx +; SSE41-NEXT: imulq %r10, %rbx +; SSE41-NEXT: addq %rdx, %rbx ; SSE41-NEXT: addq %rdi, %r15 ; SSE41-NEXT: adcq %r12, %rbx ; SSE41-NEXT: movq %r10, %rax @@ -3627,11 +3613,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: addq %r13, %r10 ; SSE41-NEXT: adcq %r14, %r12 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %ecx +; SSE41-NEXT: movzbl %al, %r14d ; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r9 ; SSE41-NEXT: addq %r12, %rax -; SSE41-NEXT: adcq %rcx, %rdx +; SSE41-NEXT: adcq %r14, %rdx ; SSE41-NEXT: addq %r15, %rax ; SSE41-NEXT: adcq %rbx, %rdx ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12 @@ -3642,56 +3628,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: xorl %r15d, %r15d ; SSE41-NEXT: orq %rdx, %r10 ; SSE41-NEXT: setne %r15b -; SSE41-NEXT: movq %rbp, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rcx, %r11 -; SSE41-NEXT: andq %rsi, %r11 +; SSE41-NEXT: movq %rcx, %rbx +; SSE41-NEXT: sarq $63, %rbx +; SSE41-NEXT: movq %rsi, %r10 +; SSE41-NEXT: imulq %rbx, %r10 ; SSE41-NEXT: movq %rsi, %rax -; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: mulq %rbx ; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: movq %rdx, %r10 -; SSE41-NEXT: subq %r11, %r10 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE41-NEXT: andq %rax, %rcx -; SSE41-NEXT: subq %rcx, %r10 -; SSE41-NEXT: movq %rax, %r11 -; SSE41-NEXT: movq %rax, %r13 -; SSE41-NEXT: sarq 
$63, %r11 -; SSE41-NEXT: movq %r11, %rcx -; SSE41-NEXT: andq %rbp, %rcx -; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: addq %r10, %rdx +; SSE41-NEXT: imulq %rbp, %rbx +; SSE41-NEXT: addq %rdx, %rbx +; SSE41-NEXT: movq %rbp, %r10 +; SSE41-NEXT: sarq $63, %r10 +; SSE41-NEXT: movq %r10, %r14 +; SSE41-NEXT: imulq %rcx, %r14 +; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: movq %rax, %rbx -; SSE41-NEXT: movq %rdx, %r14 -; SSE41-NEXT: subq %rcx, %r14 -; SSE41-NEXT: andq %r8, %r11 -; SSE41-NEXT: subq %r11, %r14 -; SSE41-NEXT: addq %r9, %rbx -; SSE41-NEXT: adcq %r10, %r14 +; SSE41-NEXT: movq %rax, %r11 +; SSE41-NEXT: addq %r14, %rdx +; SSE41-NEXT: imulq %r8, %r10 +; SSE41-NEXT: addq %rdx, %r10 +; SSE41-NEXT: addq %r9, %r11 +; SSE41-NEXT: adcq %rbx, %r10 ; SSE41-NEXT: movq %r8, %rax ; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: movq %rdx, %r9 -; SSE41-NEXT: movq %rax, %r10 -; SSE41-NEXT: movq %rbp, %rax +; SSE41-NEXT: movq %rax, %rbx +; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: movq %rax, %r11 -; SSE41-NEXT: addq %r9, %r11 +; SSE41-NEXT: movq %rax, %r14 +; SSE41-NEXT: addq %r9, %r14 ; SSE41-NEXT: adcq $0, %rsi ; SSE41-NEXT: movq %r8, %rax -; SSE41-NEXT: mulq %r13 +; SSE41-NEXT: mulq %rbp ; SSE41-NEXT: movq %rdx, %r8 ; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: addq %r11, %r9 +; SSE41-NEXT: addq %r14, %r9 ; SSE41-NEXT: adcq %rsi, %r8 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %ecx -; SSE41-NEXT: movq %rbp, %rax -; SSE41-NEXT: mulq %r13 +; SSE41-NEXT: movzbl %al, %esi +; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: mulq %rbp ; SSE41-NEXT: addq %r8, %rax -; SSE41-NEXT: adcq %rcx, %rdx -; SSE41-NEXT: addq %rbx, %rax -; SSE41-NEXT: adcq %r14, %rdx +; SSE41-NEXT: adcq %rsi, %rdx +; SSE41-NEXT: addq %r11, %rax +; SSE41-NEXT: adcq %r10, %rdx ; SSE41-NEXT: movq %r9, 24(%r12) ; SSE41-NEXT: sarq $63, %r9 ; SSE41-NEXT: xorq %r9, %rdx @@ -3703,7 +3685,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: negl %r15d ; SSE41-NEXT: movd %r15d, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movq %r10, 16(%r12) +; SSE41-NEXT: movq %rbx, 16(%r12) ; SSE41-NEXT: movq %rdi, (%r12) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 @@ -3722,33 +3704,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx ; AVX-NEXT: movq %r8, %r14 -; AVX-NEXT: movq %rcx, %rbp ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: movq %rsi, %r11 ; AVX-NEXT: movq %rdi, %r10 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX-NEXT: movq %r11, %rbx -; AVX-NEXT: sarq $63, %rbx -; AVX-NEXT: movq %rbx, %r15 -; AVX-NEXT: andq %r14, %r15 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; AVX-NEXT: movq %r11, %r12 +; AVX-NEXT: sarq $63, %r12 +; AVX-NEXT: movq %r14, %rbx +; AVX-NEXT: imulq %r12, %rbx ; AVX-NEXT: movq %r14, %rax -; AVX-NEXT: mulq %rbx +; AVX-NEXT: mulq %r12 ; AVX-NEXT: movq %rax, %rdi -; AVX-NEXT: movq %rdx, %r12 -; AVX-NEXT: subq %r15, %r12 -; AVX-NEXT: andq %r9, %rbx -; AVX-NEXT: subq %rbx, %r12 -; AVX-NEXT: movq %r9, %r13 -; AVX-NEXT: sarq $63, %r13 -; AVX-NEXT: movq %r13, %rcx -; AVX-NEXT: andq %r11, %rcx -; AVX-NEXT: movq %r13, %rax +; AVX-NEXT: addq %rbx, %rdx +; AVX-NEXT: imulq %r9, %r12 +; AVX-NEXT: addq %rdx, %r12 +; AVX-NEXT: movq %r9, %rbx +; AVX-NEXT: sarq $63, %rbx +; AVX-NEXT: movq %rbx, %r13 +; AVX-NEXT: imulq %r11, %r13 +; AVX-NEXT: movq %rbx, %rax ; AVX-NEXT: mulq %r10 ; AVX-NEXT: movq %rax, %r15 -; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: subq 
%rcx, %rbx -; AVX-NEXT: andq %r10, %r13 -; AVX-NEXT: subq %r13, %rbx +; AVX-NEXT: addq %r13, %rdx +; AVX-NEXT: imulq %r10, %rbx +; AVX-NEXT: addq %rdx, %rbx ; AVX-NEXT: addq %rdi, %r15 ; AVX-NEXT: adcq %r12, %rbx ; AVX-NEXT: movq %r10, %rax @@ -3768,11 +3748,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: addq %r13, %r10 ; AVX-NEXT: adcq %r14, %r12 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %ecx +; AVX-NEXT: movzbl %al, %r14d ; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r9 ; AVX-NEXT: addq %r12, %rax -; AVX-NEXT: adcq %rcx, %rdx +; AVX-NEXT: adcq %r14, %rdx ; AVX-NEXT: addq %r15, %rax ; AVX-NEXT: adcq %rbx, %rdx ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12 @@ -3783,56 +3763,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: xorl %r15d, %r15d ; AVX-NEXT: orq %rdx, %r10 ; AVX-NEXT: setne %r15b -; AVX-NEXT: movq %rbp, %rcx -; AVX-NEXT: sarq $63, %rcx -; AVX-NEXT: movq %rcx, %r11 -; AVX-NEXT: andq %rsi, %r11 +; AVX-NEXT: movq %rcx, %rbx +; AVX-NEXT: sarq $63, %rbx +; AVX-NEXT: movq %rsi, %r10 +; AVX-NEXT: imulq %rbx, %r10 ; AVX-NEXT: movq %rsi, %rax -; AVX-NEXT: mulq %rcx +; AVX-NEXT: mulq %rbx ; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: movq %rdx, %r10 -; AVX-NEXT: subq %r11, %r10 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: andq %rax, %rcx -; AVX-NEXT: subq %rcx, %r10 -; AVX-NEXT: movq %rax, %r11 -; AVX-NEXT: movq %rax, %r13 -; AVX-NEXT: sarq $63, %r11 -; AVX-NEXT: movq %r11, %rcx -; AVX-NEXT: andq %rbp, %rcx -; AVX-NEXT: movq %r11, %rax +; AVX-NEXT: addq %r10, %rdx +; AVX-NEXT: imulq %rbp, %rbx +; AVX-NEXT: addq %rdx, %rbx +; AVX-NEXT: movq %rbp, %r10 +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: movq %r10, %r14 +; AVX-NEXT: imulq %rcx, %r14 +; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r8 -; AVX-NEXT: movq %rax, %rbx -; AVX-NEXT: movq %rdx, %r14 -; AVX-NEXT: subq %rcx, %r14 -; AVX-NEXT: andq %r8, %r11 -; AVX-NEXT: subq %r11, %r14 -; AVX-NEXT: addq %r9, %rbx -; AVX-NEXT: adcq %r10, %r14 +; AVX-NEXT: movq %rax, %r11 +; AVX-NEXT: addq %r14, %rdx +; AVX-NEXT: imulq %r8, %r10 +; AVX-NEXT: addq %rdx, %r10 +; AVX-NEXT: addq %r9, %r11 +; AVX-NEXT: adcq %rbx, %r10 ; AVX-NEXT: movq %r8, %rax ; AVX-NEXT: mulq %rsi ; AVX-NEXT: movq %rdx, %r9 -; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: movq %rbp, %rax +; AVX-NEXT: movq %rax, %rbx +; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: movq %rax, %r11 -; AVX-NEXT: addq %r9, %r11 +; AVX-NEXT: movq %rax, %r14 +; AVX-NEXT: addq %r9, %r14 ; AVX-NEXT: adcq $0, %rsi ; AVX-NEXT: movq %r8, %rax -; AVX-NEXT: mulq %r13 +; AVX-NEXT: mulq %rbp ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: addq %r11, %r9 +; AVX-NEXT: addq %r14, %r9 ; AVX-NEXT: adcq %rsi, %r8 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %ecx -; AVX-NEXT: movq %rbp, %rax -; AVX-NEXT: mulq %r13 +; AVX-NEXT: movzbl %al, %esi +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: mulq %rbp ; AVX-NEXT: addq %r8, %rax -; AVX-NEXT: adcq %rcx, %rdx -; AVX-NEXT: addq %rbx, %rax -; AVX-NEXT: adcq %r14, %rdx +; AVX-NEXT: adcq %rsi, %rdx +; AVX-NEXT: addq %r11, %rax +; AVX-NEXT: adcq %r10, %rdx ; AVX-NEXT: movq %r9, 24(%r12) ; AVX-NEXT: sarq $63, %r9 ; AVX-NEXT: xorq %r9, %rdx @@ -3844,7 +3820,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: negl %r15d ; AVX-NEXT: vmovd %r15d, %xmm0 ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %r10, 16(%r12) +; AVX-NEXT: movq %rbx, 16(%r12) ; AVX-NEXT: movq %rdi, (%r12) ; 
AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 @@ -3862,35 +3838,32 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movq %r9, %rbp ; AVX512F-NEXT: movq %rcx, %r11 ; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rsi, %rbp -; AVX512F-NEXT: movq %rdi, %r9 +; AVX512F-NEXT: movq %rsi, %r9 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512F-NEXT: movq %rcx, %rbx -; AVX512F-NEXT: sarq $63, %rbx -; AVX512F-NEXT: movq %rbx, %r14 -; AVX512F-NEXT: andq %r15, %r14 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512F-NEXT: movq %rcx, %r12 +; AVX512F-NEXT: sarq $63, %r12 +; AVX512F-NEXT: movq %r15, %rbx +; AVX512F-NEXT: imulq %r12, %rbx ; AVX512F-NEXT: movq %r15, %rax -; AVX512F-NEXT: mulq %rbx +; AVX512F-NEXT: mulq %r12 ; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: movq %rdx, %r12 -; AVX512F-NEXT: subq %r14, %r12 -; AVX512F-NEXT: andq %rdi, %rbx -; AVX512F-NEXT: subq %rbx, %r12 -; AVX512F-NEXT: movq %rdi, %r13 -; AVX512F-NEXT: sarq $63, %r13 -; AVX512F-NEXT: movq %r13, %rsi -; AVX512F-NEXT: andq %r11, %rsi -; AVX512F-NEXT: movq %r13, %rax +; AVX512F-NEXT: addq %rbx, %rdx +; AVX512F-NEXT: imulq %rsi, %r12 +; AVX512F-NEXT: addq %rdx, %r12 +; AVX512F-NEXT: movq %rsi, %rbx +; AVX512F-NEXT: sarq $63, %rbx +; AVX512F-NEXT: movq %rbx, %r13 +; AVX512F-NEXT: imulq %r11, %r13 +; AVX512F-NEXT: movq %rbx, %rax ; AVX512F-NEXT: mulq %r10 ; AVX512F-NEXT: movq %rax, %r14 -; AVX512F-NEXT: movq %rdx, %rbx -; AVX512F-NEXT: subq %rsi, %rbx -; AVX512F-NEXT: andq %r10, %r13 -; AVX512F-NEXT: subq %r13, %rbx +; AVX512F-NEXT: addq %r13, %rdx +; AVX512F-NEXT: imulq %r10, %rbx +; AVX512F-NEXT: addq %rdx, %rbx ; AVX512F-NEXT: addq %rcx, %r14 ; AVX512F-NEXT: adcq %r12, %rbx ; AVX512F-NEXT: movq %r10, %rax @@ -3904,78 +3877,74 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: addq %r12, %r13 ; AVX512F-NEXT: adcq $0, %r15 ; AVX512F-NEXT: movq %r10, %rax -; AVX512F-NEXT: mulq %rdi +; AVX512F-NEXT: mulq %rsi ; AVX512F-NEXT: movq %rdx, %r12 ; AVX512F-NEXT: movq %rax, %r10 ; AVX512F-NEXT: addq %r13, %r10 ; AVX512F-NEXT: adcq %r15, %r12 ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %esi +; AVX512F-NEXT: movzbl %al, %r15d ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %rdi +; AVX512F-NEXT: mulq %rsi ; AVX512F-NEXT: addq %r12, %rax -; AVX512F-NEXT: adcq %rsi, %rdx +; AVX512F-NEXT: adcq %r15, %rdx ; AVX512F-NEXT: addq %r14, %rax ; AVX512F-NEXT: adcq %rbx, %rdx -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512F-NEXT: movq %r10, 24(%r13) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512F-NEXT: movq %r10, 24(%r12) ; AVX512F-NEXT: sarq $63, %r10 ; AVX512F-NEXT: xorq %r10, %rdx ; AVX512F-NEXT: xorq %rax, %r10 ; AVX512F-NEXT: orq %rdx, %r10 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: movq %rbp, %rsi +; AVX512F-NEXT: movq %r9, %rsi ; AVX512F-NEXT: sarq $63, %rsi -; AVX512F-NEXT: movq %rsi, %rdi -; AVX512F-NEXT: andq %r8, %rdi +; AVX512F-NEXT: movq %r8, %r11 +; AVX512F-NEXT: imulq %rsi, %r11 ; AVX512F-NEXT: movq %r8, %rax ; AVX512F-NEXT: mulq %rsi ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: movq %rdx, %r11 -; AVX512F-NEXT: subq %rdi, %r11 -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512F-NEXT: andq %rax, %rsi -; AVX512F-NEXT: subq %rsi, %r11 +; AVX512F-NEXT: 
addq %r11, %rdx +; AVX512F-NEXT: imulq %rbp, %rsi +; AVX512F-NEXT: addq %rdx, %rsi +; AVX512F-NEXT: movq %rbp, %r11 +; AVX512F-NEXT: sarq $63, %r11 +; AVX512F-NEXT: movq %r11, %r14 +; AVX512F-NEXT: imulq %r9, %r14 +; AVX512F-NEXT: movq %r11, %rax +; AVX512F-NEXT: mulq %rdi ; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: movq %rax, %r12 -; AVX512F-NEXT: sarq $63, %rbx -; AVX512F-NEXT: movq %rbx, %rsi -; AVX512F-NEXT: andq %rbp, %rsi -; AVX512F-NEXT: movq %rbx, %rax -; AVX512F-NEXT: mulq %r9 -; AVX512F-NEXT: movq %rax, %r14 -; AVX512F-NEXT: movq %rdx, %r15 -; AVX512F-NEXT: subq %rsi, %r15 -; AVX512F-NEXT: andq %r9, %rbx -; AVX512F-NEXT: subq %rbx, %r15 -; AVX512F-NEXT: addq %r10, %r14 -; AVX512F-NEXT: adcq %r11, %r15 -; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: addq %r14, %rdx +; AVX512F-NEXT: imulq %rdi, %r11 +; AVX512F-NEXT: addq %rdx, %r11 +; AVX512F-NEXT: addq %r10, %rbx +; AVX512F-NEXT: adcq %rsi, %r11 +; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rax, %r11 -; AVX512F-NEXT: movq %rbp, %rax +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %r9, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r8 -; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: addq %r10, %rbx +; AVX512F-NEXT: movq %rax, %r15 +; AVX512F-NEXT: addq %r10, %r15 ; AVX512F-NEXT: adcq $0, %r8 -; AVX512F-NEXT: movq %r9, %rax -; AVX512F-NEXT: mulq %r12 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: mulq %rbp ; AVX512F-NEXT: movq %rdx, %rdi ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: addq %rbx, %r10 +; AVX512F-NEXT: addq %r15, %r10 ; AVX512F-NEXT: adcq %r8, %rdi ; AVX512F-NEXT: setb %al ; AVX512F-NEXT: movzbl %al, %esi -; AVX512F-NEXT: movq %rbp, %rax -; AVX512F-NEXT: mulq %r12 +; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: mulq %rbp ; AVX512F-NEXT: addq %rdi, %rax ; AVX512F-NEXT: adcq %rsi, %rdx -; AVX512F-NEXT: addq %r14, %rax -; AVX512F-NEXT: adcq %r15, %rdx -; AVX512F-NEXT: movq %r10, 8(%r13) +; AVX512F-NEXT: addq %rbx, %rax +; AVX512F-NEXT: adcq %r11, %rdx +; AVX512F-NEXT: movq %r10, 8(%r12) ; AVX512F-NEXT: sarq $63, %r10 ; AVX512F-NEXT: xorq %r10, %rdx ; AVX512F-NEXT: xorq %rax, %r10 @@ -3987,8 +3956,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq %rcx, 16(%r13) -; AVX512F-NEXT: movq %r11, (%r13) +; AVX512F-NEXT: movq %rcx, 16(%r12) +; AVX512F-NEXT: movq %r14, (%r12) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 @@ -4005,35 +3974,32 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: pushq %r13 ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx -; AVX512BW-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: movq %r9, %rbp ; AVX512BW-NEXT: movq %rcx, %r11 ; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rsi, %rbp -; AVX512BW-NEXT: movq %rdi, %r9 +; AVX512BW-NEXT: movq %rsi, %r9 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-NEXT: movq %rcx, %rbx -; AVX512BW-NEXT: sarq $63, %rbx -; AVX512BW-NEXT: movq %rbx, %r14 -; AVX512BW-NEXT: andq %r15, %r14 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512BW-NEXT: movq %rcx, %r12 +; AVX512BW-NEXT: sarq $63, %r12 +; AVX512BW-NEXT: movq %r15, %rbx +; AVX512BW-NEXT: imulq %r12, %rbx ; AVX512BW-NEXT: movq %r15, %rax -; AVX512BW-NEXT: 
mulq %rbx +; AVX512BW-NEXT: mulq %r12 ; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: movq %rdx, %r12 -; AVX512BW-NEXT: subq %r14, %r12 -; AVX512BW-NEXT: andq %rdi, %rbx -; AVX512BW-NEXT: subq %rbx, %r12 -; AVX512BW-NEXT: movq %rdi, %r13 -; AVX512BW-NEXT: sarq $63, %r13 -; AVX512BW-NEXT: movq %r13, %rsi -; AVX512BW-NEXT: andq %r11, %rsi -; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: addq %rbx, %rdx +; AVX512BW-NEXT: imulq %rsi, %r12 +; AVX512BW-NEXT: addq %rdx, %r12 +; AVX512BW-NEXT: movq %rsi, %rbx +; AVX512BW-NEXT: sarq $63, %rbx +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: imulq %r11, %r13 +; AVX512BW-NEXT: movq %rbx, %rax ; AVX512BW-NEXT: mulq %r10 ; AVX512BW-NEXT: movq %rax, %r14 -; AVX512BW-NEXT: movq %rdx, %rbx -; AVX512BW-NEXT: subq %rsi, %rbx -; AVX512BW-NEXT: andq %r10, %r13 -; AVX512BW-NEXT: subq %r13, %rbx +; AVX512BW-NEXT: addq %r13, %rdx +; AVX512BW-NEXT: imulq %r10, %rbx +; AVX512BW-NEXT: addq %rdx, %rbx ; AVX512BW-NEXT: addq %rcx, %r14 ; AVX512BW-NEXT: adcq %r12, %rbx ; AVX512BW-NEXT: movq %r10, %rax @@ -4047,78 +4013,74 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: addq %r12, %r13 ; AVX512BW-NEXT: adcq $0, %r15 ; AVX512BW-NEXT: movq %r10, %rax -; AVX512BW-NEXT: mulq %rdi +; AVX512BW-NEXT: mulq %rsi ; AVX512BW-NEXT: movq %rdx, %r12 ; AVX512BW-NEXT: movq %rax, %r10 ; AVX512BW-NEXT: addq %r13, %r10 ; AVX512BW-NEXT: adcq %r15, %r12 ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %esi +; AVX512BW-NEXT: movzbl %al, %r15d ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %rdi +; AVX512BW-NEXT: mulq %rsi ; AVX512BW-NEXT: addq %r12, %rax -; AVX512BW-NEXT: adcq %rsi, %rdx +; AVX512BW-NEXT: adcq %r15, %rdx ; AVX512BW-NEXT: addq %r14, %rax ; AVX512BW-NEXT: adcq %rbx, %rdx -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512BW-NEXT: movq %r10, 24(%r13) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512BW-NEXT: movq %r10, 24(%r12) ; AVX512BW-NEXT: sarq $63, %r10 ; AVX512BW-NEXT: xorq %r10, %rdx ; AVX512BW-NEXT: xorq %rax, %r10 ; AVX512BW-NEXT: orq %rdx, %r10 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: movq %rbp, %rsi +; AVX512BW-NEXT: movq %r9, %rsi ; AVX512BW-NEXT: sarq $63, %rsi -; AVX512BW-NEXT: movq %rsi, %rdi -; AVX512BW-NEXT: andq %r8, %rdi +; AVX512BW-NEXT: movq %r8, %r11 +; AVX512BW-NEXT: imulq %rsi, %r11 ; AVX512BW-NEXT: movq %r8, %rax ; AVX512BW-NEXT: mulq %rsi ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: movq %rdx, %r11 -; AVX512BW-NEXT: subq %rdi, %r11 -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: andq %rax, %rsi -; AVX512BW-NEXT: subq %rsi, %r11 +; AVX512BW-NEXT: addq %r11, %rdx +; AVX512BW-NEXT: imulq %rbp, %rsi +; AVX512BW-NEXT: addq %rdx, %rsi +; AVX512BW-NEXT: movq %rbp, %r11 +; AVX512BW-NEXT: sarq $63, %r11 +; AVX512BW-NEXT: movq %r11, %r14 +; AVX512BW-NEXT: imulq %r9, %r14 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: mulq %rdi ; AVX512BW-NEXT: movq %rax, %rbx -; AVX512BW-NEXT: movq %rax, %r12 -; AVX512BW-NEXT: sarq $63, %rbx -; AVX512BW-NEXT: movq %rbx, %rsi -; AVX512BW-NEXT: andq %rbp, %rsi -; AVX512BW-NEXT: movq %rbx, %rax -; AVX512BW-NEXT: mulq %r9 -; AVX512BW-NEXT: movq %rax, %r14 -; AVX512BW-NEXT: movq %rdx, %r15 -; AVX512BW-NEXT: subq %rsi, %r15 -; AVX512BW-NEXT: andq %r9, %rbx -; AVX512BW-NEXT: subq %rbx, %r15 -; AVX512BW-NEXT: addq %r10, %r14 -; AVX512BW-NEXT: adcq %r11, %r15 -; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: addq %r14, %rdx +; AVX512BW-NEXT: imulq %rdi, %r11 +; 
AVX512BW-NEXT: addq %rdx, %r11 +; AVX512BW-NEXT: addq %r10, %rbx +; AVX512BW-NEXT: adcq %rsi, %r11 +; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rax, %r11 -; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %r9, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r8 -; AVX512BW-NEXT: movq %rax, %rbx -; AVX512BW-NEXT: addq %r10, %rbx +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: addq %r10, %r15 ; AVX512BW-NEXT: adcq $0, %r8 -; AVX512BW-NEXT: movq %r9, %rax -; AVX512BW-NEXT: mulq %r12 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: mulq %rbp ; AVX512BW-NEXT: movq %rdx, %rdi ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %rbx, %r10 +; AVX512BW-NEXT: addq %r15, %r10 ; AVX512BW-NEXT: adcq %r8, %rdi ; AVX512BW-NEXT: setb %al ; AVX512BW-NEXT: movzbl %al, %esi -; AVX512BW-NEXT: movq %rbp, %rax -; AVX512BW-NEXT: mulq %r12 +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: mulq %rbp ; AVX512BW-NEXT: addq %rdi, %rax ; AVX512BW-NEXT: adcq %rsi, %rdx -; AVX512BW-NEXT: addq %r14, %rax -; AVX512BW-NEXT: adcq %r15, %rdx -; AVX512BW-NEXT: movq %r10, 8(%r13) +; AVX512BW-NEXT: addq %rbx, %rax +; AVX512BW-NEXT: adcq %r11, %rdx +; AVX512BW-NEXT: movq %r10, 8(%r12) ; AVX512BW-NEXT: sarq $63, %r10 ; AVX512BW-NEXT: xorq %r10, %rdx ; AVX512BW-NEXT: xorq %rax, %r10 @@ -4130,8 +4092,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq %rcx, 16(%r13) -; AVX512BW-NEXT: movq %r11, (%r13) +; AVX512BW-NEXT: movq %rcx, 16(%r12) +; AVX512BW-NEXT: movq %r14, (%r12) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index 508b0d7..4adc80b 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -215,36 +215,35 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: sarl $31, %edi +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: movl %ebp, %esi +; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: addl %edx, %edi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %esi, %edi -; WIN32-NEXT: andl %eax, %edi -; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %esi, %ebp +; WIN32-NEXT: imull %ecx, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: addl %ebp, %edx +; WIN32-NEXT: imull %ecx, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: addl %ebx, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: subl %edi, %ecx -; WIN32-NEXT: andl %ebp, %esi -; WIN32-NEXT: subl %esi, %ecx -; WIN32-NEXT: sarl $31, %ebp -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: andl %ebx, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: subl %edi, %esi -; WIN32-NEXT: andl %ebx, %ebp -; WIN32-NEXT: subl %ebp, %esi -; 
WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %ecx, %esi -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: adcl %edi, %esi +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl %ecx, %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebx @@ -263,7 +262,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: addl %edi, %eax ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: adcl %esi, %edx ; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: sarl $31, %ecx @@ -272,7 +271,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %ebp, 4(%eax) -; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al ; WIN32-NEXT: addl $8, %esp @@ -574,52 +573,49 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: movl %eax, %esi ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: andl %eax, %edi +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: imull %ecx, %edi +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: subl %edi, %esi -; WIN32-NEXT: andl %ebx, %ecx -; WIN32-NEXT: subl %ecx, %esi -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: andl %ebp, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: subl %edi, %ebx -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: andl %ebp, %ecx -; WIN32-NEXT: subl %ecx, %ebx +; WIN32-NEXT: addl %edi, %edx +; WIN32-NEXT: imull %ebx, %ecx +; WIN32-NEXT: addl %edx, %ecx +; WIN32-NEXT: sarl $31, %ebx +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: imull %esi, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: addl %edi, %edx +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: imull %esi, %ebx +; WIN32-NEXT: addl %edx, %ebx ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ebx +; WIN32-NEXT: adcl %ecx, %ebx ; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %esi, %ecx -; WIN32-NEXT: adcl $0, %ebp +; WIN32-NEXT: 
mull %ebp +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: adcl $0, %ecx ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %ecx, %esi -; WIN32-NEXT: adcl %ebp, %edi +; WIN32-NEXT: addl %ebp, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: adcl %ecx, %edi ; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) @@ -1003,32 +999,30 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %ecx, %esi -; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: sarl $31, %edi +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: imull %ebx, %edi +; WIN32-NEXT: addl %edx, %edi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %esi, %edi -; WIN32-NEXT: andl %eax, %edi -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: subl %edi, %ecx -; WIN32-NEXT: andl %ebx, %esi -; WIN32-NEXT: subl %esi, %ecx -; WIN32-NEXT: sarl $31, %ebx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: andl %ebp, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: subl %edi, %esi -; WIN32-NEXT: andl %ebp, %ebx -; WIN32-NEXT: subl %ebx, %esi -; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %esi, %ebx +; WIN32-NEXT: imull %ecx, %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: addl %ebx, %edx +; WIN32-NEXT: imull %ecx, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: addl %ebp, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ecx, %esi -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: adcl %edi, %esi +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl %ecx, %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebx @@ -1710,62 +1704,57 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $16, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %esi -; WIN32-NEXT: movl 4(%eax), %eax -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %edi, %ecx -; WIN32-NEXT: andl %eax, %ecx -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl 4(%eax), %ebp +; WIN32-NEXT: sarl $31, %ebx +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: imull %ebp, %ecx +; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: subl %ecx, %ebp +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: addl %ecx, %edx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: andl %esi, %edi -; WIN32-NEXT: subl %edi, %ebp -; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: imull %esi, %ebx +; WIN32-NEXT: addl %edx, %ebx +; WIN32-NEXT: movl %ebp, %ecx 
+; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ecx, %ebx -; WIN32-NEXT: andl %eax, %ebx +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: imull %ecx, %edi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: subl %ebx, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: andl %edx, %ecx -; WIN32-NEXT: subl %ecx, %edi -; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl %edi, %edx +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: addl %edx, %ecx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %ebp, %edi +; WIN32-NEXT: adcl %ebx, %ecx ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %edi ; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: addl %ebx, %edi ; WIN32-NEXT: adcl $0, %ebp ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %ebx, %esi -; WIN32-NEXT: adcl %ebp, %ecx -; WIN32-NEXT: setb %bl +; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: adcl %ebp, %ebx +; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: movzbl %bl, %ecx -; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; WIN32-NEXT: adcl %edi, %edx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: adcl %ecx, %edx ; WIN32-NEXT: movl %esi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx @@ -1773,7 +1762,7 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %esi, 4(%eax) -; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al ; WIN32-NEXT: addl $16, %esp @@ -1821,35 +1810,35 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $12, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %ebp ; WIN32-NEXT: movl 4(%eax), %ebx -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %esi, %edi -; WIN32-NEXT: andl %ebp, %edi +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: sarl $31, %edi +; WIN32-NEXT: movl %ebp, %esi +; WIN32-NEXT: imull %edi, %esi ; WIN32-NEXT: 
movl %ebp, %eax -; WIN32-NEXT: mull %esi +; WIN32-NEXT: mull %edi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: subl %edi, %ecx +; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: movl %ebx, %esi ; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: andl %ebx, %esi -; WIN32-NEXT: subl %esi, %ecx -; WIN32-NEXT: sarl $31, %ebx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: andl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: subl %edi, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: andl %edx, %ebx -; WIN32-NEXT: subl %ebx, %esi +; WIN32-NEXT: imull %ebx, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: movl %esi, %ebx +; WIN32-NEXT: imull %ecx, %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: addl %ebx, %edx +; WIN32-NEXT: imull %ecx, %esi +; WIN32-NEXT: addl %edx, %esi ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %ecx, %esi -; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: adcl %edi, %esi +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -- 2.7.4