const MachineFunction &MF) const;
/// Return true if the instruction is trivially rematerializable, meaning it
- /// has no side effects. Uses of constants and unallocatable physical
- /// registers are always trivial to rematerialize so that the instructions
- /// result is independent of the place in the function. Uses of virtual
- /// registers are allowed but it is caller's responsility to ensure these
- /// operands are valid at the point the instruction is beeing moved.
+ /// has no side effects and requires no operands that aren't always available.
+ /// This means the only allowed uses are constants and unallocatable physical
+ /// registers so that the instruction's result is independent of the place
+ /// in the function.
bool isTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA = nullptr) const {
return MI.getOpcode() == TargetOpcode::IMPLICIT_DEF ||
/// set, this hook lets the target specify whether the instruction is actually
/// trivially rematerializable, taking into consideration its operands. This
/// predicate must return false if the instruction has any side effects other
- /// than producing a value.
+ /// than producing a value, or if it requires any address registers that are
+ /// not always available.
/// Requirements must be check as stated in isTriviallyReMaterializable() .
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
// Remat clients assume operand 0 is the defined register.
- if (!MI.getNumOperands() || !MI.getOperand(0).isReg() ||
- MI.getOperand(0).isTied())
+ if (!MI.getNumOperands() || !MI.getOperand(0).isReg())
return false;
Register DefReg = MI.getOperand(0).getReg();
// same virtual register, though.
if (MO.isDef() && Reg != DefReg)
return false;
+
+ // Don't allow any virtual-register uses. Rematting an instruction with
+ // virtual register uses would lengthen the live ranges of the uses, which
+ // is not necessarily a good idea, certainly not "trivial".
+ if (MO.isUse())
+ return false;
}
// Everything checked out.
S_NOP 0, implicit %2
S_ENDPGM 0
...
-# The liverange of %0 covers a point of rematerialization, source value is
-# availabe.
----
-name: test_remat_s_mov_b32_vreg_src_long_lr
-tracksRegLiveness: true
-machineFunctionInfo:
- stackPtrOffsetReg: $sgpr32
-body: |
- bb.0:
- ; GCN-LABEL: name: test_remat_s_mov_b32_vreg_src_long_lr
- ; GCN: renamable $sgpr0 = IMPLICIT_DEF
- ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
- ; GCN: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
- ; GCN: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
- ; GCN: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN: S_NOP 0, implicit killed renamable $sgpr0
- ; GCN: S_ENDPGM 0
- %0:sreg_32 = IMPLICIT_DEF
- %1:sreg_32 = S_MOV_B32 %0:sreg_32
- %2:sreg_32 = S_MOV_B32 %0:sreg_32
- %3:sreg_32 = S_MOV_B32 %0:sreg_32
- S_NOP 0, implicit %1
- S_NOP 0, implicit %2
- S_NOP 0, implicit %3
- S_NOP 0, implicit %0
- S_ENDPGM 0
-...
-# The liverange of %0 does not cover a point of rematerialization, source value is
-# unavailabe and we do not want to artificially extend the liverange.
----
-name: test_no_remat_s_mov_b32_vreg_src_short_lr
-tracksRegLiveness: true
-machineFunctionInfo:
- stackPtrOffsetReg: $sgpr32
-body: |
- bb.0:
- ; GCN-LABEL: name: test_no_remat_s_mov_b32_vreg_src_short_lr
- ; GCN: renamable $sgpr0 = IMPLICIT_DEF
- ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
- ; GCN: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5)
- ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
- ; GCN: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5)
- ; GCN: renamable $sgpr0 = S_MOV_B32 killed renamable $sgpr0
- ; GCN: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5)
- ; GCN: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5)
- ; GCN: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN: S_NOP 0, implicit killed renamable $sgpr0
- ; GCN: S_ENDPGM 0
- %0:sreg_32 = IMPLICIT_DEF
- %1:sreg_32 = S_MOV_B32 %0:sreg_32
- %2:sreg_32 = S_MOV_B32 %0:sreg_32
- %3:sreg_32 = S_MOV_B32 %0:sreg_32
- S_NOP 0, implicit %1
- S_NOP 0, implicit %2
- S_NOP 0, implicit %3
- S_ENDPGM 0
-...
---
name: test_remat_s_mov_b64
tracksRegLiveness: true
; ENABLE-NEXT: pophs {r11, pc}
; ENABLE-NEXT: .LBB0_3: @ %while.body.preheader
; ENABLE-NEXT: movw r12, :lower16:skip
-; ENABLE-NEXT: sub r3, r1, #1
+; ENABLE-NEXT: sub r1, r1, #1
; ENABLE-NEXT: movt r12, :upper16:skip
; ENABLE-NEXT: .LBB0_4: @ %while.body
; ENABLE-NEXT: @ =>This Inner Loop Header: Depth=1
-; ENABLE-NEXT: ldrb r1, [r0]
-; ENABLE-NEXT: ldrb r1, [r12, r1]
-; ENABLE-NEXT: add r0, r0, r1
-; ENABLE-NEXT: sub r1, r3, #1
-; ENABLE-NEXT: cmp r1, r3
+; ENABLE-NEXT: ldrb r3, [r0]
+; ENABLE-NEXT: ldrb r3, [r12, r3]
+; ENABLE-NEXT: add r0, r0, r3
+; ENABLE-NEXT: sub r3, r1, #1
+; ENABLE-NEXT: cmp r3, r1
; ENABLE-NEXT: bhs .LBB0_6
; ENABLE-NEXT: @ %bb.5: @ %while.body
; ENABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLE-NEXT: cmp r0, r2
-; ENABLE-NEXT: mov r3, r1
+; ENABLE-NEXT: mov r1, r3
; ENABLE-NEXT: blo .LBB0_4
; ENABLE-NEXT: .LBB0_6: @ %if.end29
; ENABLE-NEXT: pop {r11, pc}
; DISABLE-NEXT: pophs {r11, pc}
; DISABLE-NEXT: .LBB0_3: @ %while.body.preheader
; DISABLE-NEXT: movw r12, :lower16:skip
-; DISABLE-NEXT: sub r3, r1, #1
+; DISABLE-NEXT: sub r1, r1, #1
; DISABLE-NEXT: movt r12, :upper16:skip
; DISABLE-NEXT: .LBB0_4: @ %while.body
; DISABLE-NEXT: @ =>This Inner Loop Header: Depth=1
-; DISABLE-NEXT: ldrb r1, [r0]
-; DISABLE-NEXT: ldrb r1, [r12, r1]
-; DISABLE-NEXT: add r0, r0, r1
-; DISABLE-NEXT: sub r1, r3, #1
-; DISABLE-NEXT: cmp r1, r3
+; DISABLE-NEXT: ldrb r3, [r0]
+; DISABLE-NEXT: ldrb r3, [r12, r3]
+; DISABLE-NEXT: add r0, r0, r3
+; DISABLE-NEXT: sub r3, r1, #1
+; DISABLE-NEXT: cmp r3, r1
; DISABLE-NEXT: bhs .LBB0_6
; DISABLE-NEXT: @ %bb.5: @ %while.body
; DISABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1
; DISABLE-NEXT: cmp r0, r2
-; DISABLE-NEXT: mov r3, r1
+; DISABLE-NEXT: mov r1, r3
; DISABLE-NEXT: blo .LBB0_4
; DISABLE-NEXT: .LBB0_6: @ %if.end29
; DISABLE-NEXT: pop {r11, pc}
; SCALAR-NEXT: push {r4, r5, r11, lr}
; SCALAR-NEXT: rsb r3, r2, #0
; SCALAR-NEXT: and r4, r2, #63
-; SCALAR-NEXT: and r12, r3, #63
-; SCALAR-NEXT: rsb r3, r12, #32
+; SCALAR-NEXT: and lr, r3, #63
+; SCALAR-NEXT: rsb r3, lr, #32
; SCALAR-NEXT: lsl r2, r0, r4
-; SCALAR-NEXT: lsr lr, r0, r12
-; SCALAR-NEXT: orr r3, lr, r1, lsl r3
-; SCALAR-NEXT: subs lr, r12, #32
-; SCALAR-NEXT: lsrpl r3, r1, lr
+; SCALAR-NEXT: lsr r12, r0, lr
+; SCALAR-NEXT: orr r3, r12, r1, lsl r3
+; SCALAR-NEXT: subs r12, lr, #32
+; SCALAR-NEXT: lsrpl r3, r1, r12
; SCALAR-NEXT: subs r5, r4, #32
; SCALAR-NEXT: movwpl r2, #0
; SCALAR-NEXT: cmp r5, #0
; SCALAR-NEXT: lsr r3, r0, r3
; SCALAR-NEXT: orr r3, r3, r1, lsl r4
; SCALAR-NEXT: lslpl r3, r0, r5
-; SCALAR-NEXT: lsr r0, r1, r12
-; SCALAR-NEXT: cmp lr, #0
+; SCALAR-NEXT: lsr r0, r1, lr
+; SCALAR-NEXT: cmp r12, #0
; SCALAR-NEXT: movwpl r0, #0
; SCALAR-NEXT: orr r1, r3, r0
; SCALAR-NEXT: mov r0, r2
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r11, lr}
; CHECK-NEXT: push {r4, r5, r11, lr}
-; CHECK-NEXT: and r12, r2, #63
+; CHECK-NEXT: and lr, r2, #63
; CHECK-NEXT: rsb r2, r2, #0
-; CHECK-NEXT: rsb r3, r12, #32
+; CHECK-NEXT: rsb r3, lr, #32
; CHECK-NEXT: and r4, r2, #63
-; CHECK-NEXT: lsr lr, r0, r12
-; CHECK-NEXT: orr r3, lr, r1, lsl r3
-; CHECK-NEXT: subs lr, r12, #32
+; CHECK-NEXT: lsr r12, r0, lr
+; CHECK-NEXT: orr r3, r12, r1, lsl r3
+; CHECK-NEXT: subs r12, lr, #32
; CHECK-NEXT: lsl r2, r0, r4
-; CHECK-NEXT: lsrpl r3, r1, lr
+; CHECK-NEXT: lsrpl r3, r1, r12
; CHECK-NEXT: subs r5, r4, #32
; CHECK-NEXT: movwpl r2, #0
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: lsr r3, r0, r3
; CHECK-NEXT: orr r3, r3, r1, lsl r4
; CHECK-NEXT: lslpl r3, r0, r5
-; CHECK-NEXT: lsr r0, r1, r12
-; CHECK-NEXT: cmp lr, #0
+; CHECK-NEXT: lsr r0, r1, lr
+; CHECK-NEXT: cmp r12, #0
; CHECK-NEXT: movwpl r0, #0
; CHECK-NEXT: orr r1, r0, r3
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r3, #0
; CHECK-NEXT: bl __aeabi_uldivmod
; CHECK-NEXT: add r0, r2, #27
-; CHECK-NEXT: lsl r2, r7, #27
-; CHECK-NEXT: and r12, r0, #63
; CHECK-NEXT: lsl r6, r6, #27
+; CHECK-NEXT: and r1, r0, #63
+; CHECK-NEXT: lsl r2, r7, #27
; CHECK-NEXT: orr r7, r6, r7, lsr #5
-; CHECK-NEXT: rsb r3, r12, #32
-; CHECK-NEXT: lsr r2, r2, r12
; CHECK-NEXT: mov r6, #63
-; CHECK-NEXT: orr r2, r2, r7, lsl r3
-; CHECK-NEXT: subs r3, r12, #32
+; CHECK-NEXT: rsb r3, r1, #32
+; CHECK-NEXT: lsr r2, r2, r1
+; CHECK-NEXT: subs r12, r1, #32
; CHECK-NEXT: bic r6, r6, r0
+; CHECK-NEXT: orr r2, r2, r7, lsl r3
; CHECK-NEXT: lsl r5, r9, #1
-; CHECK-NEXT: lsrpl r2, r7, r3
-; CHECK-NEXT: subs r1, r6, #32
+; CHECK-NEXT: lsrpl r2, r7, r12
; CHECK-NEXT: lsl r0, r5, r6
-; CHECK-NEXT: lsl r4, r8, #1
+; CHECK-NEXT: subs r4, r6, #32
+; CHECK-NEXT: lsl r3, r8, #1
; CHECK-NEXT: movwpl r0, #0
-; CHECK-NEXT: orr r4, r4, r9, lsr #31
+; CHECK-NEXT: orr r3, r3, r9, lsr #31
; CHECK-NEXT: orr r0, r0, r2
; CHECK-NEXT: rsb r2, r6, #32
-; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: lsr r1, r7, r1
; CHECK-NEXT: lsr r2, r5, r2
-; CHECK-NEXT: orr r2, r2, r4, lsl r6
-; CHECK-NEXT: lslpl r2, r5, r1
-; CHECK-NEXT: lsr r1, r7, r12
-; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: orr r2, r2, r3, lsl r6
+; CHECK-NEXT: lslpl r2, r5, r4
+; CHECK-NEXT: cmp r12, #0
; CHECK-NEXT: movwpl r1, #0
; CHECK-NEXT: orr r1, r2, r1
; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc}
; BE-LABEL: i56_or:
; BE: @ %bb.0:
; BE-NEXT: mov r1, r0
+; BE-NEXT: ldr r12, [r0]
; BE-NEXT: ldrh r2, [r1, #4]!
; BE-NEXT: ldrb r3, [r1, #2]
; BE-NEXT: orr r2, r3, r2, lsl #8
-; BE-NEXT: ldr r3, [r0]
-; BE-NEXT: orr r2, r2, r3, lsl #24
-; BE-NEXT: orr r12, r2, #384
-; BE-NEXT: strb r12, [r1, #2]
-; BE-NEXT: lsr r2, r12, #8
-; BE-NEXT: strh r2, [r1]
-; BE-NEXT: bic r1, r3, #255
-; BE-NEXT: orr r1, r1, r12, lsr #24
+; BE-NEXT: orr r2, r2, r12, lsl #24
+; BE-NEXT: orr r2, r2, #384
+; BE-NEXT: strb r2, [r1, #2]
+; BE-NEXT: lsr r3, r2, #8
+; BE-NEXT: strh r3, [r1]
+; BE-NEXT: bic r1, r12, #255
+; BE-NEXT: orr r1, r1, r2, lsr #24
; BE-NEXT: str r1, [r0]
; BE-NEXT: mov pc, lr
%aa = load i56, i56* %a
; BE-NEXT: ldrb r3, [r1, #2]
; BE-NEXT: strb r2, [r1, #2]
; BE-NEXT: orr r2, r3, r12, lsl #8
-; BE-NEXT: ldr r3, [r0]
-; BE-NEXT: orr r2, r2, r3, lsl #24
-; BE-NEXT: orr r12, r2, #384
-; BE-NEXT: lsr r2, r12, #8
-; BE-NEXT: strh r2, [r1]
-; BE-NEXT: bic r1, r3, #255
-; BE-NEXT: orr r1, r1, r12, lsr #24
+; BE-NEXT: ldr r12, [r0]
+; BE-NEXT: orr r2, r2, r12, lsl #24
+; BE-NEXT: orr r2, r2, #384
+; BE-NEXT: lsr r3, r2, #8
+; BE-NEXT: strh r3, [r1]
+; BE-NEXT: bic r1, r12, #255
+; BE-NEXT: orr r1, r1, r2, lsr #24
; BE-NEXT: str r1, [r0]
; BE-NEXT: mov pc, lr
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, sp, #8
; CHECK-NEXT: vmov.u16 r1, d0[1]
-; CHECK-NEXT: and r12, r0, #3
+; CHECK-NEXT: and r0, r0, #3
; CHECK-NEXT: vmov.u16 r2, d0[2]
-; CHECK-NEXT: mov r0, sp
-; CHECK-NEXT: vmov.u16 r3, d0[3]
-; CHECK-NEXT: orr r0, r0, r12, lsl #1
+; CHECK-NEXT: mov r3, sp
+; CHECK-NEXT: vmov.u16 r12, d0[3]
+; CHECK-NEXT: orr r0, r3, r0, lsl #1
; CHECK-NEXT: vst1.16 {d0[0]}, [r0:16]
; CHECK-NEXT: vldr d0, [sp]
; CHECK-NEXT: vmov.16 d0[1], r1
; CHECK-NEXT: vmov.16 d0[2], r2
-; CHECK-NEXT: vmov.16 d0[3], r3
+; CHECK-NEXT: vmov.16 d0[3], r12
; CHECK-NEXT: add sp, sp, #8
; CHECK-NEXT: bx lr
%tmp = extractelement <8 x i16> %x, i32 0
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
; MMR3-NEXT: move $8, $7
-; MMR3-NEXT: move $2, $6
-; MMR3-NEXT: sw $5, 0($sp) # 4-byte Folded Spill
-; MMR3-NEXT: sw $4, 12($sp) # 4-byte Folded Spill
+; MMR3-NEXT: sw $6, 32($sp) # 4-byte Folded Spill
+; MMR3-NEXT: sw $5, 36($sp) # 4-byte Folded Spill
+; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill
; MMR3-NEXT: lw $16, 76($sp)
-; MMR3-NEXT: srlv $3, $7, $16
-; MMR3-NEXT: not16 $6, $16
-; MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded Spill
-; MMR3-NEXT: move $4, $2
-; MMR3-NEXT: sw $2, 32($sp) # 4-byte Folded Spill
-; MMR3-NEXT: sll16 $2, $2, 1
-; MMR3-NEXT: sllv $2, $2, $6
-; MMR3-NEXT: li16 $6, 64
-; MMR3-NEXT: or16 $2, $3
-; MMR3-NEXT: srlv $4, $4, $16
-; MMR3-NEXT: sw $4, 16($sp) # 4-byte Folded Spill
-; MMR3-NEXT: subu16 $7, $6, $16
+; MMR3-NEXT: srlv $4, $7, $16
+; MMR3-NEXT: not16 $3, $16
+; MMR3-NEXT: sw $3, 24($sp) # 4-byte Folded Spill
+; MMR3-NEXT: sll16 $2, $6, 1
+; MMR3-NEXT: sllv $3, $2, $3
+; MMR3-NEXT: li16 $2, 64
+; MMR3-NEXT: or16 $3, $4
+; MMR3-NEXT: srlv $6, $6, $16
+; MMR3-NEXT: sw $6, 12($sp) # 4-byte Folded Spill
+; MMR3-NEXT: subu16 $7, $2, $16
; MMR3-NEXT: sllv $9, $5, $7
-; MMR3-NEXT: andi16 $5, $7, 32
-; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill
-; MMR3-NEXT: andi16 $6, $16, 32
-; MMR3-NEXT: sw $6, 36($sp) # 4-byte Folded Spill
-; MMR3-NEXT: move $3, $9
+; MMR3-NEXT: andi16 $2, $7, 32
+; MMR3-NEXT: sw $2, 28($sp) # 4-byte Folded Spill
+; MMR3-NEXT: andi16 $5, $16, 32
+; MMR3-NEXT: sw $5, 16($sp) # 4-byte Folded Spill
+; MMR3-NEXT: move $4, $9
; MMR3-NEXT: li16 $17, 0
-; MMR3-NEXT: movn $3, $17, $5
-; MMR3-NEXT: movn $2, $4, $6
-; MMR3-NEXT: addiu $4, $16, -64
-; MMR3-NEXT: lw $17, 0($sp) # 4-byte Folded Reload
-; MMR3-NEXT: srlv $4, $17, $4
-; MMR3-NEXT: sw $4, 20($sp) # 4-byte Folded Spill
-; MMR3-NEXT: lw $6, 12($sp) # 4-byte Folded Reload
-; MMR3-NEXT: sll16 $4, $6, 1
-; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill
-; MMR3-NEXT: addiu $5, $16, -64
-; MMR3-NEXT: not16 $5, $5
-; MMR3-NEXT: sllv $5, $4, $5
-; MMR3-NEXT: or16 $2, $3
-; MMR3-NEXT: lw $3, 20($sp) # 4-byte Folded Reload
-; MMR3-NEXT: or16 $5, $3
-; MMR3-NEXT: addiu $3, $16, -64
-; MMR3-NEXT: srav $1, $6, $3
-; MMR3-NEXT: andi16 $3, $3, 32
-; MMR3-NEXT: sw $3, 20($sp) # 4-byte Folded Spill
-; MMR3-NEXT: movn $5, $1, $3
-; MMR3-NEXT: sllv $3, $6, $7
-; MMR3-NEXT: sw $3, 4($sp) # 4-byte Folded Spill
-; MMR3-NEXT: not16 $3, $7
-; MMR3-NEXT: srl16 $4, $17, 1
-; MMR3-NEXT: srlv $3, $4, $3
+; MMR3-NEXT: movn $4, $17, $2
+; MMR3-NEXT: movn $3, $6, $5
+; MMR3-NEXT: addiu $2, $16, -64
+; MMR3-NEXT: lw $5, 36($sp) # 4-byte Folded Reload
+; MMR3-NEXT: srlv $5, $5, $2
+; MMR3-NEXT: sw $5, 20($sp) # 4-byte Folded Spill
+; MMR3-NEXT: lw $17, 8($sp) # 4-byte Folded Reload
+; MMR3-NEXT: sll16 $6, $17, 1
+; MMR3-NEXT: sw $6, 4($sp) # 4-byte Folded Spill
+; MMR3-NEXT: not16 $5, $2
+; MMR3-NEXT: sllv $5, $6, $5
+; MMR3-NEXT: or16 $3, $4
+; MMR3-NEXT: lw $4, 20($sp) # 4-byte Folded Reload
+; MMR3-NEXT: or16 $5, $4
+; MMR3-NEXT: srav $1, $17, $2
+; MMR3-NEXT: andi16 $2, $2, 32
+; MMR3-NEXT: sw $2, 20($sp) # 4-byte Folded Spill
+; MMR3-NEXT: movn $5, $1, $2
+; MMR3-NEXT: sllv $2, $17, $7
+; MMR3-NEXT: not16 $4, $7
+; MMR3-NEXT: lw $7, 36($sp) # 4-byte Folded Reload
+; MMR3-NEXT: srl16 $6, $7, 1
+; MMR3-NEXT: srlv $6, $6, $4
; MMR3-NEXT: sltiu $10, $16, 64
-; MMR3-NEXT: movn $5, $2, $10
-; MMR3-NEXT: lw $2, 4($sp) # 4-byte Folded Reload
+; MMR3-NEXT: movn $5, $3, $10
+; MMR3-NEXT: or16 $6, $2
+; MMR3-NEXT: srlv $2, $7, $16
+; MMR3-NEXT: lw $3, 24($sp) # 4-byte Folded Reload
+; MMR3-NEXT: lw $4, 4($sp) # 4-byte Folded Reload
+; MMR3-NEXT: sllv $3, $4, $3
; MMR3-NEXT: or16 $3, $2
-; MMR3-NEXT: srlv $2, $17, $16
-; MMR3-NEXT: lw $4, 24($sp) # 4-byte Folded Reload
-; MMR3-NEXT: lw $7, 8($sp) # 4-byte Folded Reload
-; MMR3-NEXT: sllv $17, $7, $4
-; MMR3-NEXT: or16 $17, $2
-; MMR3-NEXT: srav $11, $6, $16
-; MMR3-NEXT: lw $2, 36($sp) # 4-byte Folded Reload
-; MMR3-NEXT: movn $17, $11, $2
-; MMR3-NEXT: sra $2, $6, 31
+; MMR3-NEXT: srav $11, $17, $16
+; MMR3-NEXT: lw $4, 16($sp) # 4-byte Folded Reload
+; MMR3-NEXT: movn $3, $11, $4
+; MMR3-NEXT: sra $2, $17, 31
; MMR3-NEXT: movz $5, $8, $16
-; MMR3-NEXT: move $4, $2
-; MMR3-NEXT: movn $4, $17, $10
-; MMR3-NEXT: lw $6, 28($sp) # 4-byte Folded Reload
-; MMR3-NEXT: movn $3, $9, $6
-; MMR3-NEXT: lw $6, 36($sp) # 4-byte Folded Reload
-; MMR3-NEXT: li16 $17, 0
-; MMR3-NEXT: lw $7, 16($sp) # 4-byte Folded Reload
-; MMR3-NEXT: movn $7, $17, $6
-; MMR3-NEXT: or16 $7, $3
+; MMR3-NEXT: move $8, $2
+; MMR3-NEXT: movn $8, $3, $10
+; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload
+; MMR3-NEXT: movn $6, $9, $3
+; MMR3-NEXT: li16 $3, 0
+; MMR3-NEXT: lw $7, 12($sp) # 4-byte Folded Reload
+; MMR3-NEXT: movn $7, $3, $4
+; MMR3-NEXT: or16 $7, $6
; MMR3-NEXT: lw $3, 20($sp) # 4-byte Folded Reload
; MMR3-NEXT: movn $1, $2, $3
; MMR3-NEXT: movn $1, $7, $10
; MMR3-NEXT: lw $3, 32($sp) # 4-byte Folded Reload
; MMR3-NEXT: movz $1, $3, $16
-; MMR3-NEXT: movn $11, $2, $6
+; MMR3-NEXT: movn $11, $2, $4
; MMR3-NEXT: movn $2, $11, $10
-; MMR3-NEXT: move $3, $4
+; MMR3-NEXT: move $3, $8
; MMR3-NEXT: move $4, $1
; MMR3-NEXT: lwp $16, 40($sp)
; MMR3-NEXT: addiusp 48
; MMR6-NEXT: sw $16, 8($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 17, -4
; MMR6-NEXT: .cfi_offset 16, -8
-; MMR6-NEXT: move $12, $7
+; MMR6-NEXT: move $1, $7
; MMR6-NEXT: lw $3, 44($sp)
; MMR6-NEXT: li16 $2, 64
-; MMR6-NEXT: subu16 $16, $2, $3
-; MMR6-NEXT: sllv $1, $5, $16
-; MMR6-NEXT: andi16 $2, $16, 32
-; MMR6-NEXT: selnez $8, $1, $2
-; MMR6-NEXT: sllv $9, $4, $16
-; MMR6-NEXT: not16 $16, $16
-; MMR6-NEXT: srl16 $17, $5, 1
-; MMR6-NEXT: srlv $10, $17, $16
-; MMR6-NEXT: or $9, $9, $10
-; MMR6-NEXT: seleqz $9, $9, $2
-; MMR6-NEXT: or $8, $8, $9
-; MMR6-NEXT: srlv $9, $7, $3
-; MMR6-NEXT: not16 $7, $3
-; MMR6-NEXT: sw $7, 4($sp) # 4-byte Folded Spill
+; MMR6-NEXT: subu16 $7, $2, $3
+; MMR6-NEXT: sllv $8, $5, $7
+; MMR6-NEXT: andi16 $2, $7, 32
+; MMR6-NEXT: selnez $9, $8, $2
+; MMR6-NEXT: sllv $10, $4, $7
+; MMR6-NEXT: not16 $7, $7
+; MMR6-NEXT: srl16 $16, $5, 1
+; MMR6-NEXT: srlv $7, $16, $7
+; MMR6-NEXT: or $7, $10, $7
+; MMR6-NEXT: seleqz $7, $7, $2
+; MMR6-NEXT: or $7, $9, $7
+; MMR6-NEXT: srlv $9, $1, $3
+; MMR6-NEXT: not16 $16, $3
+; MMR6-NEXT: sw $16, 4($sp) # 4-byte Folded Spill
; MMR6-NEXT: sll16 $17, $6, 1
-; MMR6-NEXT: sllv $10, $17, $7
+; MMR6-NEXT: sllv $10, $17, $16
; MMR6-NEXT: or $9, $10, $9
; MMR6-NEXT: andi16 $17, $3, 32
; MMR6-NEXT: seleqz $9, $9, $17
; MMR6-NEXT: srlv $10, $6, $3
; MMR6-NEXT: selnez $11, $10, $17
; MMR6-NEXT: seleqz $10, $10, $17
-; MMR6-NEXT: or $8, $10, $8
-; MMR6-NEXT: seleqz $1, $1, $2
-; MMR6-NEXT: or $9, $11, $9
+; MMR6-NEXT: or $10, $10, $7
+; MMR6-NEXT: seleqz $12, $8, $2
+; MMR6-NEXT: or $8, $11, $9
; MMR6-NEXT: addiu $2, $3, -64
-; MMR6-NEXT: srlv $10, $5, $2
+; MMR6-NEXT: srlv $9, $5, $2
; MMR6-NEXT: sll16 $7, $4, 1
; MMR6-NEXT: not16 $16, $2
; MMR6-NEXT: sllv $11, $7, $16
; MMR6-NEXT: sltiu $13, $3, 64
-; MMR6-NEXT: or $1, $9, $1
-; MMR6-NEXT: selnez $8, $8, $13
-; MMR6-NEXT: or $9, $11, $10
-; MMR6-NEXT: srav $10, $4, $2
+; MMR6-NEXT: or $8, $8, $12
+; MMR6-NEXT: selnez $10, $10, $13
+; MMR6-NEXT: or $9, $11, $9
+; MMR6-NEXT: srav $11, $4, $2
; MMR6-NEXT: andi16 $2, $2, 32
-; MMR6-NEXT: seleqz $11, $10, $2
+; MMR6-NEXT: seleqz $12, $11, $2
; MMR6-NEXT: sra $14, $4, 31
; MMR6-NEXT: selnez $15, $14, $2
; MMR6-NEXT: seleqz $9, $9, $2
-; MMR6-NEXT: or $11, $15, $11
-; MMR6-NEXT: seleqz $11, $11, $13
-; MMR6-NEXT: selnez $2, $10, $2
-; MMR6-NEXT: seleqz $10, $14, $13
-; MMR6-NEXT: or $8, $8, $11
-; MMR6-NEXT: selnez $8, $8, $3
-; MMR6-NEXT: selnez $1, $1, $13
+; MMR6-NEXT: or $12, $15, $12
+; MMR6-NEXT: seleqz $12, $12, $13
+; MMR6-NEXT: selnez $2, $11, $2
+; MMR6-NEXT: seleqz $11, $14, $13
+; MMR6-NEXT: or $10, $10, $12
+; MMR6-NEXT: selnez $10, $10, $3
+; MMR6-NEXT: selnez $8, $8, $13
; MMR6-NEXT: or $2, $2, $9
; MMR6-NEXT: srav $9, $4, $3
; MMR6-NEXT: seleqz $4, $9, $17
-; MMR6-NEXT: selnez $11, $14, $17
-; MMR6-NEXT: or $4, $11, $4
-; MMR6-NEXT: selnez $11, $4, $13
+; MMR6-NEXT: selnez $12, $14, $17
+; MMR6-NEXT: or $4, $12, $4
+; MMR6-NEXT: selnez $12, $4, $13
; MMR6-NEXT: seleqz $2, $2, $13
; MMR6-NEXT: seleqz $4, $6, $3
-; MMR6-NEXT: seleqz $6, $12, $3
+; MMR6-NEXT: seleqz $1, $1, $3
+; MMR6-NEXT: or $2, $8, $2
+; MMR6-NEXT: selnez $2, $2, $3
; MMR6-NEXT: or $1, $1, $2
-; MMR6-NEXT: selnez $1, $1, $3
-; MMR6-NEXT: or $1, $6, $1
-; MMR6-NEXT: or $4, $4, $8
-; MMR6-NEXT: or $6, $11, $10
-; MMR6-NEXT: srlv $2, $5, $3
-; MMR6-NEXT: lw $3, 4($sp) # 4-byte Folded Reload
-; MMR6-NEXT: sllv $3, $7, $3
-; MMR6-NEXT: or $2, $3, $2
-; MMR6-NEXT: seleqz $2, $2, $17
-; MMR6-NEXT: selnez $3, $9, $17
-; MMR6-NEXT: or $2, $3, $2
-; MMR6-NEXT: selnez $2, $2, $13
-; MMR6-NEXT: or $3, $2, $10
-; MMR6-NEXT: move $2, $6
+; MMR6-NEXT: or $4, $4, $10
+; MMR6-NEXT: or $2, $12, $11
+; MMR6-NEXT: srlv $3, $5, $3
+; MMR6-NEXT: lw $5, 4($sp) # 4-byte Folded Reload
+; MMR6-NEXT: sllv $5, $7, $5
+; MMR6-NEXT: or $3, $5, $3
+; MMR6-NEXT: seleqz $3, $3, $17
+; MMR6-NEXT: selnez $5, $9, $17
+; MMR6-NEXT: or $3, $5, $3
+; MMR6-NEXT: selnez $3, $3, $13
+; MMR6-NEXT: or $3, $3, $11
; MMR6-NEXT: move $5, $1
; MMR6-NEXT: lw $16, 8($sp) # 4-byte Folded Reload
; MMR6-NEXT: lw $17, 12($sp) # 4-byte Folded Reload
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
; MMR3-NEXT: move $8, $7
-; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill
+; MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded Spill
; MMR3-NEXT: sw $4, 28($sp) # 4-byte Folded Spill
; MMR3-NEXT: lw $16, 68($sp)
; MMR3-NEXT: li16 $2, 64
-; MMR3-NEXT: subu16 $17, $2, $16
-; MMR3-NEXT: sllv $9, $5, $17
-; MMR3-NEXT: andi16 $3, $17, 32
+; MMR3-NEXT: subu16 $7, $2, $16
+; MMR3-NEXT: sllv $9, $5, $7
+; MMR3-NEXT: move $17, $5
+; MMR3-NEXT: sw $5, 0($sp) # 4-byte Folded Spill
+; MMR3-NEXT: andi16 $3, $7, 32
; MMR3-NEXT: sw $3, 20($sp) # 4-byte Folded Spill
; MMR3-NEXT: li16 $2, 0
; MMR3-NEXT: move $4, $9
; MMR3-NEXT: movn $4, $2, $3
-; MMR3-NEXT: srlv $5, $7, $16
+; MMR3-NEXT: srlv $5, $8, $16
; MMR3-NEXT: not16 $3, $16
; MMR3-NEXT: sw $3, 16($sp) # 4-byte Folded Spill
; MMR3-NEXT: sll16 $2, $6, 1
-; MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded Spill
; MMR3-NEXT: sllv $2, $2, $3
; MMR3-NEXT: or16 $2, $5
-; MMR3-NEXT: srlv $7, $6, $16
+; MMR3-NEXT: srlv $5, $6, $16
+; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill
; MMR3-NEXT: andi16 $3, $16, 32
; MMR3-NEXT: sw $3, 12($sp) # 4-byte Folded Spill
-; MMR3-NEXT: movn $2, $7, $3
+; MMR3-NEXT: movn $2, $5, $3
; MMR3-NEXT: addiu $3, $16, -64
; MMR3-NEXT: or16 $2, $4
-; MMR3-NEXT: lw $6, 4($sp) # 4-byte Folded Reload
-; MMR3-NEXT: srlv $3, $6, $3
-; MMR3-NEXT: sw $3, 8($sp) # 4-byte Folded Spill
-; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload
-; MMR3-NEXT: sll16 $4, $3, 1
-; MMR3-NEXT: sw $4, 0($sp) # 4-byte Folded Spill
-; MMR3-NEXT: addiu $5, $16, -64
-; MMR3-NEXT: not16 $5, $5
-; MMR3-NEXT: sllv $5, $4, $5
-; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload
-; MMR3-NEXT: or16 $5, $4
-; MMR3-NEXT: addiu $4, $16, -64
-; MMR3-NEXT: srlv $1, $3, $4
-; MMR3-NEXT: andi16 $4, $4, 32
+; MMR3-NEXT: srlv $4, $17, $3
; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill
-; MMR3-NEXT: movn $5, $1, $4
+; MMR3-NEXT: lw $4, 28($sp) # 4-byte Folded Reload
+; MMR3-NEXT: sll16 $6, $4, 1
+; MMR3-NEXT: not16 $5, $3
+; MMR3-NEXT: sllv $5, $6, $5
+; MMR3-NEXT: lw $17, 8($sp) # 4-byte Folded Reload
+; MMR3-NEXT: or16 $5, $17
+; MMR3-NEXT: srlv $1, $4, $3
+; MMR3-NEXT: andi16 $3, $3, 32
+; MMR3-NEXT: sw $3, 8($sp) # 4-byte Folded Spill
+; MMR3-NEXT: movn $5, $1, $3
; MMR3-NEXT: sltiu $10, $16, 64
; MMR3-NEXT: movn $5, $2, $10
-; MMR3-NEXT: sllv $2, $3, $17
-; MMR3-NEXT: not16 $3, $17
-; MMR3-NEXT: srl16 $4, $6, 1
+; MMR3-NEXT: sllv $2, $4, $7
+; MMR3-NEXT: not16 $3, $7
+; MMR3-NEXT: lw $7, 0($sp) # 4-byte Folded Reload
+; MMR3-NEXT: srl16 $4, $7, 1
; MMR3-NEXT: srlv $4, $4, $3
; MMR3-NEXT: or16 $4, $2
-; MMR3-NEXT: srlv $2, $6, $16
+; MMR3-NEXT: srlv $2, $7, $16
; MMR3-NEXT: lw $3, 16($sp) # 4-byte Folded Reload
-; MMR3-NEXT: lw $6, 0($sp) # 4-byte Folded Reload
; MMR3-NEXT: sllv $3, $6, $3
; MMR3-NEXT: or16 $3, $2
; MMR3-NEXT: lw $2, 28($sp) # 4-byte Folded Reload
; MMR3-NEXT: srlv $2, $2, $16
-; MMR3-NEXT: lw $6, 12($sp) # 4-byte Folded Reload
-; MMR3-NEXT: movn $3, $2, $6
+; MMR3-NEXT: lw $17, 12($sp) # 4-byte Folded Reload
+; MMR3-NEXT: movn $3, $2, $17
; MMR3-NEXT: movz $5, $8, $16
-; MMR3-NEXT: li16 $17, 0
-; MMR3-NEXT: movz $3, $17, $10
-; MMR3-NEXT: lw $17, 20($sp) # 4-byte Folded Reload
-; MMR3-NEXT: movn $4, $9, $17
-; MMR3-NEXT: li16 $17, 0
-; MMR3-NEXT: movn $7, $17, $6
-; MMR3-NEXT: or16 $7, $4
+; MMR3-NEXT: li16 $6, 0
+; MMR3-NEXT: movz $3, $6, $10
+; MMR3-NEXT: lw $7, 20($sp) # 4-byte Folded Reload
+; MMR3-NEXT: movn $4, $9, $7
+; MMR3-NEXT: lw $6, 4($sp) # 4-byte Folded Reload
+; MMR3-NEXT: li16 $7, 0
+; MMR3-NEXT: movn $6, $7, $17
+; MMR3-NEXT: or16 $6, $4
; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload
-; MMR3-NEXT: movn $1, $17, $4
-; MMR3-NEXT: li16 $17, 0
-; MMR3-NEXT: movn $1, $7, $10
+; MMR3-NEXT: movn $1, $7, $4
+; MMR3-NEXT: li16 $7, 0
+; MMR3-NEXT: movn $1, $6, $10
; MMR3-NEXT: lw $4, 24($sp) # 4-byte Folded Reload
; MMR3-NEXT: movz $1, $4, $16
-; MMR3-NEXT: movn $2, $17, $6
+; MMR3-NEXT: movn $2, $7, $17
; MMR3-NEXT: li16 $4, 0
; MMR3-NEXT: movz $2, $4, $10
; MMR3-NEXT: move $4, $1
;
; MMR6-LABEL: lshr_i128:
; MMR6: # %bb.0: # %entry
-; MMR6-NEXT: addiu $sp, $sp, -24
-; MMR6-NEXT: .cfi_def_cfa_offset 24
-; MMR6-NEXT: sw $17, 20($sp) # 4-byte Folded Spill
-; MMR6-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
+; MMR6-NEXT: addiu $sp, $sp, -32
+; MMR6-NEXT: .cfi_def_cfa_offset 32
+; MMR6-NEXT: sw $17, 28($sp) # 4-byte Folded Spill
+; MMR6-NEXT: sw $16, 24($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 17, -4
; MMR6-NEXT: .cfi_offset 16, -8
; MMR6-NEXT: move $1, $7
-; MMR6-NEXT: move $7, $4
-; MMR6-NEXT: lw $3, 52($sp)
+; MMR6-NEXT: move $7, $5
+; MMR6-NEXT: lw $3, 60($sp)
; MMR6-NEXT: srlv $2, $1, $3
-; MMR6-NEXT: not16 $16, $3
-; MMR6-NEXT: sw $16, 8($sp) # 4-byte Folded Spill
-; MMR6-NEXT: move $4, $6
-; MMR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill
+; MMR6-NEXT: not16 $5, $3
+; MMR6-NEXT: sw $5, 12($sp) # 4-byte Folded Spill
+; MMR6-NEXT: move $17, $6
+; MMR6-NEXT: sw $6, 16($sp) # 4-byte Folded Spill
; MMR6-NEXT: sll16 $6, $6, 1
-; MMR6-NEXT: sllv $6, $6, $16
+; MMR6-NEXT: sllv $6, $6, $5
; MMR6-NEXT: or $8, $6, $2
-; MMR6-NEXT: addiu $6, $3, -64
-; MMR6-NEXT: srlv $9, $5, $6
-; MMR6-NEXT: sll16 $2, $7, 1
-; MMR6-NEXT: sw $2, 4($sp) # 4-byte Folded Spill
-; MMR6-NEXT: not16 $16, $6
+; MMR6-NEXT: addiu $5, $3, -64
+; MMR6-NEXT: srlv $9, $7, $5
+; MMR6-NEXT: move $6, $4
+; MMR6-NEXT: sll16 $2, $4, 1
+; MMR6-NEXT: sw $2, 8($sp) # 4-byte Folded Spill
+; MMR6-NEXT: not16 $16, $5
; MMR6-NEXT: sllv $10, $2, $16
; MMR6-NEXT: andi16 $16, $3, 32
; MMR6-NEXT: seleqz $8, $8, $16
; MMR6-NEXT: or $9, $10, $9
-; MMR6-NEXT: srlv $10, $4, $3
+; MMR6-NEXT: srlv $10, $17, $3
; MMR6-NEXT: selnez $11, $10, $16
; MMR6-NEXT: li16 $17, 64
; MMR6-NEXT: subu16 $2, $17, $3
-; MMR6-NEXT: sllv $12, $5, $2
+; MMR6-NEXT: sllv $12, $7, $2
+; MMR6-NEXT: move $17, $7
; MMR6-NEXT: andi16 $4, $2, 32
-; MMR6-NEXT: andi16 $17, $6, 32
-; MMR6-NEXT: seleqz $9, $9, $17
+; MMR6-NEXT: andi16 $7, $5, 32
+; MMR6-NEXT: sw $7, 20($sp) # 4-byte Folded Spill
+; MMR6-NEXT: seleqz $9, $9, $7
; MMR6-NEXT: seleqz $13, $12, $4
; MMR6-NEXT: or $8, $11, $8
; MMR6-NEXT: selnez $11, $12, $4
-; MMR6-NEXT: sllv $12, $7, $2
+; MMR6-NEXT: sllv $12, $6, $2
+; MMR6-NEXT: move $7, $6
+; MMR6-NEXT: sw $6, 4($sp) # 4-byte Folded Spill
; MMR6-NEXT: not16 $2, $2
-; MMR6-NEXT: srl16 $6, $5, 1
+; MMR6-NEXT: srl16 $6, $17, 1
; MMR6-NEXT: srlv $2, $6, $2
; MMR6-NEXT: or $2, $12, $2
; MMR6-NEXT: seleqz $2, $2, $4
-; MMR6-NEXT: addiu $4, $3, -64
-; MMR6-NEXT: srlv $4, $7, $4
-; MMR6-NEXT: or $12, $11, $2
-; MMR6-NEXT: or $6, $8, $13
-; MMR6-NEXT: srlv $5, $5, $3
-; MMR6-NEXT: selnez $8, $4, $17
-; MMR6-NEXT: sltiu $11, $3, 64
-; MMR6-NEXT: selnez $13, $6, $11
-; MMR6-NEXT: or $8, $8, $9
+; MMR6-NEXT: srlv $4, $7, $5
+; MMR6-NEXT: or $11, $11, $2
+; MMR6-NEXT: or $5, $8, $13
+; MMR6-NEXT: srlv $6, $17, $3
+; MMR6-NEXT: lw $2, 20($sp) # 4-byte Folded Reload
+; MMR6-NEXT: selnez $7, $4, $2
+; MMR6-NEXT: sltiu $8, $3, 64
+; MMR6-NEXT: selnez $12, $5, $8
+; MMR6-NEXT: or $7, $7, $9
+; MMR6-NEXT: lw $5, 12($sp) # 4-byte Folded Reload
; MMR6-NEXT: lw $2, 8($sp) # 4-byte Folded Reload
-; MMR6-NEXT: lw $6, 4($sp) # 4-byte Folded Reload
-; MMR6-NEXT: sllv $9, $6, $2
+; MMR6-NEXT: sllv $9, $2, $5
; MMR6-NEXT: seleqz $10, $10, $16
-; MMR6-NEXT: li16 $2, 0
-; MMR6-NEXT: or $10, $10, $12
-; MMR6-NEXT: or $9, $9, $5
-; MMR6-NEXT: seleqz $5, $8, $11
-; MMR6-NEXT: seleqz $8, $2, $11
-; MMR6-NEXT: srlv $7, $7, $3
-; MMR6-NEXT: seleqz $2, $7, $16
-; MMR6-NEXT: selnez $2, $2, $11
+; MMR6-NEXT: li16 $5, 0
+; MMR6-NEXT: or $10, $10, $11
+; MMR6-NEXT: or $6, $9, $6
+; MMR6-NEXT: seleqz $2, $7, $8
+; MMR6-NEXT: seleqz $7, $5, $8
+; MMR6-NEXT: lw $5, 4($sp) # 4-byte Folded Reload
+; MMR6-NEXT: srlv $9, $5, $3
+; MMR6-NEXT: seleqz $11, $9, $16
+; MMR6-NEXT: selnez $11, $11, $8
; MMR6-NEXT: seleqz $1, $1, $3
-; MMR6-NEXT: or $5, $13, $5
-; MMR6-NEXT: selnez $5, $5, $3
-; MMR6-NEXT: or $5, $1, $5
-; MMR6-NEXT: or $2, $8, $2
-; MMR6-NEXT: seleqz $1, $9, $16
-; MMR6-NEXT: selnez $6, $7, $16
-; MMR6-NEXT: lw $7, 12($sp) # 4-byte Folded Reload
-; MMR6-NEXT: seleqz $7, $7, $3
-; MMR6-NEXT: selnez $9, $10, $11
-; MMR6-NEXT: seleqz $4, $4, $17
-; MMR6-NEXT: seleqz $4, $4, $11
-; MMR6-NEXT: or $4, $9, $4
+; MMR6-NEXT: or $2, $12, $2
+; MMR6-NEXT: selnez $2, $2, $3
+; MMR6-NEXT: or $5, $1, $2
+; MMR6-NEXT: or $2, $7, $11
+; MMR6-NEXT: seleqz $1, $6, $16
+; MMR6-NEXT: selnez $6, $9, $16
+; MMR6-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
+; MMR6-NEXT: seleqz $9, $16, $3
+; MMR6-NEXT: selnez $10, $10, $8
+; MMR6-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
+; MMR6-NEXT: seleqz $4, $4, $16
+; MMR6-NEXT: seleqz $4, $4, $8
+; MMR6-NEXT: or $4, $10, $4
; MMR6-NEXT: selnez $3, $4, $3
-; MMR6-NEXT: or $4, $7, $3
+; MMR6-NEXT: or $4, $9, $3
; MMR6-NEXT: or $1, $6, $1
-; MMR6-NEXT: selnez $1, $1, $11
-; MMR6-NEXT: or $3, $8, $1
-; MMR6-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
-; MMR6-NEXT: lw $17, 20($sp) # 4-byte Folded Reload
-; MMR6-NEXT: addiu $sp, $sp, 24
+; MMR6-NEXT: selnez $1, $1, $8
+; MMR6-NEXT: or $3, $7, $1
+; MMR6-NEXT: lw $16, 24($sp) # 4-byte Folded Reload
+; MMR6-NEXT: lw $17, 28($sp) # 4-byte Folded Reload
+; MMR6-NEXT: addiu $sp, $sp, 32
; MMR6-NEXT: jrc $ra
entry:
; MMR3-NEXT: swp $16, 32($sp)
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
-; MMR3-NEXT: sw $7, 8($sp) # 4-byte Folded Spill
-; MMR3-NEXT: move $17, $6
-; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill
+; MMR3-NEXT: move $17, $7
+; MMR3-NEXT: sw $7, 4($sp) # 4-byte Folded Spill
+; MMR3-NEXT: move $7, $6
; MMR3-NEXT: move $1, $4
; MMR3-NEXT: lw $16, 68($sp)
; MMR3-NEXT: li16 $2, 64
; MMR3-NEXT: subu16 $6, $2, $16
-; MMR3-NEXT: srlv $9, $17, $6
-; MMR3-NEXT: andi16 $7, $6, 32
-; MMR3-NEXT: sw $7, 24($sp) # 4-byte Folded Spill
+; MMR3-NEXT: srlv $9, $7, $6
+; MMR3-NEXT: andi16 $4, $6, 32
+; MMR3-NEXT: sw $4, 24($sp) # 4-byte Folded Spill
; MMR3-NEXT: li16 $3, 0
-; MMR3-NEXT: move $4, $9
-; MMR3-NEXT: movn $4, $3, $7
-; MMR3-NEXT: sllv $7, $1, $16
-; MMR3-NEXT: not16 $2, $16
-; MMR3-NEXT: sw $2, 20($sp) # 4-byte Folded Spill
+; MMR3-NEXT: move $2, $9
+; MMR3-NEXT: movn $2, $3, $4
+; MMR3-NEXT: sllv $3, $1, $16
+; MMR3-NEXT: sw $3, 16($sp) # 4-byte Folded Spill
+; MMR3-NEXT: not16 $4, $16
+; MMR3-NEXT: sw $4, 20($sp) # 4-byte Folded Spill
+; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill
; MMR3-NEXT: srl16 $3, $5, 1
-; MMR3-NEXT: srlv $3, $3, $2
-; MMR3-NEXT: or16 $3, $7
-; MMR3-NEXT: sllv $5, $5, $16
-; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill
-; MMR3-NEXT: andi16 $2, $16, 32
-; MMR3-NEXT: sw $2, 16($sp) # 4-byte Folded Spill
-; MMR3-NEXT: movn $3, $5, $2
-; MMR3-NEXT: addiu $7, $16, -64
+; MMR3-NEXT: srlv $3, $3, $4
+; MMR3-NEXT: lw $4, 16($sp) # 4-byte Folded Reload
; MMR3-NEXT: or16 $3, $4
-; MMR3-NEXT: sllv $2, $17, $7
+; MMR3-NEXT: sllv $5, $5, $16
+; MMR3-NEXT: sw $5, 8($sp) # 4-byte Folded Spill
+; MMR3-NEXT: andi16 $4, $16, 32
+; MMR3-NEXT: sw $4, 16($sp) # 4-byte Folded Spill
+; MMR3-NEXT: movn $3, $5, $4
+; MMR3-NEXT: addiu $4, $16, -64
+; MMR3-NEXT: or16 $3, $2
+; MMR3-NEXT: sllv $2, $7, $4
; MMR3-NEXT: sw $2, 12($sp) # 4-byte Folded Spill
-; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload
-; MMR3-NEXT: srl16 $5, $4, 1
-; MMR3-NEXT: not16 $2, $7
+; MMR3-NEXT: srl16 $5, $17, 1
+; MMR3-NEXT: not16 $2, $4
; MMR3-NEXT: srlv $2, $5, $2
-; MMR3-NEXT: lw $7, 12($sp) # 4-byte Folded Reload
-; MMR3-NEXT: or16 $2, $7
-; MMR3-NEXT: addiu $7, $16, -64
-; MMR3-NEXT: sllv $8, $4, $7
-; MMR3-NEXT: andi16 $7, $7, 32
-; MMR3-NEXT: sw $7, 12($sp) # 4-byte Folded Spill
-; MMR3-NEXT: movn $2, $8, $7
+; MMR3-NEXT: lw $17, 12($sp) # 4-byte Folded Reload
+; MMR3-NEXT: or16 $2, $17
+; MMR3-NEXT: lw $17, 4($sp) # 4-byte Folded Reload
+; MMR3-NEXT: sllv $8, $17, $4
+; MMR3-NEXT: andi16 $4, $4, 32
+; MMR3-NEXT: sw $4, 12($sp) # 4-byte Folded Spill
+; MMR3-NEXT: movn $2, $8, $4
; MMR3-NEXT: sltiu $10, $16, 64
; MMR3-NEXT: movn $2, $3, $10
-; MMR3-NEXT: srlv $3, $4, $6
-; MMR3-NEXT: sw $3, 0($sp) # 4-byte Folded Spill
-; MMR3-NEXT: move $7, $4
+; MMR3-NEXT: srlv $4, $17, $6
; MMR3-NEXT: not16 $3, $6
-; MMR3-NEXT: sll16 $4, $17, 1
-; MMR3-NEXT: sllv $3, $4, $3
-; MMR3-NEXT: lw $4, 0($sp) # 4-byte Folded Reload
+; MMR3-NEXT: sll16 $6, $7, 1
+; MMR3-NEXT: sllv $3, $6, $3
; MMR3-NEXT: or16 $3, $4
-; MMR3-NEXT: sllv $6, $17, $16
+; MMR3-NEXT: sllv $6, $7, $16
; MMR3-NEXT: lw $4, 20($sp) # 4-byte Folded Reload
; MMR3-NEXT: srlv $4, $5, $4
; MMR3-NEXT: or16 $4, $6
-; MMR3-NEXT: sllv $6, $7, $16
-; MMR3-NEXT: lw $7, 16($sp) # 4-byte Folded Reload
-; MMR3-NEXT: movn $4, $6, $7
+; MMR3-NEXT: sllv $6, $17, $16
+; MMR3-NEXT: lw $17, 16($sp) # 4-byte Folded Reload
+; MMR3-NEXT: movn $4, $6, $17
; MMR3-NEXT: movz $2, $1, $16
; MMR3-NEXT: li16 $5, 0
; MMR3-NEXT: movz $4, $5, $10
-; MMR3-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; MMR3-NEXT: movn $3, $9, $17
-; MMR3-NEXT: lw $5, 4($sp) # 4-byte Folded Reload
-; MMR3-NEXT: li16 $17, 0
-; MMR3-NEXT: movn $5, $17, $7
+; MMR3-NEXT: lw $7, 24($sp) # 4-byte Folded Reload
+; MMR3-NEXT: movn $3, $9, $7
+; MMR3-NEXT: lw $5, 8($sp) # 4-byte Folded Reload
+; MMR3-NEXT: li16 $7, 0
+; MMR3-NEXT: movn $5, $7, $17
; MMR3-NEXT: or16 $5, $3
; MMR3-NEXT: lw $3, 12($sp) # 4-byte Folded Reload
-; MMR3-NEXT: movn $8, $17, $3
-; MMR3-NEXT: li16 $17, 0
+; MMR3-NEXT: movn $8, $7, $3
+; MMR3-NEXT: li16 $7, 0
; MMR3-NEXT: movn $8, $5, $10
; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload
; MMR3-NEXT: movz $8, $3, $16
-; MMR3-NEXT: movn $6, $17, $7
+; MMR3-NEXT: movn $6, $7, $17
; MMR3-NEXT: li16 $3, 0
; MMR3-NEXT: movz $6, $3, $10
; MMR3-NEXT: move $3, $8
; MMR3: lw $[[T20:[0-9]+]], 0($sp)
; MMR3: subu16 $5, $[[T19]], $[[T20]]
-; MMR6: sw $7, 4($sp)
-; MMR6: sw $4, 8($sp)
+; MMR6: move $[[T0:[0-9]+]], $7
+; MMR6: sw $7, 8($sp)
+; MMR6: move $[[T1:[0-9]+]], $5
+; MMR6: sw $4, 12($sp)
; MMR6: lw $[[T2:[0-9]+]], 48($sp)
; MMR6: sltu $[[T3:[0-9]+]], $6, $[[T2]]
; MMR6: xor $[[T4:[0-9]+]], $6, $[[T2]]
; MMR6: sltiu $[[T5:[0-9]+]], $[[T4]], 1
; MMR6: seleqz $[[T6:[0-9]+]], $[[T3]], $[[T5]]
; MMR6: lw $[[T7:[0-9]+]], 52($sp)
-; MMR6: sltu $[[T8:[0-9]+]], $7, $[[T7]]
+; MMR6: sltu $[[T8:[0-9]+]], $[[T0]], $[[T7]]
; MMR6: selnez $[[T9:[0-9]+]], $[[T8]], $[[T5]]
; MMR6: or $[[T10:[0-9]+]], $[[T9]], $[[T6]]
; MMR6: lw $[[T11:[0-9]+]], 44($sp)
-; MMR6: subu16 $[[T12:[0-9]+]], $5, $[[T11]]
-; MMR6: lw $[[T1:[0-9]+]], 12($sp)
-; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T1]]
-; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T1]]
-; MMR6: sltu $[[T17:[0-9]+]], $5, $[[T11]]
-; MMR6: lw $[[T19:[0-9]+]], 8($sp)
-; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $5
+; MMR6: subu16 $[[T12:[0-9]+]], $[[T1]], $[[T11]]
+; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T7]]
+; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T7]]
+; MMR6: sltu $[[T17:[0-9]+]], $[[T1]], $[[T11]]
+; MMR6: lw $[[T18:[0-9]+]], 40($sp)
+; MMR6: lw $[[T19:[0-9]+]], 12($sp)
+; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $[[T18]]
; MMR6: subu16 $[[T21:[0-9]+]], $[[T20]], $[[T17]]
; MMR6: subu16 $[[T22:[0-9]+]], $[[T21]], $[[T16]]
; MMR6: subu16 $[[T23:[0-9]+]], $6, $[[T2]]
-; MMR6: subu16 $4, $[[T23]], $[[T8]]
-; MMR6: lw $[[T24:[0-9]+]], 4($sp)
-; MMR6: subu16 $5, $[[T24]], $[[T7]]
-; MMR6: lw $3, 0($sp)
+; MMR6: subu16 $4, $[[T23]], $5
+; MMR6: lw $[[T24:[0-9]+]], 8($sp)
+; MMR6: lw $[[T25:[0-9]+]], 0($sp)
+; MMR6: subu16 $5, $[[T24]], $[[T25]]
+; MMR6: lw $3, 4($sp)
; FIXME: The sltu, dsll, dsrl pattern here occurs when an i32 is zero
; extended to 64 bits. Fortunately slt(i)(u) actually gives an i1.
entry:
; PIC32-LABEL: f3:
; PIC32: addu $[[R0:[a-z0-9]+]], $2, $25
-; PIC32: lw $25, %call16(__tls_get_addr)($[[R0]])
; PIC32: addiu $4, $[[R0]], %tlsldm(f3.i)
+; PIC32: lw $25, %call16(__tls_get_addr)($[[R0]])
; PIC32: jalr $25
; PIC32: lui $[[R0:[0-9]+]], %dtprel_hi(f3.i)
; PIC32: addu $[[R1:[0-9]+]], $[[R0]], $2
; PIC64: lui $[[R0:[a-z0-9]+]], %hi(%neg(%gp_rel(f3)))
; PIC64: daddu $[[R0]], $[[R0]], $25
; PIC64: daddiu $[[R1:[a-z0-9]+]], $[[R0]], %lo(%neg(%gp_rel(f3)))
-; PIC64: ld $25, %call16(__tls_get_addr)($[[R1]])
; PIC64: daddiu $4, $[[R1]], %tlsldm(f3.i)
+; PIC64: ld $25, %call16(__tls_get_addr)($[[R1]])
; PIC64: jalr $25
; PIC64: lui $[[R0:[0-9]+]], %dtprel_hi(f3.i)
; PIC64: daddu $[[R1:[0-9]+]], $[[R0]], $2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB100_2
; RV32I-NEXT: .LBB100_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB100_2 Depth=1
; RV32I-NEXT: sh a1, 10(sp)
; RV32I-NEXT: addi a1, sp, 10
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: mv a3, zero
; RV32I-NEXT: mv a4, zero
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: bnez a0, .LBB100_4
; RV32I-NEXT: .LBB100_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bltu s0, a0, .LBB100_1
+; RV32I-NEXT: bltu s1, a0, .LBB100_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB100_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB101_2
; RV32I-NEXT: .LBB101_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB101_2 Depth=1
; RV32I-NEXT: addi a1, sp, 10
; RV32I-NEXT: addi a3, zero, 2
; RV32I-NEXT: addi a4, zero, 2
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: lh a1, 10(sp)
; RV32I-NEXT: bnez a0, .LBB101_4
; RV32I-NEXT: .LBB101_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bltu s0, a0, .LBB101_1
+; RV32I-NEXT: bltu s1, a0, .LBB101_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB101_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB102_2
; RV32I-NEXT: .LBB102_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB102_2 Depth=1
; RV32I-NEXT: sh a1, 10(sp)
; RV32I-NEXT: addi a1, sp, 10
; RV32I-NEXT: addi a3, zero, 3
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: mv a4, zero
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: lh a1, 10(sp)
; RV32I-NEXT: bnez a0, .LBB102_4
; RV32I-NEXT: .LBB102_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bltu s0, a0, .LBB102_1
+; RV32I-NEXT: bltu s1, a0, .LBB102_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB102_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB103_2
; RV32I-NEXT: .LBB103_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB103_2 Depth=1
; RV32I-NEXT: addi a1, sp, 10
; RV32I-NEXT: addi a3, zero, 4
; RV32I-NEXT: addi a4, zero, 2
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: lh a1, 10(sp)
; RV32I-NEXT: bnez a0, .LBB103_4
; RV32I-NEXT: .LBB103_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bltu s0, a0, .LBB103_1
+; RV32I-NEXT: bltu s1, a0, .LBB103_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB103_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB104_2
; RV32I-NEXT: .LBB104_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB104_2 Depth=1
; RV32I-NEXT: addi a1, sp, 10
; RV32I-NEXT: addi a3, zero, 5
; RV32I-NEXT: addi a4, zero, 5
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: lh a1, 10(sp)
; RV32I-NEXT: bnez a0, .LBB104_4
; RV32I-NEXT: .LBB104_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bltu s0, a0, .LBB104_1
+; RV32I-NEXT: bltu s1, a0, .LBB104_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB104_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB105_2
; RV32I-NEXT: .LBB105_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB105_2 Depth=1
; RV32I-NEXT: sh a1, 10(sp)
; RV32I-NEXT: addi a1, sp, 10
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: mv a3, zero
; RV32I-NEXT: mv a4, zero
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: bnez a0, .LBB105_4
; RV32I-NEXT: .LBB105_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB105_1
+; RV32I-NEXT: bgeu s1, a0, .LBB105_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB105_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB106_2
; RV32I-NEXT: .LBB106_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB106_2 Depth=1
; RV32I-NEXT: addi a1, sp, 10
; RV32I-NEXT: addi a3, zero, 2
; RV32I-NEXT: addi a4, zero, 2
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: lh a1, 10(sp)
; RV32I-NEXT: bnez a0, .LBB106_4
; RV32I-NEXT: .LBB106_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB106_1
+; RV32I-NEXT: bgeu s1, a0, .LBB106_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB106_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB107_2
; RV32I-NEXT: .LBB107_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB107_2 Depth=1
; RV32I-NEXT: sh a1, 10(sp)
; RV32I-NEXT: addi a1, sp, 10
; RV32I-NEXT: addi a3, zero, 3
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: mv a4, zero
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: lh a1, 10(sp)
; RV32I-NEXT: bnez a0, .LBB107_4
; RV32I-NEXT: .LBB107_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB107_1
+; RV32I-NEXT: bgeu s1, a0, .LBB107_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB107_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB108_2
; RV32I-NEXT: .LBB108_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB108_2 Depth=1
; RV32I-NEXT: addi a1, sp, 10
; RV32I-NEXT: addi a3, zero, 4
; RV32I-NEXT: addi a4, zero, 2
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: lh a1, 10(sp)
; RV32I-NEXT: bnez a0, .LBB108_4
; RV32I-NEXT: .LBB108_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB108_1
+; RV32I-NEXT: bgeu s1, a0, .LBB108_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB108_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB109_2
; RV32I-NEXT: .LBB109_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB109_2 Depth=1
; RV32I-NEXT: addi a1, sp, 10
; RV32I-NEXT: addi a3, zero, 5
; RV32I-NEXT: addi a4, zero, 5
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: lh a1, 10(sp)
; RV32I-NEXT: bnez a0, .LBB109_4
; RV32I-NEXT: .LBB109_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB109_1
+; RV32I-NEXT: bgeu s1, a0, .LBB109_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB109_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB23_2
; RV32I-NEXT: .LBB23_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB23_2 Depth=1
; RV32I-NEXT: sh a1, 10(sp)
; RV32I-NEXT: addi a1, sp, 10
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: mv a3, zero
; RV32I-NEXT: mv a4, zero
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: bnez a0, .LBB23_4
; RV32I-NEXT: .LBB23_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bltu s0, a0, .LBB23_1
+; RV32I-NEXT: bltu s1, a0, .LBB23_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB23_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: lhu a1, 0(a0)
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
; RV32I-NEXT: j .LBB24_2
; RV32I-NEXT: .LBB24_1: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB24_2 Depth=1
; RV32I-NEXT: sh a1, 10(sp)
; RV32I-NEXT: addi a1, sp, 10
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: mv a3, zero
; RV32I-NEXT: mv a4, zero
; RV32I-NEXT: call __atomic_compare_exchange_2@plt
; RV32I-NEXT: bnez a0, .LBB24_4
; RV32I-NEXT: .LBB24_2: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB24_1
+; RV32I-NEXT: bgeu s1, a0, .LBB24_1
; RV32I-NEXT: # %bb.3: # %atomicrmw.start
; RV32I-NEXT: # in Loop: Header=BB24_2 Depth=1
; RV32I-NEXT: mv a2, s2
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s1, a1
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: not a1, s0
+; RV32I-NEXT: not a1, s4
; RV32I-NEXT: and a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: addi s5, a2, 1365
+; RV32I-NEXT: and a1, a1, s5
; RV32I-NEXT: sub a0, a0, a1
; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s5, a1, 819
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: addi s6, a1, -241
; RV32I-NEXT: and a0, a0, s6
; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: addi a0, s1, -1
-; RV32I-NEXT: not a1, s1
+; RV32I-NEXT: addi a0, s3, -1
+; RV32I-NEXT: not a1, s3
; RV32I-NEXT: and a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, s5
; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
-; RV32I-NEXT: bnez s0, .LBB7_2
+; RV32I-NEXT: bnez s4, .LBB7_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: srli a0, a0, 24
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s1, a1
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: not a1, s0
+; RV32I-NEXT: not a1, s4
; RV32I-NEXT: and a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: addi s5, a2, 1365
+; RV32I-NEXT: and a1, a1, s5
; RV32I-NEXT: sub a0, a0, a1
; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s5, a1, 819
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: addi s6, a1, -241
; RV32I-NEXT: and a0, a0, s6
; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: addi a0, s1, -1
-; RV32I-NEXT: not a1, s1
+; RV32I-NEXT: addi a0, s3, -1
+; RV32I-NEXT: not a1, s3
; RV32I-NEXT: and a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, s5
; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
-; RV32I-NEXT: bnez s0, .LBB11_2
+; RV32I-NEXT: bnez s4, .LBB11_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: srli a0, a0, 24
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s2, a2, 1365
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: addi s3, a2, 1365
+; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: sub a0, a1, a0
; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s1, a1, 819
-; RV32I-NEXT: and a1, a0, s1
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s1
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: addi s4, a1, -241
; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
; RV32I-NEXT: srli s5, a0, 24
-; RV32I-NEXT: srli a0, s0, 1
-; RV32I-NEXT: and a0, a0, s2
-; RV32I-NEXT: sub a0, s0, a0
-; RV32I-NEXT: and a1, a0, s1
+; RV32I-NEXT: srli a0, s2, 1
+; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: sub a0, s2, a0
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s1
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: and a0, a0, s4
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
; RV32I-NEXT: srli a0, a0, 24
; RV32I-NEXT: add a0, a0, s5
; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s2, a1, -1
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: addi s1, a1, -1
+; RV32I-NEXT: and a0, a0, s1
; RV32I-NEXT: call __gnu_h2f_ieee@plt
-; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s2
+; RV32I-NEXT: mv s2, a0
+; RV32I-NEXT: and a0, s0, s1
; RV32I-NEXT: call __gnu_h2f_ieee@plt
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s2
; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: call __addsf3@plt
; RV32I-NEXT: call __gnu_f2h_ieee@plt
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: and a0, a0, s1
; RV32I-NEXT: call __gnu_h2f_ieee@plt
; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: call __divsf3@plt
define i64 @rol_i64(i64 %a, i64 %b) nounwind {
; RV32I-LABEL: rol_i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: mv t1, a1
+; RV32I-NEXT: mv a7, a1
; RV32I-NEXT: andi a1, a2, 63
-; RV32I-NEXT: addi a7, a1, -32
+; RV32I-NEXT: addi t0, a1, -32
; RV32I-NEXT: addi a6, zero, 31
-; RV32I-NEXT: bltz a7, .LBB7_2
+; RV32I-NEXT: bltz t0, .LBB7_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sll a1, a0, a7
+; RV32I-NEXT: sll a1, a0, t0
; RV32I-NEXT: j .LBB7_3
; RV32I-NEXT: .LBB7_2:
-; RV32I-NEXT: sll a4, t1, a2
+; RV32I-NEXT: sll a3, a7, a2
; RV32I-NEXT: sub a1, a6, a1
-; RV32I-NEXT: srli a5, a0, 1
-; RV32I-NEXT: srl a1, a5, a1
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: srli a4, a0, 1
+; RV32I-NEXT: srl a1, a4, a1
+; RV32I-NEXT: or a1, a3, a1
; RV32I-NEXT: .LBB7_3:
; RV32I-NEXT: neg a5, a2
-; RV32I-NEXT: andi a4, a5, 63
-; RV32I-NEXT: addi t0, a4, -32
-; RV32I-NEXT: bltz t0, .LBB7_5
+; RV32I-NEXT: andi a3, a5, 63
+; RV32I-NEXT: addi a4, a3, -32
+; RV32I-NEXT: bltz a4, .LBB7_5
; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: srl a3, t1, t0
-; RV32I-NEXT: bltz a7, .LBB7_6
+; RV32I-NEXT: srl a3, a7, a4
+; RV32I-NEXT: bltz t0, .LBB7_6
; RV32I-NEXT: j .LBB7_7
; RV32I-NEXT: .LBB7_5:
-; RV32I-NEXT: srl a3, t1, a5
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: srl a3, a0, a5
-; RV32I-NEXT: sub a4, a6, a4
-; RV32I-NEXT: slli a5, t1, 1
-; RV32I-NEXT: sll a4, a5, a4
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: bgez a7, .LBB7_7
+; RV32I-NEXT: srl a4, a7, a5
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: srl a4, a0, a5
+; RV32I-NEXT: sub a3, a6, a3
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: sll a3, a5, a3
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: bgez t0, .LBB7_7
; RV32I-NEXT: .LBB7_6:
; RV32I-NEXT: sll a0, a0, a2
; RV32I-NEXT: or a3, a3, a0
; RV32I-NEXT: mv a0, a3
; RV32I-NEXT: ret
;
-; RV32IB-LABEL: rol_i64:
-; RV32IB: # %bb.0:
-; RV32IB-NEXT: sll a7, a1, a2
-; RV32IB-NEXT: andi a4, a2, 63
-; RV32IB-NEXT: addi a6, zero, 31
-; RV32IB-NEXT: sub a5, a6, a4
-; RV32IB-NEXT: srli a3, a0, 1
-; RV32IB-NEXT: srl a3, a3, a5
-; RV32IB-NEXT: or a3, a7, a3
-; RV32IB-NEXT: addi a7, a4, -32
-; RV32IB-NEXT: sll a5, a0, a7
-; RV32IB-NEXT: slti a4, a7, 0
-; RV32IB-NEXT: cmov t0, a4, a3, a5
-; RV32IB-NEXT: neg a4, a2
-; RV32IB-NEXT: srl t2, a1, a4
-; RV32IB-NEXT: andi a3, a4, 63
-; RV32IB-NEXT: addi t1, a3, -32
-; RV32IB-NEXT: srai a5, t1, 31
-; RV32IB-NEXT: and a5, a5, t2
-; RV32IB-NEXT: or t0, t0, a5
-; RV32IB-NEXT: srl a4, a0, a4
-; RV32IB-NEXT: sub a3, a6, a3
-; RV32IB-NEXT: slli a5, a1, 1
-; RV32IB-NEXT: sll a3, a5, a3
-; RV32IB-NEXT: or a3, a4, a3
-; RV32IB-NEXT: srl a1, a1, t1
-; RV32IB-NEXT: slti a4, t1, 0
-; RV32IB-NEXT: cmov a1, a4, a3, a1
-; RV32IB-NEXT: sll a0, a0, a2
-; RV32IB-NEXT: srai a2, a7, 31
-; RV32IB-NEXT: and a0, a2, a0
-; RV32IB-NEXT: or a0, a0, a1
-; RV32IB-NEXT: mv a1, t0
-; RV32IB-NEXT: ret
-;
-; RV32IBB-LABEL: rol_i64:
-; RV32IBB: # %bb.0:
-; RV32IBB-NEXT: mv t1, a1
-; RV32IBB-NEXT: andi a1, a2, 63
-; RV32IBB-NEXT: addi a7, a1, -32
-; RV32IBB-NEXT: addi a6, zero, 31
-; RV32IBB-NEXT: bltz a7, .LBB7_2
-; RV32IBB-NEXT: # %bb.1:
-; RV32IBB-NEXT: sll a1, a0, a7
-; RV32IBB-NEXT: j .LBB7_3
-; RV32IBB-NEXT: .LBB7_2:
-; RV32IBB-NEXT: sll a4, t1, a2
-; RV32IBB-NEXT: sub a1, a6, a1
-; RV32IBB-NEXT: srli a5, a0, 1
-; RV32IBB-NEXT: srl a1, a5, a1
-; RV32IBB-NEXT: or a1, a4, a1
-; RV32IBB-NEXT: .LBB7_3:
-; RV32IBB-NEXT: neg a5, a2
-; RV32IBB-NEXT: andi a4, a5, 63
-; RV32IBB-NEXT: addi t0, a4, -32
-; RV32IBB-NEXT: bltz t0, .LBB7_5
-; RV32IBB-NEXT: # %bb.4:
-; RV32IBB-NEXT: srl a3, t1, t0
-; RV32IBB-NEXT: bltz a7, .LBB7_6
-; RV32IBB-NEXT: j .LBB7_7
-; RV32IBB-NEXT: .LBB7_5:
-; RV32IBB-NEXT: srl a3, t1, a5
-; RV32IBB-NEXT: or a1, a1, a3
-; RV32IBB-NEXT: srl a3, a0, a5
-; RV32IBB-NEXT: sub a4, a6, a4
-; RV32IBB-NEXT: slli a5, t1, 1
-; RV32IBB-NEXT: sll a4, a5, a4
-; RV32IBB-NEXT: or a3, a3, a4
-; RV32IBB-NEXT: bgez a7, .LBB7_7
-; RV32IBB-NEXT: .LBB7_6:
-; RV32IBB-NEXT: sll a0, a0, a2
-; RV32IBB-NEXT: or a3, a3, a0
-; RV32IBB-NEXT: .LBB7_7:
-; RV32IBB-NEXT: mv a0, a3
-; RV32IBB-NEXT: ret
-;
-; RV32IBP-LABEL: rol_i64:
-; RV32IBP: # %bb.0:
-; RV32IBP-NEXT: mv t1, a1
-; RV32IBP-NEXT: andi a1, a2, 63
-; RV32IBP-NEXT: addi a7, a1, -32
-; RV32IBP-NEXT: addi a6, zero, 31
-; RV32IBP-NEXT: bltz a7, .LBB7_2
-; RV32IBP-NEXT: # %bb.1:
-; RV32IBP-NEXT: sll a1, a0, a7
-; RV32IBP-NEXT: j .LBB7_3
-; RV32IBP-NEXT: .LBB7_2:
-; RV32IBP-NEXT: sll a4, t1, a2
-; RV32IBP-NEXT: sub a1, a6, a1
-; RV32IBP-NEXT: srli a5, a0, 1
-; RV32IBP-NEXT: srl a1, a5, a1
-; RV32IBP-NEXT: or a1, a4, a1
-; RV32IBP-NEXT: .LBB7_3:
-; RV32IBP-NEXT: neg a5, a2
-; RV32IBP-NEXT: andi a4, a5, 63
-; RV32IBP-NEXT: addi t0, a4, -32
-; RV32IBP-NEXT: bltz t0, .LBB7_5
-; RV32IBP-NEXT: # %bb.4:
-; RV32IBP-NEXT: srl a3, t1, t0
-; RV32IBP-NEXT: bltz a7, .LBB7_6
-; RV32IBP-NEXT: j .LBB7_7
-; RV32IBP-NEXT: .LBB7_5:
-; RV32IBP-NEXT: srl a3, t1, a5
-; RV32IBP-NEXT: or a1, a1, a3
-; RV32IBP-NEXT: srl a3, a0, a5
-; RV32IBP-NEXT: sub a4, a6, a4
-; RV32IBP-NEXT: slli a5, t1, 1
-; RV32IBP-NEXT: sll a4, a5, a4
-; RV32IBP-NEXT: or a3, a3, a4
-; RV32IBP-NEXT: bgez a7, .LBB7_7
-; RV32IBP-NEXT: .LBB7_6:
-; RV32IBP-NEXT: sll a0, a0, a2
-; RV32IBP-NEXT: or a3, a3, a0
-; RV32IBP-NEXT: .LBB7_7:
-; RV32IBP-NEXT: mv a0, a3
-; RV32IBP-NEXT: ret
+; RV32B-LABEL: rol_i64:
+; RV32B: # %bb.0:
+; RV32B-NEXT: sll a7, a1, a2
+; RV32B-NEXT: andi a4, a2, 63
+; RV32B-NEXT: addi a6, zero, 31
+; RV32B-NEXT: sub a5, a6, a4
+; RV32B-NEXT: srli a3, a0, 1
+; RV32B-NEXT: srl a3, a3, a5
+; RV32B-NEXT: or a7, a7, a3
+; RV32B-NEXT: addi t1, a4, -32
+; RV32B-NEXT: sll a5, a0, t1
+; RV32B-NEXT: slti a3, t1, 0
+; RV32B-NEXT: cmov a7, a3, a7, a5
+; RV32B-NEXT: neg a5, a2
+; RV32B-NEXT: srl t0, a1, a5
+; RV32B-NEXT: andi t2, a5, 63
+; RV32B-NEXT: addi a4, t2, -32
+; RV32B-NEXT: srai a3, a4, 31
+; RV32B-NEXT: and a3, a3, t0
+; RV32B-NEXT: or a7, a7, a3
+; RV32B-NEXT: srl t0, a0, a5
+; RV32B-NEXT: sub a5, a6, t2
+; RV32B-NEXT: slli a3, a1, 1
+; RV32B-NEXT: sll a3, a3, a5
+; RV32B-NEXT: or a3, t0, a3
+; RV32B-NEXT: srl a1, a1, a4
+; RV32B-NEXT: slti a4, a4, 0
+; RV32B-NEXT: cmov a1, a4, a3, a1
+; RV32B-NEXT: sll a0, a0, a2
+; RV32B-NEXT: srai a2, t1, 31
+; RV32B-NEXT: and a0, a2, a0
+; RV32B-NEXT: or a0, a0, a1
+; RV32B-NEXT: mv a1, a7
+; RV32B-NEXT: ret
+;
+; RV32ZBB-LABEL: rol_i64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: mv a7, a1
+; RV32ZBB-NEXT: andi a1, a2, 63
+; RV32ZBB-NEXT: addi t0, a1, -32
+; RV32ZBB-NEXT: addi a6, zero, 31
+; RV32ZBB-NEXT: bltz t0, .LBB7_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: sll a1, a0, t0
+; RV32ZBB-NEXT: j .LBB7_3
+; RV32ZBB-NEXT: .LBB7_2:
+; RV32ZBB-NEXT: sll a3, a7, a2
+; RV32ZBB-NEXT: sub a1, a6, a1
+; RV32ZBB-NEXT: srli a4, a0, 1
+; RV32ZBB-NEXT: srl a1, a4, a1
+; RV32ZBB-NEXT: or a1, a3, a1
+; RV32ZBB-NEXT: .LBB7_3:
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: andi a3, a5, 63
+; RV32ZBB-NEXT: addi a4, a3, -32
+; RV32ZBB-NEXT: bltz a4, .LBB7_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: srl a3, a7, a4
+; RV32ZBB-NEXT: bltz t0, .LBB7_6
+; RV32ZBB-NEXT: j .LBB7_7
+; RV32ZBB-NEXT: .LBB7_5:
+; RV32ZBB-NEXT: srl a4, a7, a5
+; RV32ZBB-NEXT: or a1, a1, a4
+; RV32ZBB-NEXT: srl a4, a0, a5
+; RV32ZBB-NEXT: sub a3, a6, a3
+; RV32ZBB-NEXT: slli a5, a7, 1
+; RV32ZBB-NEXT: sll a3, a5, a3
+; RV32ZBB-NEXT: or a3, a4, a3
+; RV32ZBB-NEXT: bgez t0, .LBB7_7
+; RV32ZBB-NEXT: .LBB7_6:
+; RV32ZBB-NEXT: sll a0, a0, a2
+; RV32ZBB-NEXT: or a3, a3, a0
+; RV32ZBB-NEXT: .LBB7_7:
+; RV32ZBB-NEXT: mv a0, a3
+; RV32ZBB-NEXT: ret
+;
+; RV32ZBP-LABEL: rol_i64:
+; RV32ZBP: # %bb.0:
+; RV32ZBP-NEXT: mv a7, a1
+; RV32ZBP-NEXT: andi a1, a2, 63
+; RV32ZBP-NEXT: addi t0, a1, -32
+; RV32ZBP-NEXT: addi a6, zero, 31
+; RV32ZBP-NEXT: bltz t0, .LBB7_2
+; RV32ZBP-NEXT: # %bb.1:
+; RV32ZBP-NEXT: sll a1, a0, t0
+; RV32ZBP-NEXT: j .LBB7_3
+; RV32ZBP-NEXT: .LBB7_2:
+; RV32ZBP-NEXT: sll a3, a7, a2
+; RV32ZBP-NEXT: sub a1, a6, a1
+; RV32ZBP-NEXT: srli a4, a0, 1
+; RV32ZBP-NEXT: srl a1, a4, a1
+; RV32ZBP-NEXT: or a1, a3, a1
+; RV32ZBP-NEXT: .LBB7_3:
+; RV32ZBP-NEXT: neg a5, a2
+; RV32ZBP-NEXT: andi a3, a5, 63
+; RV32ZBP-NEXT: addi a4, a3, -32
+; RV32ZBP-NEXT: bltz a4, .LBB7_5
+; RV32ZBP-NEXT: # %bb.4:
+; RV32ZBP-NEXT: srl a3, a7, a4
+; RV32ZBP-NEXT: bltz t0, .LBB7_6
+; RV32ZBP-NEXT: j .LBB7_7
+; RV32ZBP-NEXT: .LBB7_5:
+; RV32ZBP-NEXT: srl a4, a7, a5
+; RV32ZBP-NEXT: or a1, a1, a4
+; RV32ZBP-NEXT: srl a4, a0, a5
+; RV32ZBP-NEXT: sub a3, a6, a3
+; RV32ZBP-NEXT: slli a5, a7, 1
+; RV32ZBP-NEXT: sll a3, a5, a3
+; RV32ZBP-NEXT: or a3, a4, a3
+; RV32ZBP-NEXT: bgez t0, .LBB7_7
+; RV32ZBP-NEXT: .LBB7_6:
+; RV32ZBP-NEXT: sll a0, a0, a2
+; RV32ZBP-NEXT: or a3, a3, a0
+; RV32ZBP-NEXT: .LBB7_7:
+; RV32ZBP-NEXT: mv a0, a3
+; RV32ZBP-NEXT: ret
%or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b)
ret i64 %or
}
define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV32I-LABEL: ror_i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: mv t1, a0
+; RV32I-NEXT: mv t0, a0
; RV32I-NEXT: andi a0, a2, 63
; RV32I-NEXT: addi a7, a0, -32
; RV32I-NEXT: addi a6, zero, 31
; RV32I-NEXT: srl a0, a1, a7
; RV32I-NEXT: j .LBB9_3
; RV32I-NEXT: .LBB9_2:
-; RV32I-NEXT: srl a4, t1, a2
+; RV32I-NEXT: srl a3, t0, a2
; RV32I-NEXT: sub a0, a6, a0
-; RV32I-NEXT: slli a5, a1, 1
-; RV32I-NEXT: sll a0, a5, a0
-; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: slli a4, a1, 1
+; RV32I-NEXT: sll a0, a4, a0
+; RV32I-NEXT: or a0, a3, a0
; RV32I-NEXT: .LBB9_3:
; RV32I-NEXT: neg a5, a2
; RV32I-NEXT: andi a4, a5, 63
-; RV32I-NEXT: addi t0, a4, -32
-; RV32I-NEXT: bltz t0, .LBB9_5
+; RV32I-NEXT: addi a3, a4, -32
+; RV32I-NEXT: bltz a3, .LBB9_5
; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: sll a3, t1, t0
+; RV32I-NEXT: sll a3, t0, a3
; RV32I-NEXT: bltz a7, .LBB9_6
; RV32I-NEXT: j .LBB9_7
; RV32I-NEXT: .LBB9_5:
-; RV32I-NEXT: sll a3, t1, a5
+; RV32I-NEXT: sll a3, t0, a5
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: sll a3, a1, a5
; RV32I-NEXT: sub a4, a6, a4
-; RV32I-NEXT: srli a5, t1, 1
+; RV32I-NEXT: srli a5, t0, 1
; RV32I-NEXT: srl a4, a5, a4
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: bgez a7, .LBB9_7
; RV32I-NEXT: mv a1, a3
; RV32I-NEXT: ret
;
-; RV32IB-LABEL: ror_i64:
-; RV32IB: # %bb.0:
-; RV32IB-NEXT: srl a7, a0, a2
-; RV32IB-NEXT: andi a4, a2, 63
-; RV32IB-NEXT: addi a6, zero, 31
-; RV32IB-NEXT: sub a5, a6, a4
-; RV32IB-NEXT: slli a3, a1, 1
-; RV32IB-NEXT: sll a3, a3, a5
-; RV32IB-NEXT: or a3, a7, a3
-; RV32IB-NEXT: addi a7, a4, -32
-; RV32IB-NEXT: srl a5, a1, a7
-; RV32IB-NEXT: slti a4, a7, 0
-; RV32IB-NEXT: cmov t0, a4, a3, a5
-; RV32IB-NEXT: neg a4, a2
-; RV32IB-NEXT: sll t2, a0, a4
-; RV32IB-NEXT: andi a3, a4, 63
-; RV32IB-NEXT: addi t1, a3, -32
-; RV32IB-NEXT: srai a5, t1, 31
-; RV32IB-NEXT: and a5, a5, t2
-; RV32IB-NEXT: or t0, t0, a5
-; RV32IB-NEXT: sll a4, a1, a4
-; RV32IB-NEXT: sub a3, a6, a3
-; RV32IB-NEXT: srli a5, a0, 1
-; RV32IB-NEXT: srl a3, a5, a3
-; RV32IB-NEXT: or a3, a4, a3
-; RV32IB-NEXT: sll a0, a0, t1
-; RV32IB-NEXT: slti a4, t1, 0
-; RV32IB-NEXT: cmov a0, a4, a3, a0
-; RV32IB-NEXT: srl a1, a1, a2
-; RV32IB-NEXT: srai a2, a7, 31
-; RV32IB-NEXT: and a1, a2, a1
-; RV32IB-NEXT: or a1, a1, a0
-; RV32IB-NEXT: mv a0, t0
-; RV32IB-NEXT: ret
-;
-; RV32IBB-LABEL: ror_i64:
-; RV32IBB: # %bb.0:
-; RV32IBB-NEXT: mv t1, a0
-; RV32IBB-NEXT: andi a0, a2, 63
-; RV32IBB-NEXT: addi a7, a0, -32
-; RV32IBB-NEXT: addi a6, zero, 31
-; RV32IBB-NEXT: bltz a7, .LBB9_2
-; RV32IBB-NEXT: # %bb.1:
-; RV32IBB-NEXT: srl a0, a1, a7
-; RV32IBB-NEXT: j .LBB9_3
-; RV32IBB-NEXT: .LBB9_2:
-; RV32IBB-NEXT: srl a4, t1, a2
-; RV32IBB-NEXT: sub a0, a6, a0
-; RV32IBB-NEXT: slli a5, a1, 1
-; RV32IBB-NEXT: sll a0, a5, a0
-; RV32IBB-NEXT: or a0, a4, a0
-; RV32IBB-NEXT: .LBB9_3:
-; RV32IBB-NEXT: neg a5, a2
-; RV32IBB-NEXT: andi a4, a5, 63
-; RV32IBB-NEXT: addi t0, a4, -32
-; RV32IBB-NEXT: bltz t0, .LBB9_5
-; RV32IBB-NEXT: # %bb.4:
-; RV32IBB-NEXT: sll a3, t1, t0
-; RV32IBB-NEXT: bltz a7, .LBB9_6
-; RV32IBB-NEXT: j .LBB9_7
-; RV32IBB-NEXT: .LBB9_5:
-; RV32IBB-NEXT: sll a3, t1, a5
-; RV32IBB-NEXT: or a0, a0, a3
-; RV32IBB-NEXT: sll a3, a1, a5
-; RV32IBB-NEXT: sub a4, a6, a4
-; RV32IBB-NEXT: srli a5, t1, 1
-; RV32IBB-NEXT: srl a4, a5, a4
-; RV32IBB-NEXT: or a3, a3, a4
-; RV32IBB-NEXT: bgez a7, .LBB9_7
-; RV32IBB-NEXT: .LBB9_6:
-; RV32IBB-NEXT: srl a1, a1, a2
-; RV32IBB-NEXT: or a3, a3, a1
-; RV32IBB-NEXT: .LBB9_7:
-; RV32IBB-NEXT: mv a1, a3
-; RV32IBB-NEXT: ret
-;
-; RV32IBP-LABEL: ror_i64:
-; RV32IBP: # %bb.0:
-; RV32IBP-NEXT: mv t1, a0
-; RV32IBP-NEXT: andi a0, a2, 63
-; RV32IBP-NEXT: addi a7, a0, -32
-; RV32IBP-NEXT: addi a6, zero, 31
-; RV32IBP-NEXT: bltz a7, .LBB9_2
-; RV32IBP-NEXT: # %bb.1:
-; RV32IBP-NEXT: srl a0, a1, a7
-; RV32IBP-NEXT: j .LBB9_3
-; RV32IBP-NEXT: .LBB9_2:
-; RV32IBP-NEXT: srl a4, t1, a2
-; RV32IBP-NEXT: sub a0, a6, a0
-; RV32IBP-NEXT: slli a5, a1, 1
-; RV32IBP-NEXT: sll a0, a5, a0
-; RV32IBP-NEXT: or a0, a4, a0
-; RV32IBP-NEXT: .LBB9_3:
-; RV32IBP-NEXT: neg a5, a2
-; RV32IBP-NEXT: andi a4, a5, 63
-; RV32IBP-NEXT: addi t0, a4, -32
-; RV32IBP-NEXT: bltz t0, .LBB9_5
-; RV32IBP-NEXT: # %bb.4:
-; RV32IBP-NEXT: sll a3, t1, t0
-; RV32IBP-NEXT: bltz a7, .LBB9_6
-; RV32IBP-NEXT: j .LBB9_7
-; RV32IBP-NEXT: .LBB9_5:
-; RV32IBP-NEXT: sll a3, t1, a5
-; RV32IBP-NEXT: or a0, a0, a3
-; RV32IBP-NEXT: sll a3, a1, a5
-; RV32IBP-NEXT: sub a4, a6, a4
-; RV32IBP-NEXT: srli a5, t1, 1
-; RV32IBP-NEXT: srl a4, a5, a4
-; RV32IBP-NEXT: or a3, a3, a4
-; RV32IBP-NEXT: bgez a7, .LBB9_7
-; RV32IBP-NEXT: .LBB9_6:
-; RV32IBP-NEXT: srl a1, a1, a2
-; RV32IBP-NEXT: or a3, a3, a1
-; RV32IBP-NEXT: .LBB9_7:
-; RV32IBP-NEXT: mv a1, a3
-; RV32IBP-NEXT: ret
+; RV32B-LABEL: ror_i64:
+; RV32B: # %bb.0:
+; RV32B-NEXT: srl a7, a0, a2
+; RV32B-NEXT: andi a4, a2, 63
+; RV32B-NEXT: addi a6, zero, 31
+; RV32B-NEXT: sub a5, a6, a4
+; RV32B-NEXT: slli a3, a1, 1
+; RV32B-NEXT: sll a3, a3, a5
+; RV32B-NEXT: or a7, a7, a3
+; RV32B-NEXT: addi t1, a4, -32
+; RV32B-NEXT: srl a5, a1, t1
+; RV32B-NEXT: slti a3, t1, 0
+; RV32B-NEXT: cmov a7, a3, a7, a5
+; RV32B-NEXT: neg a5, a2
+; RV32B-NEXT: sll t0, a0, a5
+; RV32B-NEXT: andi t2, a5, 63
+; RV32B-NEXT: addi a4, t2, -32
+; RV32B-NEXT: srai a3, a4, 31
+; RV32B-NEXT: and a3, a3, t0
+; RV32B-NEXT: or a7, a7, a3
+; RV32B-NEXT: sll t0, a1, a5
+; RV32B-NEXT: sub a5, a6, t2
+; RV32B-NEXT: srli a3, a0, 1
+; RV32B-NEXT: srl a3, a3, a5
+; RV32B-NEXT: or a3, t0, a3
+; RV32B-NEXT: sll a0, a0, a4
+; RV32B-NEXT: slti a4, a4, 0
+; RV32B-NEXT: cmov a0, a4, a3, a0
+; RV32B-NEXT: srl a1, a1, a2
+; RV32B-NEXT: srai a2, t1, 31
+; RV32B-NEXT: and a1, a2, a1
+; RV32B-NEXT: or a1, a1, a0
+; RV32B-NEXT: mv a0, a7
+; RV32B-NEXT: ret
+;
+; RV32ZBB-LABEL: ror_i64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: mv t0, a0
+; RV32ZBB-NEXT: andi a0, a2, 63
+; RV32ZBB-NEXT: addi a7, a0, -32
+; RV32ZBB-NEXT: addi a6, zero, 31
+; RV32ZBB-NEXT: bltz a7, .LBB9_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a0, a1, a7
+; RV32ZBB-NEXT: j .LBB9_3
+; RV32ZBB-NEXT: .LBB9_2:
+; RV32ZBB-NEXT: srl a3, t0, a2
+; RV32ZBB-NEXT: sub a0, a6, a0
+; RV32ZBB-NEXT: slli a4, a1, 1
+; RV32ZBB-NEXT: sll a0, a4, a0
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: .LBB9_3:
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: andi a4, a5, 63
+; RV32ZBB-NEXT: addi a3, a4, -32
+; RV32ZBB-NEXT: bltz a3, .LBB9_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: sll a3, t0, a3
+; RV32ZBB-NEXT: bltz a7, .LBB9_6
+; RV32ZBB-NEXT: j .LBB9_7
+; RV32ZBB-NEXT: .LBB9_5:
+; RV32ZBB-NEXT: sll a3, t0, a5
+; RV32ZBB-NEXT: or a0, a0, a3
+; RV32ZBB-NEXT: sll a3, a1, a5
+; RV32ZBB-NEXT: sub a4, a6, a4
+; RV32ZBB-NEXT: srli a5, t0, 1
+; RV32ZBB-NEXT: srl a4, a5, a4
+; RV32ZBB-NEXT: or a3, a3, a4
+; RV32ZBB-NEXT: bgez a7, .LBB9_7
+; RV32ZBB-NEXT: .LBB9_6:
+; RV32ZBB-NEXT: srl a1, a1, a2
+; RV32ZBB-NEXT: or a3, a3, a1
+; RV32ZBB-NEXT: .LBB9_7:
+; RV32ZBB-NEXT: mv a1, a3
+; RV32ZBB-NEXT: ret
+;
+; RV32ZBP-LABEL: ror_i64:
+; RV32ZBP: # %bb.0:
+; RV32ZBP-NEXT: mv t0, a0
+; RV32ZBP-NEXT: andi a0, a2, 63
+; RV32ZBP-NEXT: addi a7, a0, -32
+; RV32ZBP-NEXT: addi a6, zero, 31
+; RV32ZBP-NEXT: bltz a7, .LBB9_2
+; RV32ZBP-NEXT: # %bb.1:
+; RV32ZBP-NEXT: srl a0, a1, a7
+; RV32ZBP-NEXT: j .LBB9_3
+; RV32ZBP-NEXT: .LBB9_2:
+; RV32ZBP-NEXT: srl a3, t0, a2
+; RV32ZBP-NEXT: sub a0, a6, a0
+; RV32ZBP-NEXT: slli a4, a1, 1
+; RV32ZBP-NEXT: sll a0, a4, a0
+; RV32ZBP-NEXT: or a0, a3, a0
+; RV32ZBP-NEXT: .LBB9_3:
+; RV32ZBP-NEXT: neg a5, a2
+; RV32ZBP-NEXT: andi a4, a5, 63
+; RV32ZBP-NEXT: addi a3, a4, -32
+; RV32ZBP-NEXT: bltz a3, .LBB9_5
+; RV32ZBP-NEXT: # %bb.4:
+; RV32ZBP-NEXT: sll a3, t0, a3
+; RV32ZBP-NEXT: bltz a7, .LBB9_6
+; RV32ZBP-NEXT: j .LBB9_7
+; RV32ZBP-NEXT: .LBB9_5:
+; RV32ZBP-NEXT: sll a3, t0, a5
+; RV32ZBP-NEXT: or a0, a0, a3
+; RV32ZBP-NEXT: sll a3, a1, a5
+; RV32ZBP-NEXT: sub a4, a6, a4
+; RV32ZBP-NEXT: srli a5, t0, 1
+; RV32ZBP-NEXT: srl a4, a5, a4
+; RV32ZBP-NEXT: or a3, a3, a4
+; RV32ZBP-NEXT: bgez a7, .LBB9_7
+; RV32ZBP-NEXT: .LBB9_6:
+; RV32ZBP-NEXT: srl a1, a1, a2
+; RV32ZBP-NEXT: or a3, a3, a1
+; RV32ZBP-NEXT: .LBB9_7:
+; RV32ZBP-NEXT: mv a1, a3
+; RV32ZBP-NEXT: ret
%or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b)
ret i64 %or
}
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s0, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
; RV32I-NEXT: not a0, a0
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: addi s5, a2, 1365
+; RV32I-NEXT: and a1, a1, s5
; RV32I-NEXT: sub a0, a0, a1
; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s5, a1, 819
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: addi s1, a1, 819
+; RV32I-NEXT: and a1, a0, s1
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s1
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: addi s6, a1, -241
; RV32I-NEXT: and a0, a0, s6
; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s0, a1, 257
+; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: call __mulsi3@plt
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: srli a0, s1, 1
-; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: srli a0, s4, 1
+; RV32I-NEXT: or a0, s4, a0
; RV32I-NEXT: srli a1, a0, 2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: not a0, a0
; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, s5
; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, s1
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s1
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: call __mulsi3@plt
-; RV32I-NEXT: bnez s0, .LBB1_2
+; RV32I-NEXT: bnez s3, .LBB1_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: srli a0, a0, 24
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s1, a1
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: not a1, s0
+; RV32I-NEXT: not a1, s4
; RV32I-NEXT: and a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: addi s5, a2, 1365
+; RV32I-NEXT: and a1, a1, s5
; RV32I-NEXT: sub a0, a0, a1
; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s5, a1, 819
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: addi s6, a1, -241
; RV32I-NEXT: and a0, a0, s6
; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: addi a0, s1, -1
-; RV32I-NEXT: not a1, s1
+; RV32I-NEXT: addi a0, s3, -1
+; RV32I-NEXT: not a1, s3
; RV32I-NEXT: and a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, s5
; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
-; RV32I-NEXT: bnez s0, .LBB3_2
+; RV32I-NEXT: bnez s4, .LBB3_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: srli a0, a0, 24
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s2, a2, 1365
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: addi s3, a2, 1365
+; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: sub a0, a1, a0
; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s1, a1, 819
-; RV32I-NEXT: and a1, a0, s1
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s1
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: addi s4, a1, -241
; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
; RV32I-NEXT: srli s5, a0, 24
-; RV32I-NEXT: srli a0, s0, 1
-; RV32I-NEXT: and a0, a0, s2
-; RV32I-NEXT: sub a0, s0, a0
-; RV32I-NEXT: and a1, a0, s1
+; RV32I-NEXT: srli a0, s2, 1
+; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: sub a0, s2, a0
+; RV32I-NEXT: and a1, a0, s0
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s1
+; RV32I-NEXT: and a0, a0, s0
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: and a0, a0, s4
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3@plt
; RV32I-NEXT: srli a0, a0, 24
; RV32I-NEXT: add a0, a0, s5
; RV32I-NEXT: slli a2, a0, 1
; RV32I-NEXT: slli a3, a1, 1
; RV32I-NEXT: lui a4, 699051
-; RV32I-NEXT: addi a6, a4, -1366
-; RV32I-NEXT: and a7, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: addi a4, a4, -1366
+; RV32I-NEXT: and a6, a3, a4
+; RV32I-NEXT: and a7, a2, a4
; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: srli a4, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: addi t0, a3, 1365
-; RV32I-NEXT: and a4, a4, t0
-; RV32I-NEXT: and a5, a5, t0
+; RV32I-NEXT: srli a3, a0, 1
+; RV32I-NEXT: lui a2, 349525
+; RV32I-NEXT: addi a2, a2, 1365
+; RV32I-NEXT: and a3, a3, a2
+; RV32I-NEXT: and a5, a5, a2
; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: or a0, a0, a2
-; RV32I-NEXT: or a1, a1, a7
-; RV32I-NEXT: slli a2, a1, 2
-; RV32I-NEXT: slli a4, a0, 2
-; RV32I-NEXT: lui a5, 838861
-; RV32I-NEXT: addi a5, a5, -820
-; RV32I-NEXT: and a7, a4, a5
-; RV32I-NEXT: and a2, a2, a5
-; RV32I-NEXT: srli a5, a0, 2
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: slli a6, a1, 2
+; RV32I-NEXT: slli a5, a0, 2
+; RV32I-NEXT: lui a3, 838861
+; RV32I-NEXT: addi a3, a3, -820
+; RV32I-NEXT: and a7, a5, a3
+; RV32I-NEXT: and a6, a6, a3
+; RV32I-NEXT: srli t0, a0, 2
; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: and a4, a5, a4
-; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: lui a5, 209715
+; RV32I-NEXT: addi a5, a5, 819
+; RV32I-NEXT: and a3, a3, a5
+; RV32I-NEXT: and a5, t0, a5
+; RV32I-NEXT: or a0, a5, a0
; RV32I-NEXT: or a1, a3, a1
-; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: or a0, a0, a7
-; RV32I-NEXT: slli a2, a0, 1
-; RV32I-NEXT: slli a3, a1, 1
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: slli a3, a0, 1
+; RV32I-NEXT: slli a5, a1, 1
+; RV32I-NEXT: and a6, a5, a4
+; RV32I-NEXT: and a3, a3, a4
; RV32I-NEXT: srli a4, a1, 1
; RV32I-NEXT: srli a5, a0, 1
-; RV32I-NEXT: and a5, a5, t0
-; RV32I-NEXT: and a4, a4, t0
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: and a5, a5, a2
+; RV32I-NEXT: and a2, a4, a2
+; RV32I-NEXT: or a1, a2, a1
; RV32I-NEXT: or a0, a5, a0
-; RV32I-NEXT: or a0, a0, a2
-; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: ret
;
; RV32B-LABEL: gorc3b_i64:
; RV32I-NEXT: slli a2, a0, 1
; RV32I-NEXT: slli a3, a1, 1
; RV32I-NEXT: lui a4, 699051
-; RV32I-NEXT: addi a6, a4, -1366
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: addi a4, a4, -1366
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: and a2, a2, a4
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: srli a1, a1, 1
; RV32I-NEXT: lui a5, 349525
; RV32I-NEXT: and a0, a0, a5
; RV32I-NEXT: or a0, a2, a0
; RV32I-NEXT: or a1, a3, a1
-; RV32I-NEXT: slli a2, a1, 2
+; RV32I-NEXT: slli a6, a1, 2
; RV32I-NEXT: slli a3, a0, 2
-; RV32I-NEXT: lui a4, 838861
-; RV32I-NEXT: addi a4, a4, -820
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: and a2, a2, a4
+; RV32I-NEXT: lui a2, 838861
+; RV32I-NEXT: addi a2, a2, -820
+; RV32I-NEXT: and a7, a3, a2
+; RV32I-NEXT: and a2, a6, a2
; RV32I-NEXT: srli a1, a1, 2
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: and a1, a1, a4
+; RV32I-NEXT: lui a3, 209715
+; RV32I-NEXT: addi a3, a3, 819
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: or a1, a2, a1
-; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a0, a7, a0
; RV32I-NEXT: slli a2, a0, 1
; RV32I-NEXT: slli a3, a1, 1
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: and a2, a2, a4
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: srli a1, a1, 1
; RV32I-NEXT: and a1, a1, a5
; RV32I-NEXT: slli a2, a1, 1
; RV32I-NEXT: slli a3, a0, 1
; RV32I-NEXT: lui a4, 699051
-; RV32I-NEXT: addi a6, a4, -1366
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: addi a4, a4, -1366
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: and a2, a2, a4
; RV32I-NEXT: srli a1, a1, 1
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: lui a5, 349525
-; RV32I-NEXT: addi a7, a5, 1365
-; RV32I-NEXT: and a0, a0, a7
-; RV32I-NEXT: and a1, a1, a7
+; RV32I-NEXT: addi a5, a5, 1365
+; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a1, a1, a5
; RV32I-NEXT: or a1, a2, a1
; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: slli a2, a0, 2
+; RV32I-NEXT: slli a6, a0, 2
; RV32I-NEXT: slli a3, a1, 2
-; RV32I-NEXT: lui a4, 838861
-; RV32I-NEXT: addi a4, a4, -820
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: and a2, a2, a4
+; RV32I-NEXT: lui a2, 838861
+; RV32I-NEXT: addi a2, a2, -820
+; RV32I-NEXT: and a7, a3, a2
+; RV32I-NEXT: and a6, a6, a2
; RV32I-NEXT: srli a0, a0, 2
; RV32I-NEXT: srli a1, a1, 2
-; RV32I-NEXT: lui a5, 209715
-; RV32I-NEXT: addi a5, a5, 819
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: or a0, a2, a0
-; RV32I-NEXT: or a1, a3, a1
-; RV32I-NEXT: slli a2, a1, 1
-; RV32I-NEXT: slli a3, a0, 1
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: lui a3, 209715
+; RV32I-NEXT: addi a3, a3, 819
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: or t0, a6, a0
+; RV32I-NEXT: or a1, a7, a1
+; RV32I-NEXT: slli a6, a1, 1
+; RV32I-NEXT: slli a0, t0, 1
+; RV32I-NEXT: and a7, a0, a4
+; RV32I-NEXT: and a4, a6, a4
; RV32I-NEXT: srli a1, a1, 1
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: and a0, a0, a7
-; RV32I-NEXT: and a1, a1, a7
-; RV32I-NEXT: or a1, a2, a1
-; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: slli a2, a0, 2
-; RV32I-NEXT: slli a3, a1, 2
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: and a2, a2, a4
+; RV32I-NEXT: srli a0, t0, 1
+; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a1, a1, a5
+; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: or a0, a7, a0
+; RV32I-NEXT: slli a4, a0, 2
+; RV32I-NEXT: slli a5, a1, 2
+; RV32I-NEXT: and a5, a5, a2
+; RV32I-NEXT: and a2, a4, a2
; RV32I-NEXT: srli a0, a0, 2
; RV32I-NEXT: srli a1, a1, 2
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: and a0, a0, a3
; RV32I-NEXT: or a0, a2, a0
-; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: or a1, a5, a1
; RV32I-NEXT: ret
;
; RV32B-LABEL: grev0_i64:
; RV32I: # %bb.0:
; RV32I-NEXT: srli a2, a1, 8
; RV32I-NEXT: lui a3, 16
-; RV32I-NEXT: addi a6, a3, -256
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: addi t0, a3, -256
+; RV32I-NEXT: and a2, a2, t0
; RV32I-NEXT: srli a4, a1, 24
; RV32I-NEXT: or a2, a2, a4
; RV32I-NEXT: slli a4, a1, 8
-; RV32I-NEXT: lui a7, 4080
-; RV32I-NEXT: and a4, a4, a7
+; RV32I-NEXT: lui a6, 4080
+; RV32I-NEXT: and a4, a4, a6
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: lui a2, 61681
-; RV32I-NEXT: addi t0, a2, -241
-; RV32I-NEXT: and a2, a1, t0
+; RV32I-NEXT: addi t1, a2, -241
+; RV32I-NEXT: and a2, a1, t1
; RV32I-NEXT: slli a2, a2, 4
-; RV32I-NEXT: lui a3, 986895
-; RV32I-NEXT: addi t1, a3, 240
-; RV32I-NEXT: and a1, a1, t1
+; RV32I-NEXT: lui a5, 986895
+; RV32I-NEXT: addi t2, a5, 240
+; RV32I-NEXT: and a1, a1, t2
; RV32I-NEXT: srli a1, a1, 4
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: lui a2, 209715
-; RV32I-NEXT: addi t2, a2, 819
-; RV32I-NEXT: and a2, a1, t2
-; RV32I-NEXT: slli a2, a2, 2
+; RV32I-NEXT: addi t3, a2, 819
+; RV32I-NEXT: and a3, a1, t3
+; RV32I-NEXT: slli a3, a3, 2
; RV32I-NEXT: lui a4, 838861
-; RV32I-NEXT: addi t3, a4, -820
-; RV32I-NEXT: and a1, a1, t3
+; RV32I-NEXT: addi a4, a4, -820
+; RV32I-NEXT: and a1, a1, a4
; RV32I-NEXT: srli a1, a1, 2
-; RV32I-NEXT: or a1, a1, a2
-; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi a3, a2, 1365
-; RV32I-NEXT: and a2, a1, a3
-; RV32I-NEXT: slli a2, a2, 1
-; RV32I-NEXT: lui a5, 699051
-; RV32I-NEXT: addi a5, a5, -1366
-; RV32I-NEXT: and a1, a1, a5
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: lui a3, 349525
+; RV32I-NEXT: addi a3, a3, 1365
+; RV32I-NEXT: and a5, a1, a3
+; RV32I-NEXT: slli a5, a5, 1
+; RV32I-NEXT: lui a2, 699051
+; RV32I-NEXT: addi a2, a2, -1366
+; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: srli a1, a1, 1
-; RV32I-NEXT: or a2, a1, a2
+; RV32I-NEXT: or a7, a1, a5
; RV32I-NEXT: srli a1, a0, 8
-; RV32I-NEXT: and a1, a1, a6
-; RV32I-NEXT: srli a4, a0, 24
-; RV32I-NEXT: or a1, a1, a4
-; RV32I-NEXT: slli a4, a0, 8
-; RV32I-NEXT: and a4, a4, a7
+; RV32I-NEXT: and a1, a1, t0
+; RV32I-NEXT: srli a5, a0, 24
+; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: slli a5, a0, 8
+; RV32I-NEXT: and a5, a5, a6
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: or a0, a0, a5
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: and a1, a0, t0
+; RV32I-NEXT: and a1, a0, t1
; RV32I-NEXT: slli a1, a1, 4
-; RV32I-NEXT: and a0, a0, t1
+; RV32I-NEXT: and a0, a0, t2
; RV32I-NEXT: srli a0, a0, 4
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: and a1, a0, t2
+; RV32I-NEXT: and a1, a0, t3
; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: and a0, a0, t3
+; RV32I-NEXT: and a0, a0, a4
; RV32I-NEXT: srli a0, a0, 2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: and a1, a0, a3
; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: mv a0, a7
; RV32I-NEXT: ret
;
; RV32B-LABEL: bitreverse_i64:
; RV32I: # %bb.0:
; RV32I-NEXT: srli a3, a1, 8
; RV32I-NEXT: lui a2, 16
-; RV32I-NEXT: addi t0, a2, -256
-; RV32I-NEXT: and a3, a3, t0
+; RV32I-NEXT: addi t1, a2, -256
+; RV32I-NEXT: and a3, a3, t1
; RV32I-NEXT: srli a4, a1, 24
; RV32I-NEXT: or a4, a3, a4
; RV32I-NEXT: slli a5, a1, 8
-; RV32I-NEXT: lui t1, 4080
-; RV32I-NEXT: and a5, a5, t1
+; RV32I-NEXT: lui a6, 4080
+; RV32I-NEXT: and a5, a5, a6
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: lui a4, 61681
-; RV32I-NEXT: addi a6, a4, -241
-; RV32I-NEXT: and a5, a1, a6
+; RV32I-NEXT: addi a7, a4, -241
+; RV32I-NEXT: and a5, a1, a7
; RV32I-NEXT: slli a5, a5, 4
-; RV32I-NEXT: lui a4, 986895
-; RV32I-NEXT: addi a7, a4, 240
-; RV32I-NEXT: and a1, a1, a7
+; RV32I-NEXT: lui a3, 986895
+; RV32I-NEXT: addi t0, a3, 240
+; RV32I-NEXT: and a1, a1, t0
; RV32I-NEXT: srli a1, a1, 4
; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: lui a5, 209715
; RV32I-NEXT: addi t2, a5, 819
; RV32I-NEXT: and a4, a1, t2
; RV32I-NEXT: slli a4, a4, 2
-; RV32I-NEXT: lui a2, 838861
-; RV32I-NEXT: addi t3, a2, -820
+; RV32I-NEXT: lui a3, 838861
+; RV32I-NEXT: addi t3, a3, -820
; RV32I-NEXT: and a1, a1, t3
; RV32I-NEXT: srli a1, a1, 2
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: lui a4, 349525
; RV32I-NEXT: addi a4, a4, 1365
-; RV32I-NEXT: and a3, a1, a4
-; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: and a2, a1, a4
+; RV32I-NEXT: slli a2, a2, 1
; RV32I-NEXT: lui a5, 699051
; RV32I-NEXT: addi a5, a5, -1366
; RV32I-NEXT: and a1, a1, a5
; RV32I-NEXT: srli a1, a1, 1
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: srli a3, a0, 8
-; RV32I-NEXT: and a3, a3, t0
-; RV32I-NEXT: srli a2, a0, 24
-; RV32I-NEXT: or a2, a3, a2
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: srli a2, a0, 8
+; RV32I-NEXT: and a2, a2, t1
+; RV32I-NEXT: srli a3, a0, 24
+; RV32I-NEXT: or a2, a2, a3
; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: and a3, a3, t1
+; RV32I-NEXT: and a3, a3, a6
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a2
-; RV32I-NEXT: and a2, a0, a6
+; RV32I-NEXT: and a2, a0, a7
; RV32I-NEXT: slli a2, a2, 4
-; RV32I-NEXT: and a0, a0, a7
+; RV32I-NEXT: and a0, a0, t0
; RV32I-NEXT: srli a0, a0, 4
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: and a2, a0, t2
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: srli a2, a0, 8
-; RV32I-NEXT: and a2, a2, t0
+; RV32I-NEXT: and a2, a2, t1
; RV32I-NEXT: srli a3, a0, 24
; RV32I-NEXT: or a2, a2, a3
; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: and a3, a3, t1
+; RV32I-NEXT: and a3, a3, a6
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: srli a2, a1, 8
-; RV32I-NEXT: and a2, a2, t0
+; RV32I-NEXT: and a2, a2, t1
; RV32I-NEXT: srli a3, a1, 24
; RV32I-NEXT: or a2, a2, a3
; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: and a3, a3, t1
+; RV32I-NEXT: and a3, a3, a6
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: srl a1, a1, a5
; RV32I-NEXT: or a1, t0, a1
; RV32I-NEXT: .LBB13_3:
-; RV32I-NEXT: not t0, a4
-; RV32I-NEXT: andi t3, t0, 63
-; RV32I-NEXT: addi t2, t3, -32
-; RV32I-NEXT: srli t1, a3, 1
-; RV32I-NEXT: bltz t2, .LBB13_5
+; RV32I-NEXT: not t2, a4
+; RV32I-NEXT: andi t1, t2, 63
+; RV32I-NEXT: addi a5, t1, -32
+; RV32I-NEXT: srli t0, a3, 1
+; RV32I-NEXT: bltz a5, .LBB13_5
; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: srl a2, t1, t2
+; RV32I-NEXT: srl a2, t0, a5
; RV32I-NEXT: bltz a7, .LBB13_6
; RV32I-NEXT: j .LBB13_7
; RV32I-NEXT: .LBB13_5:
-; RV32I-NEXT: srl a5, t1, t0
+; RV32I-NEXT: srl a5, t0, t2
; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: slli a3, a3, 31
; RV32I-NEXT: srli a2, a2, 1
; RV32I-NEXT: or a2, a2, a3
-; RV32I-NEXT: srl a2, a2, t0
-; RV32I-NEXT: sub a3, a6, t3
-; RV32I-NEXT: slli a5, t1, 1
+; RV32I-NEXT: srl a2, a2, t2
+; RV32I-NEXT: sub a3, a6, t1
+; RV32I-NEXT: slli a5, t0, 1
; RV32I-NEXT: sll a3, a5, a3
; RV32I-NEXT: or a2, a2, a3
; RV32I-NEXT: bgez a7, .LBB13_7
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
;
-; RV32IB-LABEL: fshl_i64:
-; RV32IB: # %bb.0:
-; RV32IB-NEXT: sll a7, a1, a4
-; RV32IB-NEXT: andi a5, a4, 63
-; RV32IB-NEXT: addi a6, zero, 31
-; RV32IB-NEXT: sub t0, a6, a5
-; RV32IB-NEXT: srli a1, a0, 1
-; RV32IB-NEXT: srl a1, a1, t0
-; RV32IB-NEXT: or t0, a7, a1
-; RV32IB-NEXT: addi a7, a5, -32
-; RV32IB-NEXT: sll a5, a0, a7
-; RV32IB-NEXT: slti a1, a7, 0
-; RV32IB-NEXT: cmov t1, a1, t0, a5
-; RV32IB-NEXT: not t0, a4
-; RV32IB-NEXT: srli a5, a3, 1
-; RV32IB-NEXT: srl t2, a5, t0
-; RV32IB-NEXT: addi a1, zero, 63
-; RV32IB-NEXT: andn t3, a1, a4
-; RV32IB-NEXT: addi t4, t3, -32
-; RV32IB-NEXT: srai a1, t4, 31
-; RV32IB-NEXT: and a1, a1, t2
-; RV32IB-NEXT: or a1, t1, a1
-; RV32IB-NEXT: fsri a2, a2, a3, 1
-; RV32IB-NEXT: srl t0, a2, t0
-; RV32IB-NEXT: sub a3, a6, t3
-; RV32IB-NEXT: slli a2, a5, 1
-; RV32IB-NEXT: sll a2, a2, a3
-; RV32IB-NEXT: or a2, t0, a2
-; RV32IB-NEXT: srl a3, a5, t4
-; RV32IB-NEXT: slti a5, t4, 0
-; RV32IB-NEXT: cmov a2, a5, a2, a3
-; RV32IB-NEXT: sll a0, a0, a4
-; RV32IB-NEXT: srai a3, a7, 31
-; RV32IB-NEXT: and a0, a3, a0
-; RV32IB-NEXT: or a0, a0, a2
-; RV32IB-NEXT: ret
-;
-; RV32IBT-LABEL: fshl_i64:
-; RV32IBT: # %bb.0:
-; RV32IBT-NEXT: sll a7, a1, a4
-; RV32IBT-NEXT: andi a5, a4, 63
-; RV32IBT-NEXT: addi a6, zero, 31
-; RV32IBT-NEXT: sub t0, a6, a5
-; RV32IBT-NEXT: srli a1, a0, 1
-; RV32IBT-NEXT: srl a1, a1, t0
-; RV32IBT-NEXT: or t0, a7, a1
-; RV32IBT-NEXT: addi a7, a5, -32
-; RV32IBT-NEXT: sll a5, a0, a7
-; RV32IBT-NEXT: slti a1, a7, 0
-; RV32IBT-NEXT: cmov t1, a1, t0, a5
-; RV32IBT-NEXT: not t0, a4
-; RV32IBT-NEXT: srli a5, a3, 1
-; RV32IBT-NEXT: srl t4, a5, t0
-; RV32IBT-NEXT: andi t2, t0, 63
-; RV32IBT-NEXT: addi t3, t2, -32
-; RV32IBT-NEXT: srai a1, t3, 31
-; RV32IBT-NEXT: and a1, a1, t4
-; RV32IBT-NEXT: or a1, t1, a1
-; RV32IBT-NEXT: fsri a2, a2, a3, 1
-; RV32IBT-NEXT: srl t0, a2, t0
-; RV32IBT-NEXT: sub a3, a6, t2
-; RV32IBT-NEXT: slli a2, a5, 1
-; RV32IBT-NEXT: sll a2, a2, a3
-; RV32IBT-NEXT: or a2, t0, a2
-; RV32IBT-NEXT: srl a3, a5, t3
-; RV32IBT-NEXT: slti a5, t3, 0
-; RV32IBT-NEXT: cmov a2, a5, a2, a3
-; RV32IBT-NEXT: sll a0, a0, a4
-; RV32IBT-NEXT: srai a3, a7, 31
-; RV32IBT-NEXT: and a0, a3, a0
-; RV32IBT-NEXT: or a0, a0, a2
-; RV32IBT-NEXT: ret
+; RV32B-LABEL: fshl_i64:
+; RV32B: # %bb.0:
+; RV32B-NEXT: sll a7, a1, a4
+; RV32B-NEXT: andi a5, a4, 63
+; RV32B-NEXT: addi a6, zero, 31
+; RV32B-NEXT: sub t0, a6, a5
+; RV32B-NEXT: srli a1, a0, 1
+; RV32B-NEXT: srl a1, a1, t0
+; RV32B-NEXT: or a7, a7, a1
+; RV32B-NEXT: addi t1, a5, -32
+; RV32B-NEXT: sll t0, a0, t1
+; RV32B-NEXT: slti a1, t1, 0
+; RV32B-NEXT: cmov t0, a1, a7, t0
+; RV32B-NEXT: not a7, a4
+; RV32B-NEXT: srli t4, a3, 1
+; RV32B-NEXT: srl t2, t4, a7
+; RV32B-NEXT: addi a1, zero, 63
+; RV32B-NEXT: andn t3, a1, a4
+; RV32B-NEXT: addi a5, t3, -32
+; RV32B-NEXT: srai a1, a5, 31
+; RV32B-NEXT: and a1, a1, t2
+; RV32B-NEXT: or a1, t0, a1
+; RV32B-NEXT: fsri a2, a2, a3, 1
+; RV32B-NEXT: srl a7, a2, a7
+; RV32B-NEXT: sub a3, a6, t3
+; RV32B-NEXT: slli a2, t4, 1
+; RV32B-NEXT: sll a2, a2, a3
+; RV32B-NEXT: or a2, a7, a2
+; RV32B-NEXT: srl a3, t4, a5
+; RV32B-NEXT: slti a5, a5, 0
+; RV32B-NEXT: cmov a2, a5, a2, a3
+; RV32B-NEXT: sll a0, a0, a4
+; RV32B-NEXT: srai a3, t1, 31
+; RV32B-NEXT: and a0, a3, a0
+; RV32B-NEXT: or a0, a0, a2
+; RV32B-NEXT: ret
+;
+; RV32ZBT-LABEL: fshl_i64:
+; RV32ZBT: # %bb.0:
+; RV32ZBT-NEXT: sll a7, a1, a4
+; RV32ZBT-NEXT: andi a5, a4, 63
+; RV32ZBT-NEXT: addi a6, zero, 31
+; RV32ZBT-NEXT: sub t0, a6, a5
+; RV32ZBT-NEXT: srli a1, a0, 1
+; RV32ZBT-NEXT: srl a1, a1, t0
+; RV32ZBT-NEXT: or a7, a7, a1
+; RV32ZBT-NEXT: addi t1, a5, -32
+; RV32ZBT-NEXT: sll t0, a0, t1
+; RV32ZBT-NEXT: slti a1, t1, 0
+; RV32ZBT-NEXT: cmov t0, a1, a7, t0
+; RV32ZBT-NEXT: not a5, a4
+; RV32ZBT-NEXT: srli a7, a3, 1
+; RV32ZBT-NEXT: srl t4, a7, a5
+; RV32ZBT-NEXT: andi t2, a5, 63
+; RV32ZBT-NEXT: addi t3, t2, -32
+; RV32ZBT-NEXT: srai a1, t3, 31
+; RV32ZBT-NEXT: and a1, a1, t4
+; RV32ZBT-NEXT: or a1, t0, a1
+; RV32ZBT-NEXT: fsri a2, a2, a3, 1
+; RV32ZBT-NEXT: srl a2, a2, a5
+; RV32ZBT-NEXT: sub a3, a6, t2
+; RV32ZBT-NEXT: slli a5, a7, 1
+; RV32ZBT-NEXT: sll a3, a5, a3
+; RV32ZBT-NEXT: or a2, a2, a3
+; RV32ZBT-NEXT: srl a3, a7, t3
+; RV32ZBT-NEXT: slti a5, t3, 0
+; RV32ZBT-NEXT: cmov a2, a5, a2, a3
+; RV32ZBT-NEXT: sll a0, a0, a4
+; RV32ZBT-NEXT: srai a3, t1, 31
+; RV32ZBT-NEXT: and a0, a3, a0
+; RV32ZBT-NEXT: or a0, a0, a2
+; RV32ZBT-NEXT: ret
%1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c)
ret i64 %1
}
define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind {
; RV32I-LABEL: fshr_i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: mv t1, a0
+; RV32I-NEXT: mv t0, a0
; RV32I-NEXT: andi a0, a4, 63
; RV32I-NEXT: addi a6, a0, -32
; RV32I-NEXT: addi a7, zero, 31
; RV32I-NEXT: sll a0, a5, a0
; RV32I-NEXT: or a0, a2, a0
; RV32I-NEXT: .LBB15_3:
-; RV32I-NEXT: not t0, a4
-; RV32I-NEXT: andi a2, t0, 63
-; RV32I-NEXT: addi t2, a2, -32
-; RV32I-NEXT: slli a5, t1, 1
-; RV32I-NEXT: bltz t2, .LBB15_5
+; RV32I-NEXT: not t2, a4
+; RV32I-NEXT: andi a5, t2, 63
+; RV32I-NEXT: addi a2, a5, -32
+; RV32I-NEXT: slli t1, t0, 1
+; RV32I-NEXT: bltz a2, .LBB15_5
; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: sll a1, a5, t2
+; RV32I-NEXT: sll a1, t1, a2
; RV32I-NEXT: bltz a6, .LBB15_6
; RV32I-NEXT: j .LBB15_7
; RV32I-NEXT: .LBB15_5:
-; RV32I-NEXT: sll a5, a5, t0
-; RV32I-NEXT: or a0, a0, a5
-; RV32I-NEXT: lui a5, 524288
-; RV32I-NEXT: addi a5, a5, -1
-; RV32I-NEXT: and a5, t1, a5
-; RV32I-NEXT: sub a2, a7, a2
-; RV32I-NEXT: srl a2, a5, a2
-; RV32I-NEXT: srli a5, t1, 31
+; RV32I-NEXT: sll a2, t1, t2
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: lui a2, 524288
+; RV32I-NEXT: addi a2, a2, -1
+; RV32I-NEXT: and a2, t0, a2
+; RV32I-NEXT: sub a5, a7, a5
+; RV32I-NEXT: srl a2, a2, a5
+; RV32I-NEXT: srli a5, t0, 31
; RV32I-NEXT: slli a1, a1, 1
; RV32I-NEXT: or a1, a1, a5
-; RV32I-NEXT: sll a1, a1, t0
+; RV32I-NEXT: sll a1, a1, t2
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: bgez a6, .LBB15_7
; RV32I-NEXT: .LBB15_6:
; RV32I-NEXT: .LBB15_7:
; RV32I-NEXT: ret
;
-; RV32IB-LABEL: fshr_i64:
-; RV32IB: # %bb.0:
-; RV32IB-NEXT: srl a7, a2, a4
-; RV32IB-NEXT: andi a5, a4, 63
-; RV32IB-NEXT: addi a6, zero, 31
-; RV32IB-NEXT: sub t0, a6, a5
-; RV32IB-NEXT: slli a2, a3, 1
-; RV32IB-NEXT: sll a2, a2, t0
-; RV32IB-NEXT: or t0, a7, a2
-; RV32IB-NEXT: addi a7, a5, -32
-; RV32IB-NEXT: srl a5, a3, a7
-; RV32IB-NEXT: slti a2, a7, 0
-; RV32IB-NEXT: cmov t1, a2, t0, a5
-; RV32IB-NEXT: not t0, a4
-; RV32IB-NEXT: slli t4, a0, 1
-; RV32IB-NEXT: sll t2, t4, t0
-; RV32IB-NEXT: addi a2, zero, 63
-; RV32IB-NEXT: andn a2, a2, a4
-; RV32IB-NEXT: addi t3, a2, -32
-; RV32IB-NEXT: srai a5, t3, 31
-; RV32IB-NEXT: and a5, a5, t2
-; RV32IB-NEXT: or t1, a5, t1
-; RV32IB-NEXT: fsri a1, a0, a1, 31
-; RV32IB-NEXT: sll a1, a1, t0
-; RV32IB-NEXT: sub a2, a6, a2
-; RV32IB-NEXT: bclri a0, a0, 31
-; RV32IB-NEXT: srl a0, a0, a2
-; RV32IB-NEXT: or a0, a1, a0
-; RV32IB-NEXT: sll a1, t4, t3
-; RV32IB-NEXT: slti a2, t3, 0
-; RV32IB-NEXT: cmov a0, a2, a0, a1
-; RV32IB-NEXT: srl a1, a3, a4
-; RV32IB-NEXT: srai a2, a7, 31
-; RV32IB-NEXT: and a1, a2, a1
-; RV32IB-NEXT: or a1, a0, a1
-; RV32IB-NEXT: mv a0, t1
-; RV32IB-NEXT: ret
-;
-; RV32IBT-LABEL: fshr_i64:
-; RV32IBT: # %bb.0:
-; RV32IBT-NEXT: srl a7, a2, a4
-; RV32IBT-NEXT: andi a5, a4, 63
-; RV32IBT-NEXT: addi a6, zero, 31
-; RV32IBT-NEXT: sub t0, a6, a5
-; RV32IBT-NEXT: slli a2, a3, 1
-; RV32IBT-NEXT: sll a2, a2, t0
-; RV32IBT-NEXT: or t0, a7, a2
-; RV32IBT-NEXT: addi a7, a5, -32
-; RV32IBT-NEXT: srl a5, a3, a7
-; RV32IBT-NEXT: slti a2, a7, 0
-; RV32IBT-NEXT: cmov t1, a2, t0, a5
-; RV32IBT-NEXT: not t0, a4
-; RV32IBT-NEXT: slli t4, a0, 1
-; RV32IBT-NEXT: sll t2, t4, t0
-; RV32IBT-NEXT: andi a2, t0, 63
-; RV32IBT-NEXT: addi t3, a2, -32
-; RV32IBT-NEXT: srai a5, t3, 31
-; RV32IBT-NEXT: and a5, a5, t2
-; RV32IBT-NEXT: or t1, a5, t1
-; RV32IBT-NEXT: lui a5, 524288
-; RV32IBT-NEXT: addi a5, a5, -1
-; RV32IBT-NEXT: and a5, a0, a5
-; RV32IBT-NEXT: sub a2, a6, a2
-; RV32IBT-NEXT: srl a2, a5, a2
-; RV32IBT-NEXT: fsri a0, a0, a1, 31
-; RV32IBT-NEXT: sll a0, a0, t0
-; RV32IBT-NEXT: or a0, a0, a2
-; RV32IBT-NEXT: sll a1, t4, t3
-; RV32IBT-NEXT: slti a2, t3, 0
-; RV32IBT-NEXT: cmov a0, a2, a0, a1
-; RV32IBT-NEXT: srl a1, a3, a4
-; RV32IBT-NEXT: srai a2, a7, 31
-; RV32IBT-NEXT: and a1, a2, a1
-; RV32IBT-NEXT: or a1, a0, a1
-; RV32IBT-NEXT: mv a0, t1
-; RV32IBT-NEXT: ret
+; RV32B-LABEL: fshr_i64:
+; RV32B: # %bb.0:
+; RV32B-NEXT: srl a7, a2, a4
+; RV32B-NEXT: andi a5, a4, 63
+; RV32B-NEXT: addi a6, zero, 31
+; RV32B-NEXT: sub t0, a6, a5
+; RV32B-NEXT: slli a2, a3, 1
+; RV32B-NEXT: sll a2, a2, t0
+; RV32B-NEXT: or a7, a7, a2
+; RV32B-NEXT: addi t2, a5, -32
+; RV32B-NEXT: srl t0, a3, t2
+; RV32B-NEXT: slti a2, t2, 0
+; RV32B-NEXT: cmov a7, a2, a7, t0
+; RV32B-NEXT: not t3, a4
+; RV32B-NEXT: slli t0, a0, 1
+; RV32B-NEXT: sll t1, t0, t3
+; RV32B-NEXT: addi a5, zero, 63
+; RV32B-NEXT: andn t4, a5, a4
+; RV32B-NEXT: addi a2, t4, -32
+; RV32B-NEXT: srai a5, a2, 31
+; RV32B-NEXT: and a5, a5, t1
+; RV32B-NEXT: or a7, a5, a7
+; RV32B-NEXT: fsri a1, a0, a1, 31
+; RV32B-NEXT: sll a1, a1, t3
+; RV32B-NEXT: sub a5, a6, t4
+; RV32B-NEXT: bclri a0, a0, 31
+; RV32B-NEXT: srl a0, a0, a5
+; RV32B-NEXT: or a0, a1, a0
+; RV32B-NEXT: sll a1, t0, a2
+; RV32B-NEXT: slti a2, a2, 0
+; RV32B-NEXT: cmov a0, a2, a0, a1
+; RV32B-NEXT: srl a1, a3, a4
+; RV32B-NEXT: srai a2, t2, 31
+; RV32B-NEXT: and a1, a2, a1
+; RV32B-NEXT: or a1, a0, a1
+; RV32B-NEXT: mv a0, a7
+; RV32B-NEXT: ret
+;
+; RV32ZBT-LABEL: fshr_i64:
+; RV32ZBT: # %bb.0:
+; RV32ZBT-NEXT: srl a7, a2, a4
+; RV32ZBT-NEXT: andi a5, a4, 63
+; RV32ZBT-NEXT: addi a6, zero, 31
+; RV32ZBT-NEXT: sub t0, a6, a5
+; RV32ZBT-NEXT: slli a2, a3, 1
+; RV32ZBT-NEXT: sll a2, a2, t0
+; RV32ZBT-NEXT: or a7, a7, a2
+; RV32ZBT-NEXT: addi t2, a5, -32
+; RV32ZBT-NEXT: srl t0, a3, t2
+; RV32ZBT-NEXT: slti a2, t2, 0
+; RV32ZBT-NEXT: cmov a7, a2, a7, t0
+; RV32ZBT-NEXT: not t4, a4
+; RV32ZBT-NEXT: slli t0, a0, 1
+; RV32ZBT-NEXT: sll t1, t0, t4
+; RV32ZBT-NEXT: andi t3, t4, 63
+; RV32ZBT-NEXT: addi a5, t3, -32
+; RV32ZBT-NEXT: srai a2, a5, 31
+; RV32ZBT-NEXT: and a2, a2, t1
+; RV32ZBT-NEXT: or a7, a2, a7
+; RV32ZBT-NEXT: lui a2, 524288
+; RV32ZBT-NEXT: addi a2, a2, -1
+; RV32ZBT-NEXT: and t1, a0, a2
+; RV32ZBT-NEXT: sub a2, a6, t3
+; RV32ZBT-NEXT: srl a2, t1, a2
+; RV32ZBT-NEXT: fsri a0, a0, a1, 31
+; RV32ZBT-NEXT: sll a0, a0, t4
+; RV32ZBT-NEXT: or a0, a0, a2
+; RV32ZBT-NEXT: sll a1, t0, a5
+; RV32ZBT-NEXT: slti a2, a5, 0
+; RV32ZBT-NEXT: cmov a0, a2, a0, a1
+; RV32ZBT-NEXT: srl a1, a3, a4
+; RV32ZBT-NEXT: srai a2, t2, 31
+; RV32ZBT-NEXT: and a1, a2, a1
+; RV32ZBT-NEXT: or a1, a0, a1
+; RV32ZBT-NEXT: mv a0, a7
+; RV32ZBT-NEXT: ret
%1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c)
ret i64 %1
}
; LMULMAX1-RV32-LABEL: bitreverse_v8i32:
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: addi a6, a0, 16
-; LMULMAX1-RV32-NEXT: vle32.v v25, (a6)
+; LMULMAX1-RV32-NEXT: addi a7, a0, 16
+; LMULMAX1-RV32-NEXT: vle32.v v25, (a7)
; LMULMAX1-RV32-NEXT: vle32.v v26, (a0)
; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 8
; LMULMAX1-RV32-NEXT: lui a2, 16
-; LMULMAX1-RV32-NEXT: addi a7, a2, -256
-; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a7
+; LMULMAX1-RV32-NEXT: addi t0, a2, -256
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, t0
; LMULMAX1-RV32-NEXT: vsrl.vi v28, v25, 24
; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28
; LMULMAX1-RV32-NEXT: vsll.vi v28, v25, 8
-; LMULMAX1-RV32-NEXT: lui t0, 4080
-; LMULMAX1-RV32-NEXT: vand.vx v28, v28, t0
+; LMULMAX1-RV32-NEXT: lui a6, 4080
+; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a6
; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 24
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5
; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT: lui a1, 209715
-; LMULMAX1-RV32-NEXT: addi a1, a1, 819
-; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a1
+; LMULMAX1-RV32-NEXT: lui a3, 209715
+; LMULMAX1-RV32-NEXT: addi a3, a3, 819
+; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a3
; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT: lui a2, 838861
-; LMULMAX1-RV32-NEXT: addi a2, a2, -820
-; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a2
+; LMULMAX1-RV32-NEXT: lui a1, 838861
+; LMULMAX1-RV32-NEXT: addi a1, a1, -820
+; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1
; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT: lui a3, 349525
-; LMULMAX1-RV32-NEXT: addi a3, a3, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a3
+; LMULMAX1-RV32-NEXT: lui a2, 349525
+; LMULMAX1-RV32-NEXT: addi a2, a2, 1365
+; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a2
; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27
; LMULMAX1-RV32-NEXT: lui a4, 699051
; LMULMAX1-RV32-NEXT: addi a4, a4, -1366
; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 8
-; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a7
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, t0
; LMULMAX1-RV32-NEXT: vsrl.vi v28, v26, 24
; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28
; LMULMAX1-RV32-NEXT: vsll.vi v28, v26, 8
-; LMULMAX1-RV32-NEXT: vand.vx v28, v28, t0
+; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a6
; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 24
; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v28
; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5
; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 4
; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a1
+; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a3
; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a2
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 2
; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a3
+; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a2
; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27
; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a4
; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 1
; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
; LMULMAX1-RV32-NEXT: vse32.v v26, (a0)
-; LMULMAX1-RV32-NEXT: vse32.v v25, (a6)
+; LMULMAX1-RV32-NEXT: vse32.v v25, (a7)
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64-LABEL: bitreverse_v8i32:
; LMULMAX1-RV64: # %bb.0:
; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV64-NEXT: addi a6, a0, 16
-; LMULMAX1-RV64-NEXT: vle32.v v25, (a6)
+; LMULMAX1-RV64-NEXT: addi a7, a0, 16
+; LMULMAX1-RV64-NEXT: vle32.v v25, (a7)
; LMULMAX1-RV64-NEXT: vle32.v v26, (a0)
; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 8
; LMULMAX1-RV64-NEXT: lui a2, 16
-; LMULMAX1-RV64-NEXT: addiw a2, a2, -256
-; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2
+; LMULMAX1-RV64-NEXT: addiw t0, a2, -256
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t0
; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24
; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28
; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8
-; LMULMAX1-RV64-NEXT: lui a7, 4080
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT: lui a6, 4080
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6
; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 24
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV64-NEXT: lui a4, 61681
-; LMULMAX1-RV64-NEXT: addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4
+; LMULMAX1-RV64-NEXT: addiw t1, a4, -241
+; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t1
; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
; LMULMAX1-RV64-NEXT: lui a5, 241
; LMULMAX1-RV64-NEXT: addiw a5, a5, -241
; LMULMAX1-RV64-NEXT: slli a5, a5, 12
-; LMULMAX1-RV64-NEXT: addi t0, a5, 240
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t0
+; LMULMAX1-RV64-NEXT: addi a5, a5, 240
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5
; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: lui a1, 209715
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1
+; LMULMAX1-RV64-NEXT: lui a3, 209715
+; LMULMAX1-RV64-NEXT: addiw a3, a3, 819
+; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a3
; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: lui a3, 205
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -819
-; LMULMAX1-RV64-NEXT: slli a3, a3, 12
-; LMULMAX1-RV64-NEXT: addi t1, a3, -820
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t1
+; LMULMAX1-RV64-NEXT: lui a1, 205
+; LMULMAX1-RV64-NEXT: addiw a1, a1, -819
+; LMULMAX1-RV64-NEXT: slli a1, a1, 12
+; LMULMAX1-RV64-NEXT: addi a1, a1, -820
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: lui a5, 349525
-; LMULMAX1-RV64-NEXT: addiw a5, a5, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a5
+; LMULMAX1-RV64-NEXT: lui a2, 349525
+; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365
+; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a2
; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT: lui a3, 171
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -1365
-; LMULMAX1-RV64-NEXT: slli a3, a3, 12
-; LMULMAX1-RV64-NEXT: addi a3, a3, -1366
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3
+; LMULMAX1-RV64-NEXT: lui a4, 171
+; LMULMAX1-RV64-NEXT: addiw a4, a4, -1365
+; LMULMAX1-RV64-NEXT: slli a4, a4, 12
+; LMULMAX1-RV64-NEXT: addi a4, a4, -1366
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4
; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 8
-; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t0
; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24
; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28
; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6
; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 24
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4
+; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t1
; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t0
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5
; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1
+; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a3
; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t1
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a5
+; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a2
; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4
; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
; LMULMAX1-RV64-NEXT: vse32.v v26, (a0)
-; LMULMAX1-RV64-NEXT: vse32.v v25, (a6)
+; LMULMAX1-RV64-NEXT: vse32.v v25, (a7)
; LMULMAX1-RV64-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %x
%b = load <8 x i32>, <8 x i32>* %y
; LMULMAX1-RV32-LABEL: bitreverse_v4i64:
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: addi a6, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v30, (a6)
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vle64.v v30, (a1)
; LMULMAX1-RV32-NEXT: vle64.v v25, (a0)
; LMULMAX1-RV32-NEXT: addi a2, zero, 56
; LMULMAX1-RV32-NEXT: vsrl.vx v26, v30, a2
; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4
; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v26
; LMULMAX1-RV32-NEXT: vsrl.vi v26, v30, 24
-; LMULMAX1-RV32-NEXT: lui a5, 4080
-; LMULMAX1-RV32-NEXT: vand.vx v28, v26, a5
-; LMULMAX1-RV32-NEXT: addi a1, zero, 5
+; LMULMAX1-RV32-NEXT: lui a6, 4080
+; LMULMAX1-RV32-NEXT: vand.vx v28, v26, a6
+; LMULMAX1-RV32-NEXT: addi a5, zero, 5
; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.s.x v0, a1
+; LMULMAX1-RV32-NEXT: vmv.s.x v0, a5
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX1-RV32-NEXT: vmv.v.i v26, 0
-; LMULMAX1-RV32-NEXT: lui a1, 1044480
-; LMULMAX1-RV32-NEXT: vmerge.vxm v26, v26, a1, v0
+; LMULMAX1-RV32-NEXT: lui a5, 1044480
+; LMULMAX1-RV32-NEXT: vmerge.vxm v26, v26, a5, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vsrl.vi v29, v30, 8
; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v26
; LMULMAX1-RV32-NEXT: vor.vv v28, v29, v28
; LMULMAX1-RV32-NEXT: vor.vv v31, v28, v27
-; LMULMAX1-RV32-NEXT: addi a1, zero, 255
+; LMULMAX1-RV32-NEXT: addi a5, zero, 255
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v27, a5
; LMULMAX1-RV32-NEXT: vmerge.vim v27, v27, 0, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vsll.vi v28, v30, 8
; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v29
; LMULMAX1-RV32-NEXT: vsll.vx v9, v30, a3
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v29, a5
+; LMULMAX1-RV32-NEXT: vmv.v.x v29, a6
; LMULMAX1-RV32-NEXT: vmerge.vim v29, v29, 0, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v29
; LMULMAX1-RV32-NEXT: vor.vv v30, v30, v9
; LMULMAX1-RV32-NEXT: vor.vv v30, v30, v8
; LMULMAX1-RV32-NEXT: vor.vv v31, v30, v31
-; LMULMAX1-RV32-NEXT: lui a1, 61681
-; LMULMAX1-RV32-NEXT: addi a1, a1, -241
+; LMULMAX1-RV32-NEXT: lui a5, 61681
+; LMULMAX1-RV32-NEXT: addi a5, a5, -241
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v30, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v30, a5
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vand.vv v8, v31, v30
; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 4
-; LMULMAX1-RV32-NEXT: lui a1, 986895
-; LMULMAX1-RV32-NEXT: addi a1, a1, 240
+; LMULMAX1-RV32-NEXT: lui a5, 986895
+; LMULMAX1-RV32-NEXT: addi a5, a5, 240
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v9, a5
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v9
; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 4
; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v8
-; LMULMAX1-RV32-NEXT: lui a1, 209715
-; LMULMAX1-RV32-NEXT: addi a1, a1, 819
+; LMULMAX1-RV32-NEXT: lui a5, 209715
+; LMULMAX1-RV32-NEXT: addi a5, a5, 819
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v8, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v8, a5
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vand.vv v10, v31, v8
; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 2
-; LMULMAX1-RV32-NEXT: lui a1, 838861
-; LMULMAX1-RV32-NEXT: addi a1, a1, -820
+; LMULMAX1-RV32-NEXT: lui a5, 838861
+; LMULMAX1-RV32-NEXT: addi a5, a5, -820
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v11, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v11, a5
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v11
; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 2
; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v10
-; LMULMAX1-RV32-NEXT: lui a1, 349525
-; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX1-RV32-NEXT: lui a5, 349525
+; LMULMAX1-RV32-NEXT: addi a5, a5, 1365
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v10, a5
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vand.vv v12, v31, v10
; LMULMAX1-RV32-NEXT: vadd.vv v12, v12, v12
-; LMULMAX1-RV32-NEXT: lui a1, 699051
-; LMULMAX1-RV32-NEXT: addi a1, a1, -1366
+; LMULMAX1-RV32-NEXT: lui a5, 699051
+; LMULMAX1-RV32-NEXT: addi a5, a5, -1366
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v13, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v13, a5
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v13
; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 1
; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a4
; LMULMAX1-RV32-NEXT: vor.vv v12, v14, v12
; LMULMAX1-RV32-NEXT: vsrl.vi v14, v25, 24
-; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a5
+; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a6
; LMULMAX1-RV32-NEXT: vsrl.vi v15, v25, 8
; LMULMAX1-RV32-NEXT: vand.vv v26, v15, v26
; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v14
; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
; LMULMAX1-RV32-NEXT: vse64.v v25, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v31, (a6)
+; LMULMAX1-RV32-NEXT: vse64.v v31, (a1)
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64-LABEL: bitreverse_v4i64:
; LMULMAX1-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
; LMULMAX1-RV64-NEXT: .cfi_offset s0, -8
; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV64-NEXT: addi a6, a0, 16
-; LMULMAX1-RV64-NEXT: vle64.v v26, (a6)
+; LMULMAX1-RV64-NEXT: addi t1, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v26, (t1)
; LMULMAX1-RV64-NEXT: vle64.v v25, (a0)
-; LMULMAX1-RV64-NEXT: addi t0, zero, 56
-; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, t0
-; LMULMAX1-RV64-NEXT: addi t1, zero, 40
-; LMULMAX1-RV64-NEXT: vsrl.vx v28, v26, t1
+; LMULMAX1-RV64-NEXT: addi a7, zero, 56
+; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, a7
+; LMULMAX1-RV64-NEXT: addi t0, zero, 40
+; LMULMAX1-RV64-NEXT: vsrl.vx v28, v26, t0
; LMULMAX1-RV64-NEXT: lui a1, 16
-; LMULMAX1-RV64-NEXT: addiw t4, a1, -256
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4
+; LMULMAX1-RV64-NEXT: addiw t2, a1, -256
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2
; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24
-; LMULMAX1-RV64-NEXT: lui a7, 4080
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT: lui a6, 4080
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6
; LMULMAX1-RV64-NEXT: vsrl.vi v29, v26, 8
-; LMULMAX1-RV64-NEXT: addi a3, zero, 255
-; LMULMAX1-RV64-NEXT: slli a1, a3, 24
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a1
+; LMULMAX1-RV64-NEXT: addi a1, zero, 255
+; LMULMAX1-RV64-NEXT: slli t3, a1, 24
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t3
; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8
-; LMULMAX1-RV64-NEXT: slli a5, a3, 32
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5
+; LMULMAX1-RV64-NEXT: slli t4, a1, 32
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4
; LMULMAX1-RV64-NEXT: vsll.vi v29, v26, 24
-; LMULMAX1-RV64-NEXT: slli a2, a3, 40
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2
+; LMULMAX1-RV64-NEXT: slli t5, a1, 40
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t5
; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
-; LMULMAX1-RV64-NEXT: vsll.vx v29, v26, t0
-; LMULMAX1-RV64-NEXT: vsll.vx v26, v26, t1
-; LMULMAX1-RV64-NEXT: slli a3, a3, 48
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3
+; LMULMAX1-RV64-NEXT: vsll.vx v29, v26, a7
+; LMULMAX1-RV64-NEXT: vsll.vx v26, v26, t0
+; LMULMAX1-RV64-NEXT: slli t6, a1, 48
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t6
; LMULMAX1-RV64-NEXT: vor.vv v26, v29, v26
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
; LMULMAX1-RV64-NEXT: addi a4, a4, 241
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi t2, a4, -241
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t2
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT: lui a4, 1044721
-; LMULMAX1-RV64-NEXT: addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 241
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
; LMULMAX1-RV64-NEXT: addi a4, a4, -241
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi t3, a4, 240
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t3
+; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4
+; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
+; LMULMAX1-RV64-NEXT: lui a5, 1044721
+; LMULMAX1-RV64-NEXT: addiw a5, a5, -241
+; LMULMAX1-RV64-NEXT: slli a5, a5, 12
+; LMULMAX1-RV64-NEXT: addi a5, a5, 241
+; LMULMAX1-RV64-NEXT: slli a5, a5, 12
+; LMULMAX1-RV64-NEXT: addi a5, a5, -241
+; LMULMAX1-RV64-NEXT: slli a5, a5, 12
+; LMULMAX1-RV64-NEXT: addi a5, a5, 240
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5
; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: lui a4, 13107
-; LMULMAX1-RV64-NEXT: addiw a4, a4, 819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi t5, a4, 819
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t5
+; LMULMAX1-RV64-NEXT: lui a2, 13107
+; LMULMAX1-RV64-NEXT: addiw a2, a2, 819
+; LMULMAX1-RV64-NEXT: slli a2, a2, 12
+; LMULMAX1-RV64-NEXT: addi a2, a2, 819
+; LMULMAX1-RV64-NEXT: slli a2, a2, 12
+; LMULMAX1-RV64-NEXT: addi a2, a2, 819
+; LMULMAX1-RV64-NEXT: slli a2, a2, 12
+; LMULMAX1-RV64-NEXT: addi a2, a2, 819
+; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a2
; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: lui a4, 1035469
-; LMULMAX1-RV64-NEXT: addiw a4, a4, -819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, -819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, -819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi t6, a4, -820
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t6
+; LMULMAX1-RV64-NEXT: lui a3, 1035469
+; LMULMAX1-RV64-NEXT: addiw a3, a3, -819
+; LMULMAX1-RV64-NEXT: slli a3, a3, 12
+; LMULMAX1-RV64-NEXT: addi a3, a3, -819
+; LMULMAX1-RV64-NEXT: slli a3, a3, 12
+; LMULMAX1-RV64-NEXT: addi a3, a3, -819
+; LMULMAX1-RV64-NEXT: slli a3, a3, 12
+; LMULMAX1-RV64-NEXT: addi a3, a3, -820
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3
; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: lui a4, 21845
-; LMULMAX1-RV64-NEXT: addiw a4, a4, 1365
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 1365
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 1365
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4
+; LMULMAX1-RV64-NEXT: lui a1, 21845
+; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365
+; LMULMAX1-RV64-NEXT: slli a1, a1, 12
+; LMULMAX1-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX1-RV64-NEXT: slli a1, a1, 12
+; LMULMAX1-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX1-RV64-NEXT: slli a1, a1, 12
+; LMULMAX1-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1
; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
; LMULMAX1-RV64-NEXT: lui s0, 1026731
; LMULMAX1-RV64-NEXT: addiw s0, s0, -1365
; LMULMAX1-RV64-NEXT: vand.vx v26, v26, s0
; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, t0
-; LMULMAX1-RV64-NEXT: vsrl.vx v28, v25, t1
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4
+; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, a7
+; LMULMAX1-RV64-NEXT: vsrl.vx v28, v25, t0
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2
; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6
; LMULMAX1-RV64-NEXT: vsrl.vi v29, v25, 8
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a1
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t3
; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4
; LMULMAX1-RV64-NEXT: vsll.vi v29, v25, 24
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t5
; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
-; LMULMAX1-RV64-NEXT: vsll.vx v29, v25, t0
-; LMULMAX1-RV64-NEXT: vsll.vx v25, v25, t1
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3
+; LMULMAX1-RV64-NEXT: vsll.vx v29, v25, a7
+; LMULMAX1-RV64-NEXT: vsll.vx v25, v25, t0
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t6
; LMULMAX1-RV64-NEXT: vor.vv v25, v29, v25
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t2
+; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4
; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t3
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5
; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t5
+; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a2
; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t6
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3
; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4
+; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1
; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, s0
; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV64-NEXT: vse64.v v25, (a0)
-; LMULMAX1-RV64-NEXT: vse64.v v26, (a6)
+; LMULMAX1-RV64-NEXT: vse64.v v26, (t1)
; LMULMAX1-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
; LMULMAX1-RV64-NEXT: addi sp, sp, 16
; LMULMAX1-RV64-NEXT: ret
; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25
; LMULMAX2-RV32-NEXT: srli a2, a1, 8
; LMULMAX2-RV32-NEXT: lui a3, 16
-; LMULMAX2-RV32-NEXT: addi a6, a3, -256
-; LMULMAX2-RV32-NEXT: and a2, a2, a6
+; LMULMAX2-RV32-NEXT: addi a3, a3, -256
+; LMULMAX2-RV32-NEXT: and a2, a2, a3
; LMULMAX2-RV32-NEXT: srli a4, a1, 24
; LMULMAX2-RV32-NEXT: or a2, a2, a4
; LMULMAX2-RV32-NEXT: slli a4, a1, 8
-; LMULMAX2-RV32-NEXT: lui a5, 4080
-; LMULMAX2-RV32-NEXT: and a4, a4, a5
+; LMULMAX2-RV32-NEXT: lui a6, 4080
+; LMULMAX2-RV32-NEXT: and a4, a4, a6
; LMULMAX2-RV32-NEXT: slli a1, a1, 24
; LMULMAX2-RV32-NEXT: or a1, a1, a4
; LMULMAX2-RV32-NEXT: or a1, a1, a2
; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 1
; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26
; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: and a2, a2, a6
+; LMULMAX2-RV32-NEXT: and a2, a2, a3
; LMULMAX2-RV32-NEXT: srli a4, a1, 24
; LMULMAX2-RV32-NEXT: or a2, a2, a4
; LMULMAX2-RV32-NEXT: slli a4, a1, 8
-; LMULMAX2-RV32-NEXT: and a4, a4, a5
+; LMULMAX2-RV32-NEXT: and a4, a4, a6
; LMULMAX2-RV32-NEXT: slli a1, a1, 24
; LMULMAX2-RV32-NEXT: or a1, a1, a4
; LMULMAX2-RV32-NEXT: or a1, a1, a2
; LMULMAX2-RV32-NEXT: vsrl.vx v25, v25, a1
; LMULMAX2-RV32-NEXT: vmv.x.s a2, v25
; LMULMAX2-RV32-NEXT: srli a4, a2, 8
-; LMULMAX2-RV32-NEXT: and a4, a4, a6
-; LMULMAX2-RV32-NEXT: srli a3, a2, 24
-; LMULMAX2-RV32-NEXT: or a3, a4, a3
-; LMULMAX2-RV32-NEXT: slli a4, a2, 8
-; LMULMAX2-RV32-NEXT: and a4, a4, a5
+; LMULMAX2-RV32-NEXT: and a4, a4, a3
+; LMULMAX2-RV32-NEXT: srli a5, a2, 24
+; LMULMAX2-RV32-NEXT: or a4, a4, a5
+; LMULMAX2-RV32-NEXT: slli a5, a2, 8
+; LMULMAX2-RV32-NEXT: and a5, a5, a6
; LMULMAX2-RV32-NEXT: slli a2, a2, 24
+; LMULMAX2-RV32-NEXT: or a2, a2, a5
; LMULMAX2-RV32-NEXT: or a2, a2, a4
-; LMULMAX2-RV32-NEXT: or a2, a2, a3
; LMULMAX2-RV32-NEXT: sw a2, 16(sp)
; LMULMAX2-RV32-NEXT: vsrl.vx v25, v26, a1
; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25
; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: and a2, a2, a6
+; LMULMAX2-RV32-NEXT: and a2, a2, a3
; LMULMAX2-RV32-NEXT: srli a3, a1, 24
; LMULMAX2-RV32-NEXT: or a2, a2, a3
; LMULMAX2-RV32-NEXT: slli a3, a1, 8
-; LMULMAX2-RV32-NEXT: and a3, a3, a5
+; LMULMAX2-RV32-NEXT: and a3, a3, a6
; LMULMAX2-RV32-NEXT: slli a1, a1, 24
; LMULMAX2-RV32-NEXT: or a1, a1, a3
; LMULMAX2-RV32-NEXT: or a1, a1, a2
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25
; LMULMAX1-RV32-NEXT: srli a2, a1, 8
; LMULMAX1-RV32-NEXT: lui a3, 16
-; LMULMAX1-RV32-NEXT: addi a6, a3, -256
-; LMULMAX1-RV32-NEXT: and a2, a2, a6
+; LMULMAX1-RV32-NEXT: addi a3, a3, -256
+; LMULMAX1-RV32-NEXT: and a2, a2, a3
; LMULMAX1-RV32-NEXT: srli a4, a1, 24
; LMULMAX1-RV32-NEXT: or a2, a2, a4
; LMULMAX1-RV32-NEXT: slli a4, a1, 8
-; LMULMAX1-RV32-NEXT: lui a5, 4080
-; LMULMAX1-RV32-NEXT: and a4, a4, a5
+; LMULMAX1-RV32-NEXT: lui a6, 4080
+; LMULMAX1-RV32-NEXT: and a4, a4, a6
; LMULMAX1-RV32-NEXT: slli a1, a1, 24
; LMULMAX1-RV32-NEXT: or a1, a1, a4
; LMULMAX1-RV32-NEXT: or a1, a1, a2
; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 1
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: and a2, a2, a6
+; LMULMAX1-RV32-NEXT: and a2, a2, a3
; LMULMAX1-RV32-NEXT: srli a4, a1, 24
; LMULMAX1-RV32-NEXT: or a2, a2, a4
; LMULMAX1-RV32-NEXT: slli a4, a1, 8
-; LMULMAX1-RV32-NEXT: and a4, a4, a5
+; LMULMAX1-RV32-NEXT: and a4, a4, a6
; LMULMAX1-RV32-NEXT: slli a1, a1, 24
; LMULMAX1-RV32-NEXT: or a1, a1, a4
; LMULMAX1-RV32-NEXT: or a1, a1, a2
; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a1
; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25
; LMULMAX1-RV32-NEXT: srli a4, a2, 8
-; LMULMAX1-RV32-NEXT: and a4, a4, a6
-; LMULMAX1-RV32-NEXT: srli a3, a2, 24
-; LMULMAX1-RV32-NEXT: or a3, a4, a3
-; LMULMAX1-RV32-NEXT: slli a4, a2, 8
-; LMULMAX1-RV32-NEXT: and a4, a4, a5
+; LMULMAX1-RV32-NEXT: and a4, a4, a3
+; LMULMAX1-RV32-NEXT: srli a5, a2, 24
+; LMULMAX1-RV32-NEXT: or a4, a4, a5
+; LMULMAX1-RV32-NEXT: slli a5, a2, 8
+; LMULMAX1-RV32-NEXT: and a5, a5, a6
; LMULMAX1-RV32-NEXT: slli a2, a2, 24
+; LMULMAX1-RV32-NEXT: or a2, a2, a5
; LMULMAX1-RV32-NEXT: or a2, a2, a4
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
; LMULMAX1-RV32-NEXT: sw a2, 16(sp)
; LMULMAX1-RV32-NEXT: vsrl.vx v25, v26, a1
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25
; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: and a2, a2, a6
+; LMULMAX1-RV32-NEXT: and a2, a2, a3
; LMULMAX1-RV32-NEXT: srli a3, a1, 24
; LMULMAX1-RV32-NEXT: or a2, a2, a3
; LMULMAX1-RV32-NEXT: slli a3, a1, 8
-; LMULMAX1-RV32-NEXT: and a3, a3, a5
+; LMULMAX1-RV32-NEXT: and a3, a3, a6
; LMULMAX1-RV32-NEXT: slli a1, a1, 24
; LMULMAX1-RV32-NEXT: or a1, a1, a3
; LMULMAX1-RV32-NEXT: or a1, a1, a2
; LMULMAX2-RV32-NEXT: vmv.x.s a3, v26
; LMULMAX2-RV32-NEXT: srli a2, a3, 8
; LMULMAX2-RV32-NEXT: lui a1, 16
-; LMULMAX2-RV32-NEXT: addi a6, a1, -256
-; LMULMAX2-RV32-NEXT: and a2, a2, a6
+; LMULMAX2-RV32-NEXT: addi a1, a1, -256
+; LMULMAX2-RV32-NEXT: and a2, a2, a1
; LMULMAX2-RV32-NEXT: srli a4, a3, 24
; LMULMAX2-RV32-NEXT: or a4, a2, a4
; LMULMAX2-RV32-NEXT: slli a5, a3, 8
-; LMULMAX2-RV32-NEXT: lui a2, 4080
-; LMULMAX2-RV32-NEXT: and a5, a5, a2
+; LMULMAX2-RV32-NEXT: lui a6, 4080
+; LMULMAX2-RV32-NEXT: and a5, a5, a6
; LMULMAX2-RV32-NEXT: slli a3, a3, 24
; LMULMAX2-RV32-NEXT: or a3, a3, a5
; LMULMAX2-RV32-NEXT: or a3, a3, a4
; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3
; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28
; LMULMAX2-RV32-NEXT: srli a4, a3, 8
-; LMULMAX2-RV32-NEXT: and a4, a4, a6
+; LMULMAX2-RV32-NEXT: and a4, a4, a1
; LMULMAX2-RV32-NEXT: srli a5, a3, 24
; LMULMAX2-RV32-NEXT: or a4, a4, a5
; LMULMAX2-RV32-NEXT: slli a5, a3, 8
-; LMULMAX2-RV32-NEXT: and a5, a5, a2
+; LMULMAX2-RV32-NEXT: and a5, a5, a6
; LMULMAX2-RV32-NEXT: slli a3, a3, 24
; LMULMAX2-RV32-NEXT: or a3, a3, a5
; LMULMAX2-RV32-NEXT: or a3, a3, a4
; LMULMAX2-RV32-NEXT: vslidedown.vi v30, v26, 2
; LMULMAX2-RV32-NEXT: vmv.x.s a3, v30
; LMULMAX2-RV32-NEXT: srli a4, a3, 8
-; LMULMAX2-RV32-NEXT: and a4, a4, a6
+; LMULMAX2-RV32-NEXT: and a4, a4, a1
; LMULMAX2-RV32-NEXT: srli a5, a3, 24
; LMULMAX2-RV32-NEXT: or a4, a4, a5
; LMULMAX2-RV32-NEXT: slli a5, a3, 8
-; LMULMAX2-RV32-NEXT: and a5, a5, a2
+; LMULMAX2-RV32-NEXT: and a5, a5, a6
; LMULMAX2-RV32-NEXT: slli a3, a3, 24
; LMULMAX2-RV32-NEXT: or a3, a3, a5
; LMULMAX2-RV32-NEXT: or a3, a3, a4
; LMULMAX2-RV32-NEXT: vslidedown.vi v8, v26, 1
; LMULMAX2-RV32-NEXT: vmv.x.s a3, v8
; LMULMAX2-RV32-NEXT: srli a4, a3, 8
-; LMULMAX2-RV32-NEXT: and a4, a4, a6
+; LMULMAX2-RV32-NEXT: and a4, a4, a1
; LMULMAX2-RV32-NEXT: srli a5, a3, 24
; LMULMAX2-RV32-NEXT: or a4, a4, a5
; LMULMAX2-RV32-NEXT: slli a5, a3, 8
-; LMULMAX2-RV32-NEXT: and a5, a5, a2
+; LMULMAX2-RV32-NEXT: and a5, a5, a6
; LMULMAX2-RV32-NEXT: slli a3, a3, 24
; LMULMAX2-RV32-NEXT: or a3, a3, a5
; LMULMAX2-RV32-NEXT: or a3, a3, a4
; LMULMAX2-RV32-NEXT: vsrl.vx v26, v26, a3
; LMULMAX2-RV32-NEXT: vmv.x.s a4, v26
; LMULMAX2-RV32-NEXT: srli a5, a4, 8
-; LMULMAX2-RV32-NEXT: and a5, a5, a6
-; LMULMAX2-RV32-NEXT: srli a1, a4, 24
-; LMULMAX2-RV32-NEXT: or a1, a5, a1
+; LMULMAX2-RV32-NEXT: and a5, a5, a1
+; LMULMAX2-RV32-NEXT: srli a2, a4, 24
+; LMULMAX2-RV32-NEXT: or a2, a5, a2
; LMULMAX2-RV32-NEXT: slli a5, a4, 8
-; LMULMAX2-RV32-NEXT: and a5, a5, a2
+; LMULMAX2-RV32-NEXT: and a5, a5, a6
; LMULMAX2-RV32-NEXT: slli a4, a4, 24
; LMULMAX2-RV32-NEXT: or a4, a4, a5
-; LMULMAX2-RV32-NEXT: or a1, a4, a1
-; LMULMAX2-RV32-NEXT: sw a1, 32(sp)
+; LMULMAX2-RV32-NEXT: or a2, a4, a2
+; LMULMAX2-RV32-NEXT: sw a2, 32(sp)
; LMULMAX2-RV32-NEXT: vsrl.vx v26, v28, a3
-; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT: srli a4, a1, 8
-; LMULMAX2-RV32-NEXT: and a4, a4, a6
-; LMULMAX2-RV32-NEXT: srli a5, a1, 24
+; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26
+; LMULMAX2-RV32-NEXT: srli a4, a2, 8
+; LMULMAX2-RV32-NEXT: and a4, a4, a1
+; LMULMAX2-RV32-NEXT: srli a5, a2, 24
; LMULMAX2-RV32-NEXT: or a4, a4, a5
-; LMULMAX2-RV32-NEXT: slli a5, a1, 8
-; LMULMAX2-RV32-NEXT: and a5, a5, a2
-; LMULMAX2-RV32-NEXT: slli a1, a1, 24
-; LMULMAX2-RV32-NEXT: or a1, a1, a5
-; LMULMAX2-RV32-NEXT: or a1, a1, a4
-; LMULMAX2-RV32-NEXT: sw a1, 56(sp)
+; LMULMAX2-RV32-NEXT: slli a5, a2, 8
+; LMULMAX2-RV32-NEXT: and a5, a5, a6
+; LMULMAX2-RV32-NEXT: slli a2, a2, 24
+; LMULMAX2-RV32-NEXT: or a2, a2, a5
+; LMULMAX2-RV32-NEXT: or a2, a2, a4
+; LMULMAX2-RV32-NEXT: sw a2, 56(sp)
; LMULMAX2-RV32-NEXT: vsrl.vx v26, v30, a3
-; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT: srli a4, a1, 8
-; LMULMAX2-RV32-NEXT: and a4, a4, a6
-; LMULMAX2-RV32-NEXT: srli a5, a1, 24
+; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26
+; LMULMAX2-RV32-NEXT: srli a4, a2, 8
+; LMULMAX2-RV32-NEXT: and a4, a4, a1
+; LMULMAX2-RV32-NEXT: srli a5, a2, 24
; LMULMAX2-RV32-NEXT: or a4, a4, a5
-; LMULMAX2-RV32-NEXT: slli a5, a1, 8
-; LMULMAX2-RV32-NEXT: and a5, a5, a2
-; LMULMAX2-RV32-NEXT: slli a1, a1, 24
-; LMULMAX2-RV32-NEXT: or a1, a1, a5
-; LMULMAX2-RV32-NEXT: or a1, a1, a4
-; LMULMAX2-RV32-NEXT: sw a1, 48(sp)
+; LMULMAX2-RV32-NEXT: slli a5, a2, 8
+; LMULMAX2-RV32-NEXT: and a5, a5, a6
+; LMULMAX2-RV32-NEXT: slli a2, a2, 24
+; LMULMAX2-RV32-NEXT: or a2, a2, a5
+; LMULMAX2-RV32-NEXT: or a2, a2, a4
+; LMULMAX2-RV32-NEXT: sw a2, 48(sp)
; LMULMAX2-RV32-NEXT: vsrl.vx v26, v8, a3
-; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT: srli a3, a1, 8
-; LMULMAX2-RV32-NEXT: and a3, a3, a6
-; LMULMAX2-RV32-NEXT: srli a4, a1, 24
-; LMULMAX2-RV32-NEXT: or a3, a3, a4
-; LMULMAX2-RV32-NEXT: slli a4, a1, 8
-; LMULMAX2-RV32-NEXT: and a2, a4, a2
-; LMULMAX2-RV32-NEXT: slli a1, a1, 24
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26
+; LMULMAX2-RV32-NEXT: srli a3, a2, 8
+; LMULMAX2-RV32-NEXT: and a1, a3, a1
+; LMULMAX2-RV32-NEXT: srli a3, a2, 24
; LMULMAX2-RV32-NEXT: or a1, a1, a3
+; LMULMAX2-RV32-NEXT: slli a3, a2, 8
+; LMULMAX2-RV32-NEXT: and a3, a3, a6
+; LMULMAX2-RV32-NEXT: slli a2, a2, 24
+; LMULMAX2-RV32-NEXT: or a2, a2, a3
+; LMULMAX2-RV32-NEXT: or a1, a2, a1
; LMULMAX2-RV32-NEXT: sw a1, 40(sp)
; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
; LMULMAX2-RV32-NEXT: addi a1, sp, 32
; LMULMAX2-RV64-NEXT: vle16.v v25, (a0)
; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25
; LMULMAX2-RV64-NEXT: lui a1, 16
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -1
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
+; LMULMAX2-RV64-NEXT: addiw a6, a1, -1
+; LMULMAX2-RV64-NEXT: and a2, a2, a6
; LMULMAX2-RV64-NEXT: srli a3, a2, 1
; LMULMAX2-RV64-NEXT: or a2, a2, a3
; LMULMAX2-RV64-NEXT: srli a3, a2, 2
; LMULMAX2-RV64-NEXT: slli a2, a2, 12
; LMULMAX2-RV64-NEXT: addi a2, a2, 1365
; LMULMAX2-RV64-NEXT: slli a2, a2, 12
-; LMULMAX2-RV64-NEXT: addi a6, a2, 1365
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
+; LMULMAX2-RV64-NEXT: addi a7, a2, 1365
+; LMULMAX2-RV64-NEXT: and a4, a4, a7
; LMULMAX2-RV64-NEXT: sub a4, a3, a4
; LMULMAX2-RV64-NEXT: lui a3, 13107
; LMULMAX2-RV64-NEXT: addiw a3, a3, 819
; LMULMAX2-RV64-NEXT: slli a4, a4, 12
; LMULMAX2-RV64-NEXT: addi a4, a4, 241
; LMULMAX2-RV64-NEXT: slli a4, a4, 12
-; LMULMAX2-RV64-NEXT: addi a7, a4, -241
-; LMULMAX2-RV64-NEXT: and a2, a5, a7
+; LMULMAX2-RV64-NEXT: addi a4, a4, -241
+; LMULMAX2-RV64-NEXT: and a1, a5, a4
; LMULMAX2-RV64-NEXT: lui a5, 4112
; LMULMAX2-RV64-NEXT: addiw a5, a5, 257
; LMULMAX2-RV64-NEXT: slli a5, a5, 16
; LMULMAX2-RV64-NEXT: addi a5, a5, 257
; LMULMAX2-RV64-NEXT: slli a5, a5, 16
; LMULMAX2-RV64-NEXT: addi a5, a5, 257
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 16(sp)
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 16(sp)
; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu
; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 7
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 30(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 30(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 6
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 28(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 28(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 5
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 26(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 26(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 4
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 24(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 24(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 22(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 22(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 20(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 20(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25
-; LMULMAX2-RV64-NEXT: and a1, a2, a1
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: or a1, a1, a2
; LMULMAX2-RV64-NEXT: srli a2, a1, 2
; LMULMAX2-RV64-NEXT: or a1, a1, a2
; LMULMAX2-RV64-NEXT: not a1, a1
; LMULMAX2-RV64-NEXT: srli a2, a1, 1
-; LMULMAX2-RV64-NEXT: and a2, a2, a6
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
; LMULMAX2-RV64-NEXT: sub a1, a1, a2
; LMULMAX2-RV64-NEXT: and a2, a1, a3
; LMULMAX2-RV64-NEXT: srli a1, a1, 2
; LMULMAX2-RV64-NEXT: add a1, a2, a1
; LMULMAX2-RV64-NEXT: srli a2, a1, 4
; LMULMAX2-RV64-NEXT: add a1, a1, a2
-; LMULMAX2-RV64-NEXT: and a1, a1, a7
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
; LMULMAX2-RV64-NEXT: mul a1, a1, a5
; LMULMAX2-RV64-NEXT: srli a1, a1, 56
; LMULMAX2-RV64-NEXT: addi a1, a1, -48
; LMULMAX1-RV64-NEXT: vle16.v v25, (a0)
; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25
; LMULMAX1-RV64-NEXT: lui a1, 16
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -1
-; LMULMAX1-RV64-NEXT: and a2, a2, a1
+; LMULMAX1-RV64-NEXT: addiw a6, a1, -1
+; LMULMAX1-RV64-NEXT: and a2, a2, a6
; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: or a2, a2, a3
; LMULMAX1-RV64-NEXT: srli a3, a2, 2
; LMULMAX1-RV64-NEXT: slli a2, a2, 12
; LMULMAX1-RV64-NEXT: addi a2, a2, 1365
; LMULMAX1-RV64-NEXT: slli a2, a2, 12
-; LMULMAX1-RV64-NEXT: addi a6, a2, 1365
-; LMULMAX1-RV64-NEXT: and a4, a4, a6
+; LMULMAX1-RV64-NEXT: addi a7, a2, 1365
+; LMULMAX1-RV64-NEXT: and a4, a4, a7
; LMULMAX1-RV64-NEXT: sub a4, a3, a4
; LMULMAX1-RV64-NEXT: lui a3, 13107
; LMULMAX1-RV64-NEXT: addiw a3, a3, 819
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
; LMULMAX1-RV64-NEXT: addi a4, a4, 241
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a7, a4, -241
-; LMULMAX1-RV64-NEXT: and a2, a5, a7
+; LMULMAX1-RV64-NEXT: addi a4, a4, -241
+; LMULMAX1-RV64-NEXT: and a1, a5, a4
; LMULMAX1-RV64-NEXT: lui a5, 4112
; LMULMAX1-RV64-NEXT: addiw a5, a5, 257
; LMULMAX1-RV64-NEXT: slli a5, a5, 16
; LMULMAX1-RV64-NEXT: addi a5, a5, 257
; LMULMAX1-RV64-NEXT: slli a5, a5, 16
; LMULMAX1-RV64-NEXT: addi a5, a5, 257
-; LMULMAX1-RV64-NEXT: mul a2, a2, a5
-; LMULMAX1-RV64-NEXT: srli a2, a2, 56
-; LMULMAX1-RV64-NEXT: addi a2, a2, -48
-; LMULMAX1-RV64-NEXT: sh a2, 16(sp)
+; LMULMAX1-RV64-NEXT: mul a1, a1, a5
+; LMULMAX1-RV64-NEXT: srli a1, a1, 56
+; LMULMAX1-RV64-NEXT: addi a1, a1, -48
+; LMULMAX1-RV64-NEXT: sh a1, 16(sp)
; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7
-; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX1-RV64-NEXT: and a2, a2, a1
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 2
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 8
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 16
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 32
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: not a2, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: and a4, a4, a6
-; LMULMAX1-RV64-NEXT: sub a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a4, a2, a3
-; LMULMAX1-RV64-NEXT: srli a2, a2, 2
-; LMULMAX1-RV64-NEXT: and a2, a2, a3
-; LMULMAX1-RV64-NEXT: add a2, a4, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: add a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a2, a2, a7
-; LMULMAX1-RV64-NEXT: mul a2, a2, a5
-; LMULMAX1-RV64-NEXT: srli a2, a2, 56
-; LMULMAX1-RV64-NEXT: addi a2, a2, -48
-; LMULMAX1-RV64-NEXT: sh a2, 30(sp)
-; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6
-; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX1-RV64-NEXT: and a2, a2, a1
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 2
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 8
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 16
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 32
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: not a2, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: and a4, a4, a6
-; LMULMAX1-RV64-NEXT: sub a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a4, a2, a3
-; LMULMAX1-RV64-NEXT: srli a2, a2, 2
-; LMULMAX1-RV64-NEXT: and a2, a2, a3
-; LMULMAX1-RV64-NEXT: add a2, a4, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: add a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a2, a2, a7
-; LMULMAX1-RV64-NEXT: mul a2, a2, a5
-; LMULMAX1-RV64-NEXT: srli a2, a2, 56
-; LMULMAX1-RV64-NEXT: addi a2, a2, -48
-; LMULMAX1-RV64-NEXT: sh a2, 28(sp)
-; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5
-; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX1-RV64-NEXT: and a2, a2, a1
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 2
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 8
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 16
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 32
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: not a2, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: and a4, a4, a6
-; LMULMAX1-RV64-NEXT: sub a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a4, a2, a3
-; LMULMAX1-RV64-NEXT: srli a2, a2, 2
-; LMULMAX1-RV64-NEXT: and a2, a2, a3
-; LMULMAX1-RV64-NEXT: add a2, a4, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: add a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a2, a2, a7
-; LMULMAX1-RV64-NEXT: mul a2, a2, a5
-; LMULMAX1-RV64-NEXT: srli a2, a2, 56
-; LMULMAX1-RV64-NEXT: addi a2, a2, -48
-; LMULMAX1-RV64-NEXT: sh a2, 26(sp)
-; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4
-; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX1-RV64-NEXT: and a2, a2, a1
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 2
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 8
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 16
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 32
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: not a2, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: and a4, a4, a6
-; LMULMAX1-RV64-NEXT: sub a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a4, a2, a3
-; LMULMAX1-RV64-NEXT: srli a2, a2, 2
-; LMULMAX1-RV64-NEXT: and a2, a2, a3
-; LMULMAX1-RV64-NEXT: add a2, a4, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: add a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a2, a2, a7
-; LMULMAX1-RV64-NEXT: mul a2, a2, a5
-; LMULMAX1-RV64-NEXT: srli a2, a2, 56
-; LMULMAX1-RV64-NEXT: addi a2, a2, -48
-; LMULMAX1-RV64-NEXT: sh a2, 24(sp)
-; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3
-; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX1-RV64-NEXT: and a2, a2, a1
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 2
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 8
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 16
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 32
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: not a2, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: and a4, a4, a6
-; LMULMAX1-RV64-NEXT: sub a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a4, a2, a3
-; LMULMAX1-RV64-NEXT: srli a2, a2, 2
-; LMULMAX1-RV64-NEXT: and a2, a2, a3
-; LMULMAX1-RV64-NEXT: add a2, a4, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: add a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a2, a2, a7
-; LMULMAX1-RV64-NEXT: mul a2, a2, a5
-; LMULMAX1-RV64-NEXT: srli a2, a2, 56
-; LMULMAX1-RV64-NEXT: addi a2, a2, -48
-; LMULMAX1-RV64-NEXT: sh a2, 22(sp)
-; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2
-; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX1-RV64-NEXT: and a2, a2, a1
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 2
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 8
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 16
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: srli a4, a2, 32
-; LMULMAX1-RV64-NEXT: or a2, a2, a4
-; LMULMAX1-RV64-NEXT: not a2, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 1
-; LMULMAX1-RV64-NEXT: and a4, a4, a6
-; LMULMAX1-RV64-NEXT: sub a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a4, a2, a3
-; LMULMAX1-RV64-NEXT: srli a2, a2, 2
-; LMULMAX1-RV64-NEXT: and a2, a2, a3
-; LMULMAX1-RV64-NEXT: add a2, a4, a2
-; LMULMAX1-RV64-NEXT: srli a4, a2, 4
-; LMULMAX1-RV64-NEXT: add a2, a2, a4
-; LMULMAX1-RV64-NEXT: and a2, a2, a7
-; LMULMAX1-RV64-NEXT: mul a2, a2, a5
-; LMULMAX1-RV64-NEXT: srli a2, a2, 56
-; LMULMAX1-RV64-NEXT: addi a2, a2, -48
-; LMULMAX1-RV64-NEXT: sh a2, 20(sp)
-; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25
-; LMULMAX1-RV64-NEXT: and a1, a2, a1
+; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV64-NEXT: and a1, a1, a6
; LMULMAX1-RV64-NEXT: srli a2, a1, 1
; LMULMAX1-RV64-NEXT: or a1, a1, a2
; LMULMAX1-RV64-NEXT: srli a2, a1, 2
; LMULMAX1-RV64-NEXT: or a1, a1, a2
; LMULMAX1-RV64-NEXT: not a1, a1
; LMULMAX1-RV64-NEXT: srli a2, a1, 1
-; LMULMAX1-RV64-NEXT: and a2, a2, a6
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
; LMULMAX1-RV64-NEXT: sub a1, a1, a2
; LMULMAX1-RV64-NEXT: and a2, a1, a3
; LMULMAX1-RV64-NEXT: srli a1, a1, 2
; LMULMAX1-RV64-NEXT: add a1, a2, a1
; LMULMAX1-RV64-NEXT: srli a2, a1, 4
; LMULMAX1-RV64-NEXT: add a1, a1, a2
-; LMULMAX1-RV64-NEXT: and a1, a1, a7
+; LMULMAX1-RV64-NEXT: and a1, a1, a4
; LMULMAX1-RV64-NEXT: mul a1, a1, a5
; LMULMAX1-RV64-NEXT: srli a1, a1, 56
; LMULMAX1-RV64-NEXT: addi a1, a1, -48
-; LMULMAX1-RV64-NEXT: sh a1, 18(sp)
-; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu
-; LMULMAX1-RV64-NEXT: addi a1, sp, 16
-; LMULMAX1-RV64-NEXT: vle16.v v25, (a1)
-; LMULMAX1-RV64-NEXT: vse16.v v25, (a0)
-; LMULMAX1-RV64-NEXT: addi sp, sp, 32
-; LMULMAX1-RV64-NEXT: ret
- %a = load <8 x i16>, <8 x i16>* %x
- %b = load <8 x i16>, <8 x i16>* %y
- %c = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
- store <8 x i16> %c, <8 x i16>* %x
- ret void
-}
-declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1)
-
-define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; LMULMAX1-RV64-NEXT: sh a1, 30(sp)
+; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6
+; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV64-NEXT: and a1, a1, a6
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 2
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 8
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 16
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 32
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: not a1, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: sub a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a2, a1, a3
+; LMULMAX1-RV64-NEXT: srli a1, a1, 2
+; LMULMAX1-RV64-NEXT: and a1, a1, a3
+; LMULMAX1-RV64-NEXT: add a1, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: add a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a1, a1, a4
+; LMULMAX1-RV64-NEXT: mul a1, a1, a5
+; LMULMAX1-RV64-NEXT: srli a1, a1, 56
+; LMULMAX1-RV64-NEXT: addi a1, a1, -48
+; LMULMAX1-RV64-NEXT: sh a1, 28(sp)
+; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5
+; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV64-NEXT: and a1, a1, a6
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 2
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 8
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 16
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 32
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: not a1, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: sub a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a2, a1, a3
+; LMULMAX1-RV64-NEXT: srli a1, a1, 2
+; LMULMAX1-RV64-NEXT: and a1, a1, a3
+; LMULMAX1-RV64-NEXT: add a1, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: add a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a1, a1, a4
+; LMULMAX1-RV64-NEXT: mul a1, a1, a5
+; LMULMAX1-RV64-NEXT: srli a1, a1, 56
+; LMULMAX1-RV64-NEXT: addi a1, a1, -48
+; LMULMAX1-RV64-NEXT: sh a1, 26(sp)
+; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4
+; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV64-NEXT: and a1, a1, a6
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 2
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 8
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 16
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 32
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: not a1, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: sub a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a2, a1, a3
+; LMULMAX1-RV64-NEXT: srli a1, a1, 2
+; LMULMAX1-RV64-NEXT: and a1, a1, a3
+; LMULMAX1-RV64-NEXT: add a1, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: add a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a1, a1, a4
+; LMULMAX1-RV64-NEXT: mul a1, a1, a5
+; LMULMAX1-RV64-NEXT: srli a1, a1, 56
+; LMULMAX1-RV64-NEXT: addi a1, a1, -48
+; LMULMAX1-RV64-NEXT: sh a1, 24(sp)
+; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3
+; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV64-NEXT: and a1, a1, a6
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 2
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 8
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 16
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 32
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: not a1, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: sub a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a2, a1, a3
+; LMULMAX1-RV64-NEXT: srli a1, a1, 2
+; LMULMAX1-RV64-NEXT: and a1, a1, a3
+; LMULMAX1-RV64-NEXT: add a1, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: add a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a1, a1, a4
+; LMULMAX1-RV64-NEXT: mul a1, a1, a5
+; LMULMAX1-RV64-NEXT: srli a1, a1, 56
+; LMULMAX1-RV64-NEXT: addi a1, a1, -48
+; LMULMAX1-RV64-NEXT: sh a1, 22(sp)
+; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2
+; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV64-NEXT: and a1, a1, a6
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 2
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 8
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 16
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 32
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: not a1, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: sub a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a2, a1, a3
+; LMULMAX1-RV64-NEXT: srli a1, a1, 2
+; LMULMAX1-RV64-NEXT: and a1, a1, a3
+; LMULMAX1-RV64-NEXT: add a1, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: add a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a1, a1, a4
+; LMULMAX1-RV64-NEXT: mul a1, a1, a5
+; LMULMAX1-RV64-NEXT: srli a1, a1, 56
+; LMULMAX1-RV64-NEXT: addi a1, a1, -48
+; LMULMAX1-RV64-NEXT: sh a1, 20(sp)
+; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1
+; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25
+; LMULMAX1-RV64-NEXT: and a1, a1, a6
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 2
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 8
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 16
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: srli a2, a1, 32
+; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: not a1, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 1
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: sub a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a2, a1, a3
+; LMULMAX1-RV64-NEXT: srli a1, a1, 2
+; LMULMAX1-RV64-NEXT: and a1, a1, a3
+; LMULMAX1-RV64-NEXT: add a1, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a1, 4
+; LMULMAX1-RV64-NEXT: add a1, a1, a2
+; LMULMAX1-RV64-NEXT: and a1, a1, a4
+; LMULMAX1-RV64-NEXT: mul a1, a1, a5
+; LMULMAX1-RV64-NEXT: srli a1, a1, 56
+; LMULMAX1-RV64-NEXT: addi a1, a1, -48
+; LMULMAX1-RV64-NEXT: sh a1, 18(sp)
+; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX1-RV64-NEXT: addi a1, sp, 16
+; LMULMAX1-RV64-NEXT: vle16.v v25, (a1)
+; LMULMAX1-RV64-NEXT: vse16.v v25, (a0)
+; LMULMAX1-RV64-NEXT: addi sp, sp, 32
+; LMULMAX1-RV64-NEXT: ret
+ %a = load <8 x i16>, <8 x i16>* %x
+ %b = load <8 x i16>, <8 x i16>* %y
+ %c = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+ store <8 x i16> %c, <8 x i16>* %x
+ ret void
+}
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1)
+
+define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
; LMULMAX2-RV32-LABEL: ctlz_v4i32:
; LMULMAX2-RV32: # %bb.0:
; LMULMAX2-RV32-NEXT: addi sp, sp, -32
; LMULMAX2-RV32-NEXT: vle64.v v25, (a0)
; LMULMAX2-RV32-NEXT: sw zero, 28(sp)
; LMULMAX2-RV32-NEXT: sw zero, 20(sp)
-; LMULMAX2-RV32-NEXT: addi a5, zero, 32
+; LMULMAX2-RV32-NEXT: addi a6, zero, 32
; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a5
-; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT: lui a2, 349525
-; LMULMAX2-RV32-NEXT: addi a4, a2, 1365
-; LMULMAX2-RV32-NEXT: lui a2, 209715
-; LMULMAX2-RV32-NEXT: addi a3, a2, 819
-; LMULMAX2-RV32-NEXT: lui a2, 61681
-; LMULMAX2-RV32-NEXT: addi a6, a2, -241
-; LMULMAX2-RV32-NEXT: lui a2, 4112
-; LMULMAX2-RV32-NEXT: addi a7, a2, 257
-; LMULMAX2-RV32-NEXT: bnez a1, .LBB3_2
+; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a6
+; LMULMAX2-RV32-NEXT: vmv.x.s a5, v26
+; LMULMAX2-RV32-NEXT: lui a1, 349525
+; LMULMAX2-RV32-NEXT: addi a4, a1, 1365
+; LMULMAX2-RV32-NEXT: lui a1, 209715
+; LMULMAX2-RV32-NEXT: addi a3, a1, 819
+; LMULMAX2-RV32-NEXT: lui a1, 61681
+; LMULMAX2-RV32-NEXT: addi a7, a1, -241
+; LMULMAX2-RV32-NEXT: lui a1, 4112
+; LMULMAX2-RV32-NEXT: addi a2, a1, 257
+; LMULMAX2-RV32-NEXT: bnez a5, .LBB3_2
; LMULMAX2-RV32-NEXT: # %bb.1:
; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a5, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a5, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
+; LMULMAX2-RV32-NEXT: add a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
; LMULMAX2-RV32-NEXT: srli a1, a1, 24
-; LMULMAX2-RV32-NEXT: addi a1, a1, 32
+; LMULMAX2-RV32-NEXT: addi a5, a1, 32
; LMULMAX2-RV32-NEXT: j .LBB3_3
; LMULMAX2-RV32-NEXT: .LBB3_2:
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a1, a5, 1
+; LMULMAX2-RV32-NEXT: or a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a5, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a5, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
-; LMULMAX2-RV32-NEXT: srli a1, a1, 24
+; LMULMAX2-RV32-NEXT: add a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 24
; LMULMAX2-RV32-NEXT: .LBB3_3:
; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1
-; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a5
-; LMULMAX2-RV32-NEXT: vmv.x.s a5, v26
-; LMULMAX2-RV32-NEXT: sw a1, 16(sp)
-; LMULMAX2-RV32-NEXT: bnez a5, .LBB3_5
+; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a6
+; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV32-NEXT: sw a5, 16(sp)
+; LMULMAX2-RV32-NEXT: bnez a1, .LBB3_5
; LMULMAX2-RV32-NEXT: # %bb.4:
; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a4, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a4
+; LMULMAX2-RV32-NEXT: and a4, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
+; LMULMAX2-RV32-NEXT: add a1, a4, a1
+; LMULMAX2-RV32-NEXT: srli a3, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a3
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
; LMULMAX2-RV32-NEXT: srli a1, a1, 24
; LMULMAX2-RV32-NEXT: addi a1, a1, 32
; LMULMAX2-RV32-NEXT: j .LBB3_6
; LMULMAX2-RV32-NEXT: .LBB3_5:
-; LMULMAX2-RV32-NEXT: srli a1, a5, 1
-; LMULMAX2-RV32-NEXT: or a1, a5, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a4, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a4
+; LMULMAX2-RV32-NEXT: and a4, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
+; LMULMAX2-RV32-NEXT: add a1, a4, a1
+; LMULMAX2-RV32-NEXT: srli a3, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a3
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
; LMULMAX2-RV32-NEXT: srli a1, a1, 24
; LMULMAX2-RV32-NEXT: .LBB3_6:
; LMULMAX2-RV32-NEXT: sw a1, 24(sp)
; LMULMAX1-RV32-NEXT: vle64.v v25, (a0)
; LMULMAX1-RV32-NEXT: sw zero, 28(sp)
; LMULMAX1-RV32-NEXT: sw zero, 20(sp)
-; LMULMAX1-RV32-NEXT: addi a5, zero, 32
+; LMULMAX1-RV32-NEXT: addi a6, zero, 32
; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a5
-; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
-; LMULMAX1-RV32-NEXT: lui a2, 349525
-; LMULMAX1-RV32-NEXT: addi a4, a2, 1365
-; LMULMAX1-RV32-NEXT: lui a2, 209715
-; LMULMAX1-RV32-NEXT: addi a3, a2, 819
-; LMULMAX1-RV32-NEXT: lui a2, 61681
-; LMULMAX1-RV32-NEXT: addi a6, a2, -241
-; LMULMAX1-RV32-NEXT: lui a2, 4112
-; LMULMAX1-RV32-NEXT: addi a7, a2, 257
-; LMULMAX1-RV32-NEXT: bnez a1, .LBB3_2
+; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6
+; LMULMAX1-RV32-NEXT: vmv.x.s a5, v26
+; LMULMAX1-RV32-NEXT: lui a1, 349525
+; LMULMAX1-RV32-NEXT: addi a4, a1, 1365
+; LMULMAX1-RV32-NEXT: lui a1, 209715
+; LMULMAX1-RV32-NEXT: addi a3, a1, 819
+; LMULMAX1-RV32-NEXT: lui a1, 61681
+; LMULMAX1-RV32-NEXT: addi a7, a1, -241
+; LMULMAX1-RV32-NEXT: lui a1, 4112
+; LMULMAX1-RV32-NEXT: addi a2, a1, 257
+; LMULMAX1-RV32-NEXT: bnez a5, .LBB3_2
; LMULMAX1-RV32-NEXT: # %bb.1:
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 2
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 16
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
; LMULMAX1-RV32-NEXT: not a1, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: sub a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a2, a1, a3
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: and a5, a5, a4
+; LMULMAX1-RV32-NEXT: sub a1, a1, a5
+; LMULMAX1-RV32-NEXT: and a5, a1, a3
; LMULMAX1-RV32-NEXT: srli a1, a1, 2
; LMULMAX1-RV32-NEXT: and a1, a1, a3
-; LMULMAX1-RV32-NEXT: add a1, a2, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a6
-; LMULMAX1-RV32-NEXT: mul a1, a1, a7
+; LMULMAX1-RV32-NEXT: add a1, a5, a1
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a5
+; LMULMAX1-RV32-NEXT: and a1, a1, a7
+; LMULMAX1-RV32-NEXT: mul a1, a1, a2
; LMULMAX1-RV32-NEXT: srli a1, a1, 24
-; LMULMAX1-RV32-NEXT: addi a1, a1, 32
+; LMULMAX1-RV32-NEXT: addi a5, a1, 32
; LMULMAX1-RV32-NEXT: j .LBB3_3
; LMULMAX1-RV32-NEXT: .LBB3_2:
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 2
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 16
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a1, a5, 1
+; LMULMAX1-RV32-NEXT: or a1, a5, a1
+; LMULMAX1-RV32-NEXT: srli a5, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
; LMULMAX1-RV32-NEXT: not a1, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: sub a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a2, a1, a3
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: and a5, a5, a4
+; LMULMAX1-RV32-NEXT: sub a1, a1, a5
+; LMULMAX1-RV32-NEXT: and a5, a1, a3
; LMULMAX1-RV32-NEXT: srli a1, a1, 2
; LMULMAX1-RV32-NEXT: and a1, a1, a3
-; LMULMAX1-RV32-NEXT: add a1, a2, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a6
-; LMULMAX1-RV32-NEXT: mul a1, a1, a7
-; LMULMAX1-RV32-NEXT: srli a1, a1, 24
+; LMULMAX1-RV32-NEXT: add a1, a5, a1
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a5
+; LMULMAX1-RV32-NEXT: and a1, a1, a7
+; LMULMAX1-RV32-NEXT: mul a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a5, a1, 24
; LMULMAX1-RV32-NEXT: .LBB3_3:
; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a5
-; LMULMAX1-RV32-NEXT: vmv.x.s a5, v26
-; LMULMAX1-RV32-NEXT: sw a1, 16(sp)
-; LMULMAX1-RV32-NEXT: bnez a5, .LBB3_5
+; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6
+; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV32-NEXT: sw a5, 16(sp)
+; LMULMAX1-RV32-NEXT: bnez a1, .LBB3_5
; LMULMAX1-RV32-NEXT: # %bb.4:
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 2
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 16
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
; LMULMAX1-RV32-NEXT: not a1, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: sub a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a2, a1, a3
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: and a4, a5, a4
+; LMULMAX1-RV32-NEXT: sub a1, a1, a4
+; LMULMAX1-RV32-NEXT: and a4, a1, a3
; LMULMAX1-RV32-NEXT: srli a1, a1, 2
; LMULMAX1-RV32-NEXT: and a1, a1, a3
-; LMULMAX1-RV32-NEXT: add a1, a2, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a6
-; LMULMAX1-RV32-NEXT: mul a1, a1, a7
+; LMULMAX1-RV32-NEXT: add a1, a4, a1
+; LMULMAX1-RV32-NEXT: srli a3, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a3
+; LMULMAX1-RV32-NEXT: and a1, a1, a7
+; LMULMAX1-RV32-NEXT: mul a1, a1, a2
; LMULMAX1-RV32-NEXT: srli a1, a1, 24
; LMULMAX1-RV32-NEXT: addi a1, a1, 32
; LMULMAX1-RV32-NEXT: j .LBB3_6
; LMULMAX1-RV32-NEXT: .LBB3_5:
-; LMULMAX1-RV32-NEXT: srli a1, a5, 1
-; LMULMAX1-RV32-NEXT: or a1, a5, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 2
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 16
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
; LMULMAX1-RV32-NEXT: not a1, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: sub a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a2, a1, a3
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: and a4, a5, a4
+; LMULMAX1-RV32-NEXT: sub a1, a1, a4
+; LMULMAX1-RV32-NEXT: and a4, a1, a3
; LMULMAX1-RV32-NEXT: srli a1, a1, 2
; LMULMAX1-RV32-NEXT: and a1, a1, a3
-; LMULMAX1-RV32-NEXT: add a1, a2, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a6
-; LMULMAX1-RV32-NEXT: mul a1, a1, a7
+; LMULMAX1-RV32-NEXT: add a1, a4, a1
+; LMULMAX1-RV32-NEXT: srli a3, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a3
+; LMULMAX1-RV32-NEXT: and a1, a1, a7
+; LMULMAX1-RV32-NEXT: mul a1, a1, a2
; LMULMAX1-RV32-NEXT: srli a1, a1, 24
; LMULMAX1-RV32-NEXT: .LBB3_6:
; LMULMAX1-RV32-NEXT: sw a1, 24(sp)
; LMULMAX2-RV64-NEXT: vle16.v v26, (a0)
; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
; LMULMAX2-RV64-NEXT: lui a1, 16
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -1
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
+; LMULMAX2-RV64-NEXT: addiw a6, a1, -1
+; LMULMAX2-RV64-NEXT: and a2, a2, a6
; LMULMAX2-RV64-NEXT: srli a3, a2, 1
; LMULMAX2-RV64-NEXT: or a2, a2, a3
; LMULMAX2-RV64-NEXT: srli a3, a2, 2
; LMULMAX2-RV64-NEXT: slli a2, a2, 12
; LMULMAX2-RV64-NEXT: addi a2, a2, 1365
; LMULMAX2-RV64-NEXT: slli a2, a2, 12
-; LMULMAX2-RV64-NEXT: addi a6, a2, 1365
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
+; LMULMAX2-RV64-NEXT: addi a7, a2, 1365
+; LMULMAX2-RV64-NEXT: and a4, a4, a7
; LMULMAX2-RV64-NEXT: sub a4, a3, a4
; LMULMAX2-RV64-NEXT: lui a3, 13107
; LMULMAX2-RV64-NEXT: addiw a3, a3, 819
; LMULMAX2-RV64-NEXT: slli a4, a4, 12
; LMULMAX2-RV64-NEXT: addi a4, a4, 241
; LMULMAX2-RV64-NEXT: slli a4, a4, 12
-; LMULMAX2-RV64-NEXT: addi a7, a4, -241
-; LMULMAX2-RV64-NEXT: and a2, a5, a7
+; LMULMAX2-RV64-NEXT: addi a4, a4, -241
+; LMULMAX2-RV64-NEXT: and a1, a5, a4
; LMULMAX2-RV64-NEXT: lui a5, 4112
; LMULMAX2-RV64-NEXT: addiw a5, a5, 257
; LMULMAX2-RV64-NEXT: slli a5, a5, 16
; LMULMAX2-RV64-NEXT: addi a5, a5, 257
; LMULMAX2-RV64-NEXT: slli a5, a5, 16
; LMULMAX2-RV64-NEXT: addi a5, a5, 257
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 32(sp)
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 32(sp)
; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, m2, ta, mu
; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 62(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 60(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 58(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 56(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 54(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 52(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 50(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 48(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 46(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 44(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 42(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 40(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 38(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addi a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 36(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX2-RV64-NEXT: and a1, a2, a1
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: or a1, a1, a2
; LMULMAX2-RV64-NEXT: srli a2, a1, 2
; LMULMAX2-RV64-NEXT: or a1, a1, a2
; LMULMAX2-RV64-NEXT: not a1, a1
; LMULMAX2-RV64-NEXT: srli a2, a1, 1
-; LMULMAX2-RV64-NEXT: and a2, a2, a6
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
; LMULMAX2-RV64-NEXT: sub a1, a1, a2
; LMULMAX2-RV64-NEXT: and a2, a1, a3
; LMULMAX2-RV64-NEXT: srli a1, a1, 2
; LMULMAX2-RV64-NEXT: add a1, a2, a1
; LMULMAX2-RV64-NEXT: srli a2, a1, 4
; LMULMAX2-RV64-NEXT: add a1, a1, a2
-; LMULMAX2-RV64-NEXT: and a1, a1, a7
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
; LMULMAX2-RV64-NEXT: mul a1, a1, a5
; LMULMAX2-RV64-NEXT: srli a1, a1, 56
; LMULMAX2-RV64-NEXT: addi a1, a1, -48
-; LMULMAX2-RV64-NEXT: sh a1, 34(sp)
-; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu
-; LMULMAX2-RV64-NEXT: addi a1, sp, 32
-; LMULMAX2-RV64-NEXT: vle16.v v26, (a1)
-; LMULMAX2-RV64-NEXT: vse16.v v26, (a0)
-; LMULMAX2-RV64-NEXT: addi sp, s0, -96
-; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT: addi sp, sp, 96
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64-NEXT: sh a1, 62(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 60(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 58(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 56(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 54(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 52(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 50(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 48(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 46(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 44(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 42(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 40(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 38(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 36(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addi a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 34(sp)
+; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; LMULMAX2-RV64-NEXT: addi a1, sp, 32
+; LMULMAX2-RV64-NEXT: vle16.v v26, (a1)
+; LMULMAX2-RV64-NEXT: vse16.v v26, (a0)
+; LMULMAX2-RV64-NEXT: addi sp, s0, -96
+; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT: addi sp, sp, 96
+; LMULMAX2-RV64-NEXT: ret
;
; LMULMAX1-RV32-LABEL: ctlz_v16i16:
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV64-NEXT: vle16.v v25, (a0)
; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26
; LMULMAX1-RV64-NEXT: lui a2, 16
-; LMULMAX1-RV64-NEXT: addiw a2, a2, -1
-; LMULMAX1-RV64-NEXT: and a1, a1, a2
+; LMULMAX1-RV64-NEXT: addiw a7, a2, -1
+; LMULMAX1-RV64-NEXT: and a1, a1, a7
; LMULMAX1-RV64-NEXT: srli a3, a1, 1
; LMULMAX1-RV64-NEXT: or a1, a1, a3
; LMULMAX1-RV64-NEXT: srli a3, a1, 2
; LMULMAX1-RV64-NEXT: slli a3, a3, 12
; LMULMAX1-RV64-NEXT: addi a3, a3, 1365
; LMULMAX1-RV64-NEXT: slli a3, a3, 12
-; LMULMAX1-RV64-NEXT: addi a7, a3, 1365
-; LMULMAX1-RV64-NEXT: and a4, a4, a7
+; LMULMAX1-RV64-NEXT: addi t0, a3, 1365
+; LMULMAX1-RV64-NEXT: and a4, a4, t0
; LMULMAX1-RV64-NEXT: sub a1, a1, a4
; LMULMAX1-RV64-NEXT: lui a4, 13107
; LMULMAX1-RV64-NEXT: addiw a4, a4, 819
; LMULMAX1-RV64-NEXT: slli a5, a5, 12
; LMULMAX1-RV64-NEXT: addi a5, a5, 241
; LMULMAX1-RV64-NEXT: slli a5, a5, 12
-; LMULMAX1-RV64-NEXT: addi t0, a5, -241
-; LMULMAX1-RV64-NEXT: and a3, a1, t0
+; LMULMAX1-RV64-NEXT: addi a5, a5, -241
+; LMULMAX1-RV64-NEXT: and a2, a1, a5
; LMULMAX1-RV64-NEXT: lui a1, 4112
; LMULMAX1-RV64-NEXT: addiw a1, a1, 257
; LMULMAX1-RV64-NEXT: slli a1, a1, 16
; LMULMAX1-RV64-NEXT: addi a1, a1, 257
; LMULMAX1-RV64-NEXT: slli a1, a1, 16
; LMULMAX1-RV64-NEXT: addi a1, a1, 257
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 32(sp)
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 32(sp)
; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu
; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 7
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 46(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 46(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 6
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 44(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 44(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 5
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 42(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 42(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 4
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 40(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 40(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 38(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 38(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 36(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 36(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 34(sp)
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v25
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 34(sp)
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 16(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 16(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 30(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 30(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 28(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 28(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 26(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 26(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 24(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 24(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 22(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 22(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addi a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 20(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addi a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 20(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v25
-; LMULMAX1-RV64-NEXT: and a2, a3, a2
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: or a2, a2, a3
; LMULMAX1-RV64-NEXT: srli a3, a2, 2
; LMULMAX1-RV64-NEXT: or a2, a2, a3
; LMULMAX1-RV64-NEXT: not a2, a2
; LMULMAX1-RV64-NEXT: srli a3, a2, 1
-; LMULMAX1-RV64-NEXT: and a3, a3, a7
+; LMULMAX1-RV64-NEXT: and a3, a3, t0
; LMULMAX1-RV64-NEXT: sub a2, a2, a3
; LMULMAX1-RV64-NEXT: and a3, a2, a4
; LMULMAX1-RV64-NEXT: srli a2, a2, 2
; LMULMAX1-RV64-NEXT: add a2, a3, a2
; LMULMAX1-RV64-NEXT: srli a3, a2, 4
; LMULMAX1-RV64-NEXT: add a2, a2, a3
-; LMULMAX1-RV64-NEXT: and a2, a2, t0
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
; LMULMAX1-RV64-NEXT: mul a1, a2, a1
; LMULMAX1-RV64-NEXT: srli a1, a1, 56
; LMULMAX1-RV64-NEXT: addi a1, a1, -48
; LMULMAX2-RV32-NEXT: sw zero, 52(sp)
; LMULMAX2-RV32-NEXT: sw zero, 44(sp)
; LMULMAX2-RV32-NEXT: sw zero, 36(sp)
-; LMULMAX2-RV32-NEXT: addi a5, zero, 32
+; LMULMAX2-RV32-NEXT: addi a6, zero, 32
; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64, m2, ta, mu
-; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a5
-; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28
-; LMULMAX2-RV32-NEXT: lui a2, 349525
-; LMULMAX2-RV32-NEXT: addi a4, a2, 1365
-; LMULMAX2-RV32-NEXT: lui a2, 209715
-; LMULMAX2-RV32-NEXT: addi a3, a2, 819
-; LMULMAX2-RV32-NEXT: lui a2, 61681
-; LMULMAX2-RV32-NEXT: addi a6, a2, -241
-; LMULMAX2-RV32-NEXT: lui a2, 4112
-; LMULMAX2-RV32-NEXT: addi a7, a2, 257
-; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_2
+; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a6
+; LMULMAX2-RV32-NEXT: vmv.x.s a5, v28
+; LMULMAX2-RV32-NEXT: lui a1, 349525
+; LMULMAX2-RV32-NEXT: addi a4, a1, 1365
+; LMULMAX2-RV32-NEXT: lui a1, 209715
+; LMULMAX2-RV32-NEXT: addi a3, a1, 819
+; LMULMAX2-RV32-NEXT: lui a1, 61681
+; LMULMAX2-RV32-NEXT: addi a7, a1, -241
+; LMULMAX2-RV32-NEXT: lui a1, 4112
+; LMULMAX2-RV32-NEXT: addi a2, a1, 257
+; LMULMAX2-RV32-NEXT: bnez a5, .LBB7_2
; LMULMAX2-RV32-NEXT: # %bb.1:
; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
-; LMULMAX2-RV32-NEXT: srli a1, a1, 2
-; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a5, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a5, a1, a3
+; LMULMAX2-RV32-NEXT: srli a1, a1, 2
+; LMULMAX2-RV32-NEXT: and a1, a1, a3
+; LMULMAX2-RV32-NEXT: add a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
; LMULMAX2-RV32-NEXT: srli a1, a1, 24
-; LMULMAX2-RV32-NEXT: addi a1, a1, 32
+; LMULMAX2-RV32-NEXT: addi a5, a1, 32
; LMULMAX2-RV32-NEXT: j .LBB7_3
; LMULMAX2-RV32-NEXT: .LBB7_2:
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a1, a5, 1
+; LMULMAX2-RV32-NEXT: or a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a5, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a5, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
-; LMULMAX2-RV32-NEXT: srli a1, a1, 24
+; LMULMAX2-RV32-NEXT: add a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 24
; LMULMAX2-RV32-NEXT: .LBB7_3:
; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3
-; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a5
-; LMULMAX2-RV32-NEXT: vmv.x.s a2, v30
-; LMULMAX2-RV32-NEXT: sw a1, 32(sp)
-; LMULMAX2-RV32-NEXT: bnez a2, .LBB7_5
+; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a6
+; LMULMAX2-RV32-NEXT: vmv.x.s a1, v30
+; LMULMAX2-RV32-NEXT: sw a5, 32(sp)
+; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_5
; LMULMAX2-RV32-NEXT: # %bb.4:
; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a5, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a5, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
+; LMULMAX2-RV32-NEXT: add a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
; LMULMAX2-RV32-NEXT: srli a1, a1, 24
-; LMULMAX2-RV32-NEXT: addi a1, a1, 32
+; LMULMAX2-RV32-NEXT: addi a5, a1, 32
; LMULMAX2-RV32-NEXT: j .LBB7_6
; LMULMAX2-RV32-NEXT: .LBB7_5:
-; LMULMAX2-RV32-NEXT: srli a1, a2, 1
-; LMULMAX2-RV32-NEXT: or a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a5, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a5, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
-; LMULMAX2-RV32-NEXT: srli a1, a1, 24
+; LMULMAX2-RV32-NEXT: add a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 24
; LMULMAX2-RV32-NEXT: .LBB7_6:
; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2
-; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a5
-; LMULMAX2-RV32-NEXT: vmv.x.s a2, v30
-; LMULMAX2-RV32-NEXT: sw a1, 56(sp)
-; LMULMAX2-RV32-NEXT: bnez a2, .LBB7_8
+; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a6
+; LMULMAX2-RV32-NEXT: vmv.x.s a1, v30
+; LMULMAX2-RV32-NEXT: sw a5, 56(sp)
+; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_8
; LMULMAX2-RV32-NEXT: # %bb.7:
; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a5, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a5, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
+; LMULMAX2-RV32-NEXT: add a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
; LMULMAX2-RV32-NEXT: srli a1, a1, 24
-; LMULMAX2-RV32-NEXT: addi a1, a1, 32
+; LMULMAX2-RV32-NEXT: addi a5, a1, 32
; LMULMAX2-RV32-NEXT: j .LBB7_9
; LMULMAX2-RV32-NEXT: .LBB7_8:
-; LMULMAX2-RV32-NEXT: srli a1, a2, 1
-; LMULMAX2-RV32-NEXT: or a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a5, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a5, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
-; LMULMAX2-RV32-NEXT: srli a1, a1, 24
+; LMULMAX2-RV32-NEXT: add a1, a5, a1
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a5
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 24
; LMULMAX2-RV32-NEXT: .LBB7_9:
; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1
-; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a5
-; LMULMAX2-RV32-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV32-NEXT: sw a1, 48(sp)
-; LMULMAX2-RV32-NEXT: bnez a2, .LBB7_11
+; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a6
+; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV32-NEXT: sw a5, 48(sp)
+; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_11
; LMULMAX2-RV32-NEXT: # %bb.10:
; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a4, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a4
+; LMULMAX2-RV32-NEXT: and a4, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
+; LMULMAX2-RV32-NEXT: add a1, a4, a1
+; LMULMAX2-RV32-NEXT: srli a3, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a3
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
; LMULMAX2-RV32-NEXT: srli a1, a1, 24
; LMULMAX2-RV32-NEXT: addi a1, a1, 32
; LMULMAX2-RV32-NEXT: j .LBB7_12
; LMULMAX2-RV32-NEXT: .LBB7_11:
-; LMULMAX2-RV32-NEXT: srli a1, a2, 1
-; LMULMAX2-RV32-NEXT: or a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 2
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 8
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
-; LMULMAX2-RV32-NEXT: srli a2, a1, 16
-; LMULMAX2-RV32-NEXT: or a1, a1, a2
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 2
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 4
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 8
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
+; LMULMAX2-RV32-NEXT: srli a5, a1, 16
+; LMULMAX2-RV32-NEXT: or a1, a1, a5
; LMULMAX2-RV32-NEXT: not a1, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 1
-; LMULMAX2-RV32-NEXT: and a2, a2, a4
-; LMULMAX2-RV32-NEXT: sub a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a2, a1, a3
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1
+; LMULMAX2-RV32-NEXT: and a4, a5, a4
+; LMULMAX2-RV32-NEXT: sub a1, a1, a4
+; LMULMAX2-RV32-NEXT: and a4, a1, a3
; LMULMAX2-RV32-NEXT: srli a1, a1, 2
; LMULMAX2-RV32-NEXT: and a1, a1, a3
-; LMULMAX2-RV32-NEXT: add a1, a2, a1
-; LMULMAX2-RV32-NEXT: srli a2, a1, 4
-; LMULMAX2-RV32-NEXT: add a1, a1, a2
-; LMULMAX2-RV32-NEXT: and a1, a1, a6
-; LMULMAX2-RV32-NEXT: mul a1, a1, a7
+; LMULMAX2-RV32-NEXT: add a1, a4, a1
+; LMULMAX2-RV32-NEXT: srli a3, a1, 4
+; LMULMAX2-RV32-NEXT: add a1, a1, a3
+; LMULMAX2-RV32-NEXT: and a1, a1, a7
+; LMULMAX2-RV32-NEXT: mul a1, a1, a2
; LMULMAX2-RV32-NEXT: srli a1, a1, 24
; LMULMAX2-RV32-NEXT: .LBB7_12:
; LMULMAX2-RV32-NEXT: sw a1, 40(sp)
; LMULMAX1-RV32-NEXT: vle64.v v25, (a0)
; LMULMAX1-RV32-NEXT: addi a6, a0, 16
; LMULMAX1-RV32-NEXT: vle64.v v26, (a6)
-; LMULMAX1-RV32-NEXT: sw zero, 44(sp)
-; LMULMAX1-RV32-NEXT: sw zero, 36(sp)
-; LMULMAX1-RV32-NEXT: addi a1, zero, 32
-; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a1
-; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27
-; LMULMAX1-RV32-NEXT: lui a3, 349525
-; LMULMAX1-RV32-NEXT: addi a5, a3, 1365
-; LMULMAX1-RV32-NEXT: lui a3, 209715
-; LMULMAX1-RV32-NEXT: addi a4, a3, 819
-; LMULMAX1-RV32-NEXT: lui a3, 61681
-; LMULMAX1-RV32-NEXT: addi a7, a3, -241
-; LMULMAX1-RV32-NEXT: lui a3, 4112
-; LMULMAX1-RV32-NEXT: addi t0, a3, 257
-; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_2
-; LMULMAX1-RV32-NEXT: # %bb.1:
-; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 2
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 8
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 16
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: not a2, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: and a3, a3, a5
-; LMULMAX1-RV32-NEXT: sub a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a3, a2, a4
-; LMULMAX1-RV32-NEXT: srli a2, a2, 2
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: add a2, a3, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: add a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a2, a2, a7
-; LMULMAX1-RV32-NEXT: mul a2, a2, t0
-; LMULMAX1-RV32-NEXT: srli a2, a2, 24
-; LMULMAX1-RV32-NEXT: addi a2, a2, 32
+; LMULMAX1-RV32-NEXT: sw zero, 44(sp)
+; LMULMAX1-RV32-NEXT: sw zero, 36(sp)
+; LMULMAX1-RV32-NEXT: addi a7, zero, 32
+; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7
+; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27
+; LMULMAX1-RV32-NEXT: lui a2, 349525
+; LMULMAX1-RV32-NEXT: addi a5, a2, 1365
+; LMULMAX1-RV32-NEXT: lui a2, 209715
+; LMULMAX1-RV32-NEXT: addi a4, a2, 819
+; LMULMAX1-RV32-NEXT: lui a2, 61681
+; LMULMAX1-RV32-NEXT: addi t0, a2, -241
+; LMULMAX1-RV32-NEXT: lui a2, 4112
+; LMULMAX1-RV32-NEXT: addi a3, a2, 257
+; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_2
+; LMULMAX1-RV32-NEXT: # %bb.1:
+; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: not a1, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: and a2, a2, a5
+; LMULMAX1-RV32-NEXT: sub a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a2, a1, a4
+; LMULMAX1-RV32-NEXT: srli a1, a1, 2
+; LMULMAX1-RV32-NEXT: and a1, a1, a4
+; LMULMAX1-RV32-NEXT: add a1, a2, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a1, a1, t0
+; LMULMAX1-RV32-NEXT: mul a1, a1, a3
+; LMULMAX1-RV32-NEXT: srli a1, a1, 24
+; LMULMAX1-RV32-NEXT: addi a1, a1, 32
; LMULMAX1-RV32-NEXT: j .LBB7_3
; LMULMAX1-RV32-NEXT: .LBB7_2:
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 2
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 8
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 16
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: not a2, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: and a3, a3, a5
-; LMULMAX1-RV32-NEXT: sub a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a3, a2, a4
-; LMULMAX1-RV32-NEXT: srli a2, a2, 2
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: add a2, a3, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: add a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a2, a2, a7
-; LMULMAX1-RV32-NEXT: mul a2, a2, t0
-; LMULMAX1-RV32-NEXT: srli a2, a2, 24
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: not a1, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: and a2, a2, a5
+; LMULMAX1-RV32-NEXT: sub a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a2, a1, a4
+; LMULMAX1-RV32-NEXT: srli a1, a1, 2
+; LMULMAX1-RV32-NEXT: and a1, a1, a4
+; LMULMAX1-RV32-NEXT: add a1, a2, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a1, a1, t0
+; LMULMAX1-RV32-NEXT: mul a1, a1, a3
+; LMULMAX1-RV32-NEXT: srli a1, a1, 24
; LMULMAX1-RV32-NEXT: .LBB7_3:
; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1
-; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a1
-; LMULMAX1-RV32-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV32-NEXT: sw a2, 32(sp)
-; LMULMAX1-RV32-NEXT: bnez a3, .LBB7_5
+; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7
+; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV32-NEXT: sw a1, 32(sp)
+; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_5
; LMULMAX1-RV32-NEXT: # %bb.4:
-; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 2
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 8
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 16
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: not a2, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: and a3, a3, a5
-; LMULMAX1-RV32-NEXT: sub a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a3, a2, a4
-; LMULMAX1-RV32-NEXT: srli a2, a2, 2
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: add a2, a3, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: add a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a2, a2, a7
-; LMULMAX1-RV32-NEXT: mul a2, a2, t0
-; LMULMAX1-RV32-NEXT: srli a2, a2, 24
-; LMULMAX1-RV32-NEXT: addi a2, a2, 32
+; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: not a1, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: and a2, a2, a5
+; LMULMAX1-RV32-NEXT: sub a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a2, a1, a4
+; LMULMAX1-RV32-NEXT: srli a1, a1, 2
+; LMULMAX1-RV32-NEXT: and a1, a1, a4
+; LMULMAX1-RV32-NEXT: add a1, a2, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a1, a1, t0
+; LMULMAX1-RV32-NEXT: mul a1, a1, a3
+; LMULMAX1-RV32-NEXT: srli a1, a1, 24
+; LMULMAX1-RV32-NEXT: addi a1, a1, 32
; LMULMAX1-RV32-NEXT: j .LBB7_6
; LMULMAX1-RV32-NEXT: .LBB7_5:
-; LMULMAX1-RV32-NEXT: srli a2, a3, 1
-; LMULMAX1-RV32-NEXT: or a2, a3, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 2
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 8
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 16
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: not a2, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: and a3, a3, a5
-; LMULMAX1-RV32-NEXT: sub a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a3, a2, a4
-; LMULMAX1-RV32-NEXT: srli a2, a2, 2
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: add a2, a3, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: add a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a2, a2, a7
-; LMULMAX1-RV32-NEXT: mul a2, a2, t0
-; LMULMAX1-RV32-NEXT: srli a2, a2, 24
+; LMULMAX1-RV32-NEXT: srli a1, a2, 1
+; LMULMAX1-RV32-NEXT: or a1, a2, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: not a1, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: and a2, a2, a5
+; LMULMAX1-RV32-NEXT: sub a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a2, a1, a4
+; LMULMAX1-RV32-NEXT: srli a1, a1, 2
+; LMULMAX1-RV32-NEXT: and a1, a1, a4
+; LMULMAX1-RV32-NEXT: add a1, a2, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a1, a1, t0
+; LMULMAX1-RV32-NEXT: mul a1, a1, a3
+; LMULMAX1-RV32-NEXT: srli a1, a1, 24
; LMULMAX1-RV32-NEXT: .LBB7_6:
-; LMULMAX1-RV32-NEXT: sw a2, 40(sp)
+; LMULMAX1-RV32-NEXT: sw a1, 40(sp)
; LMULMAX1-RV32-NEXT: sw zero, 28(sp)
-; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7
+; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
; LMULMAX1-RV32-NEXT: sw zero, 20(sp)
-; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_8
+; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_8
; LMULMAX1-RV32-NEXT: # %bb.7:
-; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 2
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 8
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 16
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: not a2, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: and a3, a3, a5
-; LMULMAX1-RV32-NEXT: sub a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a3, a2, a4
-; LMULMAX1-RV32-NEXT: srli a2, a2, 2
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: add a2, a3, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: add a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a2, a2, a7
-; LMULMAX1-RV32-NEXT: mul a2, a2, t0
-; LMULMAX1-RV32-NEXT: srli a2, a2, 24
-; LMULMAX1-RV32-NEXT: addi a2, a2, 32
+; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: not a1, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: and a2, a2, a5
+; LMULMAX1-RV32-NEXT: sub a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a2, a1, a4
+; LMULMAX1-RV32-NEXT: srli a1, a1, 2
+; LMULMAX1-RV32-NEXT: and a1, a1, a4
+; LMULMAX1-RV32-NEXT: add a1, a2, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a1, a1, t0
+; LMULMAX1-RV32-NEXT: mul a1, a1, a3
+; LMULMAX1-RV32-NEXT: srli a1, a1, 24
+; LMULMAX1-RV32-NEXT: addi a1, a1, 32
; LMULMAX1-RV32-NEXT: j .LBB7_9
; LMULMAX1-RV32-NEXT: .LBB7_8:
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 2
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 8
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: srli a3, a2, 16
-; LMULMAX1-RV32-NEXT: or a2, a2, a3
-; LMULMAX1-RV32-NEXT: not a2, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 1
-; LMULMAX1-RV32-NEXT: and a3, a3, a5
-; LMULMAX1-RV32-NEXT: sub a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a3, a2, a4
-; LMULMAX1-RV32-NEXT: srli a2, a2, 2
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: add a2, a3, a2
-; LMULMAX1-RV32-NEXT: srli a3, a2, 4
-; LMULMAX1-RV32-NEXT: add a2, a2, a3
-; LMULMAX1-RV32-NEXT: and a2, a2, a7
-; LMULMAX1-RV32-NEXT: mul a2, a2, t0
-; LMULMAX1-RV32-NEXT: srli a2, a2, 24
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a2, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: not a1, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 1
+; LMULMAX1-RV32-NEXT: and a2, a2, a5
+; LMULMAX1-RV32-NEXT: sub a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a2, a1, a4
+; LMULMAX1-RV32-NEXT: srli a1, a1, 2
+; LMULMAX1-RV32-NEXT: and a1, a1, a4
+; LMULMAX1-RV32-NEXT: add a1, a2, a1
+; LMULMAX1-RV32-NEXT: srli a2, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a2
+; LMULMAX1-RV32-NEXT: and a1, a1, t0
+; LMULMAX1-RV32-NEXT: mul a1, a1, a3
+; LMULMAX1-RV32-NEXT: srli a1, a1, 24
; LMULMAX1-RV32-NEXT: .LBB7_9:
; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
-; LMULMAX1-RV32-NEXT: sw a2, 16(sp)
-; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_11
+; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7
+; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV32-NEXT: sw a1, 16(sp)
+; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_11
; LMULMAX1-RV32-NEXT: # %bb.10:
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25
; LMULMAX1-RV32-NEXT: srli a2, a1, 1
; LMULMAX1-RV32-NEXT: add a1, a2, a1
; LMULMAX1-RV32-NEXT: srli a2, a1, 4
; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a7
-; LMULMAX1-RV32-NEXT: mul a1, a1, t0
+; LMULMAX1-RV32-NEXT: and a1, a1, t0
+; LMULMAX1-RV32-NEXT: mul a1, a1, a3
; LMULMAX1-RV32-NEXT: srli a1, a1, 24
; LMULMAX1-RV32-NEXT: addi a1, a1, 32
; LMULMAX1-RV32-NEXT: j .LBB7_12
; LMULMAX1-RV32-NEXT: .LBB7_11:
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a1, a2, 1
+; LMULMAX1-RV32-NEXT: or a1, a2, a1
; LMULMAX1-RV32-NEXT: srli a2, a1, 2
; LMULMAX1-RV32-NEXT: or a1, a1, a2
; LMULMAX1-RV32-NEXT: srli a2, a1, 4
; LMULMAX1-RV32-NEXT: add a1, a2, a1
; LMULMAX1-RV32-NEXT: srli a2, a1, 4
; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a7
-; LMULMAX1-RV32-NEXT: mul a1, a1, t0
+; LMULMAX1-RV32-NEXT: and a1, a1, t0
+; LMULMAX1-RV32-NEXT: mul a1, a1, a3
; LMULMAX1-RV32-NEXT: srli a1, a1, 24
; LMULMAX1-RV32-NEXT: .LBB7_12:
; LMULMAX1-RV32-NEXT: sw a1, 24(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3
; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
; LMULMAX2-RV64-NEXT: addi a1, zero, 1
-; LMULMAX2-RV64-NEXT: slli a1, a1, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a1
+; LMULMAX2-RV64-NEXT: slli a6, a1, 32
+; LMULMAX2-RV64-NEXT: or a2, a2, a6
; LMULMAX2-RV64-NEXT: addi a3, a2, -1
; LMULMAX2-RV64-NEXT: not a2, a2
; LMULMAX2-RV64-NEXT: and a3, a2, a3
; LMULMAX2-RV64-NEXT: slli a2, a2, 12
; LMULMAX2-RV64-NEXT: addi a2, a2, 1365
; LMULMAX2-RV64-NEXT: slli a2, a2, 12
-; LMULMAX2-RV64-NEXT: addi a6, a2, 1365
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
+; LMULMAX2-RV64-NEXT: addi a7, a2, 1365
+; LMULMAX2-RV64-NEXT: and a4, a4, a7
; LMULMAX2-RV64-NEXT: sub a4, a3, a4
; LMULMAX2-RV64-NEXT: lui a3, 13107
; LMULMAX2-RV64-NEXT: addiw a3, a3, 819
; LMULMAX2-RV64-NEXT: slli a5, a5, 12
; LMULMAX2-RV64-NEXT: addi a5, a5, 241
; LMULMAX2-RV64-NEXT: slli a5, a5, 12
-; LMULMAX2-RV64-NEXT: addi a7, a5, -241
-; LMULMAX2-RV64-NEXT: and a4, a4, a7
-; LMULMAX2-RV64-NEXT: lui a2, 4112
-; LMULMAX2-RV64-NEXT: addiw a2, a2, 257
-; LMULMAX2-RV64-NEXT: slli a2, a2, 16
-; LMULMAX2-RV64-NEXT: addi a2, a2, 257
-; LMULMAX2-RV64-NEXT: slli a2, a2, 16
-; LMULMAX2-RV64-NEXT: addi a2, a2, 257
-; LMULMAX2-RV64-NEXT: mul a4, a4, a2
+; LMULMAX2-RV64-NEXT: addi a5, a5, -241
+; LMULMAX2-RV64-NEXT: and a4, a4, a5
+; LMULMAX2-RV64-NEXT: lui a1, 4112
+; LMULMAX2-RV64-NEXT: addiw a1, a1, 257
+; LMULMAX2-RV64-NEXT: slli a1, a1, 16
+; LMULMAX2-RV64-NEXT: addi a1, a1, 257
+; LMULMAX2-RV64-NEXT: slli a1, a1, 16
+; LMULMAX2-RV64-NEXT: addi a1, a1, 257
+; LMULMAX2-RV64-NEXT: mul a4, a4, a1
; LMULMAX2-RV64-NEXT: srli a4, a4, 56
; LMULMAX2-RV64-NEXT: sw a4, 28(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2
; LMULMAX2-RV64-NEXT: vmv.x.s a4, v26
-; LMULMAX2-RV64-NEXT: or a4, a4, a1
-; LMULMAX2-RV64-NEXT: addi a5, a4, -1
+; LMULMAX2-RV64-NEXT: or a4, a4, a6
+; LMULMAX2-RV64-NEXT: addi a2, a4, -1
; LMULMAX2-RV64-NEXT: not a4, a4
-; LMULMAX2-RV64-NEXT: and a4, a4, a5
-; LMULMAX2-RV64-NEXT: srli a5, a4, 1
-; LMULMAX2-RV64-NEXT: and a5, a5, a6
-; LMULMAX2-RV64-NEXT: sub a4, a4, a5
-; LMULMAX2-RV64-NEXT: and a5, a4, a3
-; LMULMAX2-RV64-NEXT: srli a4, a4, 2
-; LMULMAX2-RV64-NEXT: and a4, a4, a3
-; LMULMAX2-RV64-NEXT: add a4, a5, a4
-; LMULMAX2-RV64-NEXT: srli a5, a4, 4
-; LMULMAX2-RV64-NEXT: add a4, a4, a5
+; LMULMAX2-RV64-NEXT: and a2, a4, a2
+; LMULMAX2-RV64-NEXT: srli a4, a2, 1
; LMULMAX2-RV64-NEXT: and a4, a4, a7
-; LMULMAX2-RV64-NEXT: mul a4, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a4, 56
-; LMULMAX2-RV64-NEXT: sw a4, 24(sp)
+; LMULMAX2-RV64-NEXT: sub a2, a2, a4
+; LMULMAX2-RV64-NEXT: and a4, a2, a3
+; LMULMAX2-RV64-NEXT: srli a2, a2, 2
+; LMULMAX2-RV64-NEXT: and a2, a2, a3
+; LMULMAX2-RV64-NEXT: add a2, a4, a2
+; LMULMAX2-RV64-NEXT: srli a4, a2, 4
+; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: and a2, a2, a5
+; LMULMAX2-RV64-NEXT: mul a2, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a2, 56
+; LMULMAX2-RV64-NEXT: sw a2, 24(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 1
-; LMULMAX2-RV64-NEXT: vmv.x.s a4, v26
-; LMULMAX2-RV64-NEXT: or a4, a4, a1
-; LMULMAX2-RV64-NEXT: addi a5, a4, -1
-; LMULMAX2-RV64-NEXT: not a4, a4
-; LMULMAX2-RV64-NEXT: and a4, a4, a5
-; LMULMAX2-RV64-NEXT: srli a5, a4, 1
-; LMULMAX2-RV64-NEXT: and a5, a5, a6
-; LMULMAX2-RV64-NEXT: sub a4, a4, a5
-; LMULMAX2-RV64-NEXT: and a5, a4, a3
-; LMULMAX2-RV64-NEXT: srli a4, a4, 2
-; LMULMAX2-RV64-NEXT: and a4, a4, a3
-; LMULMAX2-RV64-NEXT: add a4, a5, a4
-; LMULMAX2-RV64-NEXT: srli a5, a4, 4
-; LMULMAX2-RV64-NEXT: add a4, a4, a5
+; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX2-RV64-NEXT: or a2, a2, a6
+; LMULMAX2-RV64-NEXT: addi a4, a2, -1
+; LMULMAX2-RV64-NEXT: not a2, a2
+; LMULMAX2-RV64-NEXT: and a2, a2, a4
+; LMULMAX2-RV64-NEXT: srli a4, a2, 1
; LMULMAX2-RV64-NEXT: and a4, a4, a7
-; LMULMAX2-RV64-NEXT: mul a4, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a4, 56
-; LMULMAX2-RV64-NEXT: sw a4, 20(sp)
-; LMULMAX2-RV64-NEXT: vmv.x.s a4, v25
-; LMULMAX2-RV64-NEXT: or a1, a4, a1
-; LMULMAX2-RV64-NEXT: addi a4, a1, -1
-; LMULMAX2-RV64-NEXT: not a1, a1
-; LMULMAX2-RV64-NEXT: and a1, a1, a4
-; LMULMAX2-RV64-NEXT: srli a4, a1, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a1, a1, a4
-; LMULMAX2-RV64-NEXT: and a4, a1, a3
-; LMULMAX2-RV64-NEXT: srli a1, a1, 2
-; LMULMAX2-RV64-NEXT: and a1, a1, a3
-; LMULMAX2-RV64-NEXT: add a1, a4, a1
-; LMULMAX2-RV64-NEXT: srli a3, a1, 4
-; LMULMAX2-RV64-NEXT: add a1, a1, a3
-; LMULMAX2-RV64-NEXT: and a1, a1, a7
-; LMULMAX2-RV64-NEXT: mul a1, a1, a2
+; LMULMAX2-RV64-NEXT: sub a2, a2, a4
+; LMULMAX2-RV64-NEXT: and a4, a2, a3
+; LMULMAX2-RV64-NEXT: srli a2, a2, 2
+; LMULMAX2-RV64-NEXT: and a2, a2, a3
+; LMULMAX2-RV64-NEXT: add a2, a4, a2
+; LMULMAX2-RV64-NEXT: srli a4, a2, 4
+; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: and a2, a2, a5
+; LMULMAX2-RV64-NEXT: mul a2, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a2, 56
+; LMULMAX2-RV64-NEXT: sw a2, 20(sp)
+; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25
+; LMULMAX2-RV64-NEXT: or a2, a2, a6
+; LMULMAX2-RV64-NEXT: addi a4, a2, -1
+; LMULMAX2-RV64-NEXT: not a2, a2
+; LMULMAX2-RV64-NEXT: and a2, a2, a4
+; LMULMAX2-RV64-NEXT: srli a4, a2, 1
+; LMULMAX2-RV64-NEXT: and a4, a4, a7
+; LMULMAX2-RV64-NEXT: sub a2, a2, a4
+; LMULMAX2-RV64-NEXT: and a4, a2, a3
+; LMULMAX2-RV64-NEXT: srli a2, a2, 2
+; LMULMAX2-RV64-NEXT: and a2, a2, a3
+; LMULMAX2-RV64-NEXT: add a2, a4, a2
+; LMULMAX2-RV64-NEXT: srli a3, a2, 4
+; LMULMAX2-RV64-NEXT: add a2, a2, a3
+; LMULMAX2-RV64-NEXT: and a2, a2, a5
+; LMULMAX2-RV64-NEXT: mul a1, a2, a1
; LMULMAX2-RV64-NEXT: srli a1, a1, 56
; LMULMAX2-RV64-NEXT: sw a1, 16(sp)
; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3
; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
; LMULMAX1-RV64-NEXT: addi a1, zero, 1
-; LMULMAX1-RV64-NEXT: slli a1, a1, 32
-; LMULMAX1-RV64-NEXT: or a2, a2, a1
+; LMULMAX1-RV64-NEXT: slli a6, a1, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a6
; LMULMAX1-RV64-NEXT: addi a3, a2, -1
; LMULMAX1-RV64-NEXT: not a2, a2
; LMULMAX1-RV64-NEXT: and a3, a2, a3
; LMULMAX1-RV64-NEXT: slli a2, a2, 12
; LMULMAX1-RV64-NEXT: addi a2, a2, 1365
; LMULMAX1-RV64-NEXT: slli a2, a2, 12
-; LMULMAX1-RV64-NEXT: addi a6, a2, 1365
-; LMULMAX1-RV64-NEXT: and a4, a4, a6
+; LMULMAX1-RV64-NEXT: addi a7, a2, 1365
+; LMULMAX1-RV64-NEXT: and a4, a4, a7
; LMULMAX1-RV64-NEXT: sub a4, a3, a4
; LMULMAX1-RV64-NEXT: lui a3, 13107
; LMULMAX1-RV64-NEXT: addiw a3, a3, 819
; LMULMAX1-RV64-NEXT: slli a5, a5, 12
; LMULMAX1-RV64-NEXT: addi a5, a5, 241
; LMULMAX1-RV64-NEXT: slli a5, a5, 12
-; LMULMAX1-RV64-NEXT: addi a7, a5, -241
-; LMULMAX1-RV64-NEXT: and a4, a4, a7
-; LMULMAX1-RV64-NEXT: lui a2, 4112
-; LMULMAX1-RV64-NEXT: addiw a2, a2, 257
-; LMULMAX1-RV64-NEXT: slli a2, a2, 16
-; LMULMAX1-RV64-NEXT: addi a2, a2, 257
-; LMULMAX1-RV64-NEXT: slli a2, a2, 16
-; LMULMAX1-RV64-NEXT: addi a2, a2, 257
-; LMULMAX1-RV64-NEXT: mul a4, a4, a2
+; LMULMAX1-RV64-NEXT: addi a5, a5, -241
+; LMULMAX1-RV64-NEXT: and a4, a4, a5
+; LMULMAX1-RV64-NEXT: lui a1, 4112
+; LMULMAX1-RV64-NEXT: addiw a1, a1, 257
+; LMULMAX1-RV64-NEXT: slli a1, a1, 16
+; LMULMAX1-RV64-NEXT: addi a1, a1, 257
+; LMULMAX1-RV64-NEXT: slli a1, a1, 16
+; LMULMAX1-RV64-NEXT: addi a1, a1, 257
+; LMULMAX1-RV64-NEXT: mul a4, a4, a1
; LMULMAX1-RV64-NEXT: srli a4, a4, 56
; LMULMAX1-RV64-NEXT: sw a4, 28(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2
; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26
-; LMULMAX1-RV64-NEXT: or a4, a4, a1
-; LMULMAX1-RV64-NEXT: addi a5, a4, -1
+; LMULMAX1-RV64-NEXT: or a4, a4, a6
+; LMULMAX1-RV64-NEXT: addi a2, a4, -1
; LMULMAX1-RV64-NEXT: not a4, a4
-; LMULMAX1-RV64-NEXT: and a4, a4, a5
-; LMULMAX1-RV64-NEXT: srli a5, a4, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a6
-; LMULMAX1-RV64-NEXT: sub a4, a4, a5
-; LMULMAX1-RV64-NEXT: and a5, a4, a3
-; LMULMAX1-RV64-NEXT: srli a4, a4, 2
-; LMULMAX1-RV64-NEXT: and a4, a4, a3
-; LMULMAX1-RV64-NEXT: add a4, a5, a4
-; LMULMAX1-RV64-NEXT: srli a5, a4, 4
-; LMULMAX1-RV64-NEXT: add a4, a4, a5
+; LMULMAX1-RV64-NEXT: and a2, a4, a2
+; LMULMAX1-RV64-NEXT: srli a4, a2, 1
; LMULMAX1-RV64-NEXT: and a4, a4, a7
-; LMULMAX1-RV64-NEXT: mul a4, a4, a2
-; LMULMAX1-RV64-NEXT: srli a4, a4, 56
-; LMULMAX1-RV64-NEXT: sw a4, 24(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a4
+; LMULMAX1-RV64-NEXT: and a4, a2, a3
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a3
+; LMULMAX1-RV64-NEXT: add a2, a4, a2
+; LMULMAX1-RV64-NEXT: srli a4, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a4
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: sw a2, 24(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1
-; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26
-; LMULMAX1-RV64-NEXT: or a4, a4, a1
-; LMULMAX1-RV64-NEXT: addi a5, a4, -1
-; LMULMAX1-RV64-NEXT: not a4, a4
-; LMULMAX1-RV64-NEXT: and a4, a4, a5
-; LMULMAX1-RV64-NEXT: srli a5, a4, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a6
-; LMULMAX1-RV64-NEXT: sub a4, a4, a5
-; LMULMAX1-RV64-NEXT: and a5, a4, a3
-; LMULMAX1-RV64-NEXT: srli a4, a4, 2
-; LMULMAX1-RV64-NEXT: and a4, a4, a3
-; LMULMAX1-RV64-NEXT: add a4, a5, a4
-; LMULMAX1-RV64-NEXT: srli a5, a4, 4
-; LMULMAX1-RV64-NEXT: add a4, a4, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: or a2, a2, a6
+; LMULMAX1-RV64-NEXT: addi a4, a2, -1
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: srli a4, a2, 1
; LMULMAX1-RV64-NEXT: and a4, a4, a7
-; LMULMAX1-RV64-NEXT: mul a4, a4, a2
-; LMULMAX1-RV64-NEXT: srli a4, a4, 56
-; LMULMAX1-RV64-NEXT: sw a4, 20(sp)
-; LMULMAX1-RV64-NEXT: vmv.x.s a4, v25
-; LMULMAX1-RV64-NEXT: or a1, a4, a1
-; LMULMAX1-RV64-NEXT: addi a4, a1, -1
-; LMULMAX1-RV64-NEXT: not a1, a1
-; LMULMAX1-RV64-NEXT: and a1, a1, a4
-; LMULMAX1-RV64-NEXT: srli a4, a1, 1
-; LMULMAX1-RV64-NEXT: and a4, a4, a6
-; LMULMAX1-RV64-NEXT: sub a1, a1, a4
-; LMULMAX1-RV64-NEXT: and a4, a1, a3
-; LMULMAX1-RV64-NEXT: srli a1, a1, 2
-; LMULMAX1-RV64-NEXT: and a1, a1, a3
-; LMULMAX1-RV64-NEXT: add a1, a4, a1
-; LMULMAX1-RV64-NEXT: srli a3, a1, 4
-; LMULMAX1-RV64-NEXT: add a1, a1, a3
-; LMULMAX1-RV64-NEXT: and a1, a1, a7
-; LMULMAX1-RV64-NEXT: mul a1, a1, a2
+; LMULMAX1-RV64-NEXT: sub a2, a2, a4
+; LMULMAX1-RV64-NEXT: and a4, a2, a3
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a3
+; LMULMAX1-RV64-NEXT: add a2, a4, a2
+; LMULMAX1-RV64-NEXT: srli a4, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a4
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: sw a2, 20(sp)
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25
+; LMULMAX1-RV64-NEXT: or a2, a2, a6
+; LMULMAX1-RV64-NEXT: addi a4, a2, -1
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: srli a4, a2, 1
+; LMULMAX1-RV64-NEXT: and a4, a4, a7
+; LMULMAX1-RV64-NEXT: sub a2, a2, a4
+; LMULMAX1-RV64-NEXT: and a4, a2, a3
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a3
+; LMULMAX1-RV64-NEXT: add a2, a4, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a1, a2, a1
; LMULMAX1-RV64-NEXT: srli a1, a1, 56
; LMULMAX1-RV64-NEXT: sw a1, 16(sp)
; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7
; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
; LMULMAX2-RV64-NEXT: addi a1, zero, 1
-; LMULMAX2-RV64-NEXT: slli a1, a1, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a1
+; LMULMAX2-RV64-NEXT: slli a6, a1, 32
+; LMULMAX2-RV64-NEXT: or a2, a2, a6
; LMULMAX2-RV64-NEXT: addi a3, a2, -1
; LMULMAX2-RV64-NEXT: not a2, a2
; LMULMAX2-RV64-NEXT: and a3, a2, a3
; LMULMAX2-RV64-NEXT: slli a2, a2, 12
; LMULMAX2-RV64-NEXT: addi a2, a2, 1365
; LMULMAX2-RV64-NEXT: slli a2, a2, 12
-; LMULMAX2-RV64-NEXT: addi a6, a2, 1365
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
+; LMULMAX2-RV64-NEXT: addi a7, a2, 1365
+; LMULMAX2-RV64-NEXT: and a4, a4, a7
; LMULMAX2-RV64-NEXT: sub a4, a3, a4
; LMULMAX2-RV64-NEXT: lui a3, 13107
; LMULMAX2-RV64-NEXT: addiw a3, a3, 819
; LMULMAX2-RV64-NEXT: slli a4, a4, 12
; LMULMAX2-RV64-NEXT: addi a4, a4, 241
; LMULMAX2-RV64-NEXT: slli a4, a4, 12
-; LMULMAX2-RV64-NEXT: addi a7, a4, -241
-; LMULMAX2-RV64-NEXT: and a2, a5, a7
+; LMULMAX2-RV64-NEXT: addi a4, a4, -241
+; LMULMAX2-RV64-NEXT: and a1, a5, a4
; LMULMAX2-RV64-NEXT: lui a5, 4112
; LMULMAX2-RV64-NEXT: addiw a5, a5, 257
; LMULMAX2-RV64-NEXT: slli a5, a5, 16
; LMULMAX2-RV64-NEXT: addi a5, a5, 257
; LMULMAX2-RV64-NEXT: slli a5, a5, 16
; LMULMAX2-RV64-NEXT: addi a5, a5, 257
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: sw a2, 60(sp)
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: sw a1, 60(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: or a2, a2, a1
-; LMULMAX2-RV64-NEXT: addi a4, a2, -1
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: and a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: or a1, a1, a6
+; LMULMAX2-RV64-NEXT: addi a2, a1, -1
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: and a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: sw a2, 56(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: sw a1, 56(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: or a2, a2, a1
-; LMULMAX2-RV64-NEXT: addi a4, a2, -1
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: and a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: or a1, a1, a6
+; LMULMAX2-RV64-NEXT: addi a2, a1, -1
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: and a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: sw a2, 52(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: sw a1, 52(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: or a2, a2, a1
-; LMULMAX2-RV64-NEXT: addi a4, a2, -1
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: and a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: or a1, a1, a6
+; LMULMAX2-RV64-NEXT: addi a2, a1, -1
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: and a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: sw a2, 48(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: sw a1, 48(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: or a2, a2, a1
-; LMULMAX2-RV64-NEXT: addi a4, a2, -1
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: and a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: or a1, a1, a6
+; LMULMAX2-RV64-NEXT: addi a2, a1, -1
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: and a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: sw a2, 44(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: sw a1, 44(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: or a2, a2, a1
-; LMULMAX2-RV64-NEXT: addi a4, a2, -1
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: and a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: or a1, a1, a6
+; LMULMAX2-RV64-NEXT: addi a2, a1, -1
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: and a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: sw a2, 40(sp)
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: sw a1, 40(sp)
; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: or a2, a2, a1
-; LMULMAX2-RV64-NEXT: addi a4, a2, -1
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: and a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: or a1, a1, a6
+; LMULMAX2-RV64-NEXT: addi a2, a1, -1
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: and a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: sw a2, 36(sp)
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX2-RV64-NEXT: or a1, a2, a1
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: sw a1, 36(sp)
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV64-NEXT: or a1, a1, a6
; LMULMAX2-RV64-NEXT: addi a2, a1, -1
; LMULMAX2-RV64-NEXT: not a1, a1
; LMULMAX2-RV64-NEXT: and a1, a1, a2
; LMULMAX2-RV64-NEXT: srli a2, a1, 1
-; LMULMAX2-RV64-NEXT: and a2, a2, a6
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
; LMULMAX2-RV64-NEXT: sub a1, a1, a2
; LMULMAX2-RV64-NEXT: and a2, a1, a3
; LMULMAX2-RV64-NEXT: srli a1, a1, 2
; LMULMAX2-RV64-NEXT: add a1, a2, a1
; LMULMAX2-RV64-NEXT: srli a2, a1, 4
; LMULMAX2-RV64-NEXT: add a1, a1, a2
-; LMULMAX2-RV64-NEXT: and a1, a1, a7
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
; LMULMAX2-RV64-NEXT: mul a1, a1, a5
; LMULMAX2-RV64-NEXT: srli a1, a1, 56
; LMULMAX2-RV64-NEXT: sw a1, 32(sp)
; LMULMAX1-RV64-NEXT: vle32.v v25, (a0)
; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26
; LMULMAX1-RV64-NEXT: addi a2, zero, 1
-; LMULMAX1-RV64-NEXT: slli a2, a2, 32
-; LMULMAX1-RV64-NEXT: or a1, a1, a2
+; LMULMAX1-RV64-NEXT: slli a7, a2, 32
+; LMULMAX1-RV64-NEXT: or a1, a1, a7
; LMULMAX1-RV64-NEXT: addi a3, a1, -1
; LMULMAX1-RV64-NEXT: not a1, a1
; LMULMAX1-RV64-NEXT: and a1, a1, a3
; LMULMAX1-RV64-NEXT: slli a3, a3, 12
; LMULMAX1-RV64-NEXT: addi a3, a3, 1365
; LMULMAX1-RV64-NEXT: slli a3, a3, 12
-; LMULMAX1-RV64-NEXT: addi a7, a3, 1365
-; LMULMAX1-RV64-NEXT: and a4, a4, a7
+; LMULMAX1-RV64-NEXT: addi t0, a3, 1365
+; LMULMAX1-RV64-NEXT: and a4, a4, t0
; LMULMAX1-RV64-NEXT: sub a1, a1, a4
; LMULMAX1-RV64-NEXT: lui a4, 13107
; LMULMAX1-RV64-NEXT: addiw a4, a4, 819
; LMULMAX1-RV64-NEXT: slli a5, a5, 12
; LMULMAX1-RV64-NEXT: addi a5, a5, 241
; LMULMAX1-RV64-NEXT: slli a5, a5, 12
-; LMULMAX1-RV64-NEXT: addi t0, a5, -241
-; LMULMAX1-RV64-NEXT: and a3, a1, t0
+; LMULMAX1-RV64-NEXT: addi a5, a5, -241
+; LMULMAX1-RV64-NEXT: and a2, a1, a5
; LMULMAX1-RV64-NEXT: lui a1, 4112
; LMULMAX1-RV64-NEXT: addiw a1, a1, 257
; LMULMAX1-RV64-NEXT: slli a1, a1, 16
; LMULMAX1-RV64-NEXT: addi a1, a1, 257
; LMULMAX1-RV64-NEXT: slli a1, a1, 16
; LMULMAX1-RV64-NEXT: addi a1, a1, 257
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: sw a3, 32(sp)
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: sw a2, 32(sp)
; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: or a3, a3, a2
-; LMULMAX1-RV64-NEXT: addi a5, a3, -1
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: and a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: or a2, a2, a7
+; LMULMAX1-RV64-NEXT: addi a3, a2, -1
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: and a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: sw a3, 44(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: sw a2, 44(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: or a3, a3, a2
-; LMULMAX1-RV64-NEXT: addi a5, a3, -1
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: and a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: or a2, a2, a7
+; LMULMAX1-RV64-NEXT: addi a3, a2, -1
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: and a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: sw a3, 40(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: sw a2, 40(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: or a3, a3, a2
-; LMULMAX1-RV64-NEXT: addi a5, a3, -1
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: and a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: or a2, a2, a7
+; LMULMAX1-RV64-NEXT: addi a3, a2, -1
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: and a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: sw a3, 36(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: sw a2, 36(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: or a3, a3, a2
-; LMULMAX1-RV64-NEXT: addi a5, a3, -1
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: and a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: or a2, a2, a7
+; LMULMAX1-RV64-NEXT: addi a3, a2, -1
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: and a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: sw a3, 28(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: sw a2, 28(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: or a3, a3, a2
-; LMULMAX1-RV64-NEXT: addi a5, a3, -1
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: and a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: or a2, a2, a7
+; LMULMAX1-RV64-NEXT: addi a3, a2, -1
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: and a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: sw a3, 24(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: sw a2, 24(sp)
; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: or a3, a3, a2
-; LMULMAX1-RV64-NEXT: addi a5, a3, -1
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: and a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: or a2, a2, a7
+; LMULMAX1-RV64-NEXT: addi a3, a2, -1
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: and a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: sw a3, 20(sp)
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v25
-; LMULMAX1-RV64-NEXT: or a2, a3, a2
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: sw a2, 20(sp)
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25
+; LMULMAX1-RV64-NEXT: or a2, a2, a7
; LMULMAX1-RV64-NEXT: addi a3, a2, -1
; LMULMAX1-RV64-NEXT: not a2, a2
; LMULMAX1-RV64-NEXT: and a2, a2, a3
; LMULMAX1-RV64-NEXT: srli a3, a2, 1
-; LMULMAX1-RV64-NEXT: and a3, a3, a7
+; LMULMAX1-RV64-NEXT: and a3, a3, t0
; LMULMAX1-RV64-NEXT: sub a2, a2, a3
; LMULMAX1-RV64-NEXT: and a3, a2, a4
; LMULMAX1-RV64-NEXT: srli a2, a2, 2
; LMULMAX1-RV64-NEXT: add a2, a3, a2
; LMULMAX1-RV64-NEXT: srli a3, a2, 4
; LMULMAX1-RV64-NEXT: add a2, a2, a3
-; LMULMAX1-RV64-NEXT: and a2, a2, t0
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
; LMULMAX1-RV64-NEXT: mul a1, a2, a1
; LMULMAX1-RV64-NEXT: srli a1, a1, 56
; LMULMAX1-RV64-NEXT: sw a1, 16(sp)
; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vle64.v v25, (a0)
-; LMULMAX1-RV32-NEXT: addi a6, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v26, (a6)
+; LMULMAX1-RV32-NEXT: addi a7, a0, 16
+; LMULMAX1-RV32-NEXT: vle64.v v26, (a7)
; LMULMAX1-RV32-NEXT: sw zero, 44(sp)
; LMULMAX1-RV32-NEXT: sw zero, 36(sp)
-; LMULMAX1-RV32-NEXT: addi a7, zero, 32
+; LMULMAX1-RV32-NEXT: addi a6, zero, 32
; LMULMAX1-RV32-NEXT: lui a1, 349525
; LMULMAX1-RV32-NEXT: addi a5, a1, 1365
; LMULMAX1-RV32-NEXT: lui a1, 209715
; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_2
; LMULMAX1-RV32-NEXT: # %bb.1:
; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7
+; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a6
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27
; LMULMAX1-RV32-NEXT: addi a2, a1, -1
; LMULMAX1-RV32-NEXT: not a1, a1
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_5
; LMULMAX1-RV32-NEXT: # %bb.4:
-; LMULMAX1-RV32-NEXT: vsrl.vx v26, v26, a7
+; LMULMAX1-RV32-NEXT: vsrl.vx v26, v26, a6
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
; LMULMAX1-RV32-NEXT: addi a2, a1, -1
; LMULMAX1-RV32-NEXT: not a1, a1
; LMULMAX1-RV32-NEXT: sw zero, 20(sp)
; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_8
; LMULMAX1-RV32-NEXT: # %bb.7:
-; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7
+; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
; LMULMAX1-RV32-NEXT: addi a2, a1, -1
; LMULMAX1-RV32-NEXT: not a1, a1
; LMULMAX1-RV32-NEXT: sw a1, 16(sp)
; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_11
; LMULMAX1-RV32-NEXT: # %bb.10:
-; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a7
+; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a6
; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25
; LMULMAX1-RV32-NEXT: addi a2, a1, -1
; LMULMAX1-RV32-NEXT: not a1, a1
; LMULMAX1-RV32-NEXT: vle32.v v26, (a1)
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: vse64.v v25, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v26, (a6)
+; LMULMAX1-RV32-NEXT: vse64.v v26, (a7)
; LMULMAX1-RV32-NEXT: addi sp, sp, 48
; LMULMAX1-RV32-NEXT: ret
;
;
; RV32IM-LABEL: fold_srem_vec_2:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lh a7, 12(a1)
+; RV32IM-NEXT: lh a6, 12(a1)
; RV32IM-NEXT: lh a3, 8(a1)
; RV32IM-NEXT: lh a4, 0(a1)
; RV32IM-NEXT: lh a1, 4(a1)
; RV32IM-NEXT: lui a5, 706409
-; RV32IM-NEXT: addi a6, a5, 389
-; RV32IM-NEXT: mulh a5, a4, a6
-; RV32IM-NEXT: add a5, a5, a4
-; RV32IM-NEXT: srli a2, a5, 31
-; RV32IM-NEXT: srli a5, a5, 6
-; RV32IM-NEXT: add a2, a5, a2
-; RV32IM-NEXT: addi a5, zero, 95
-; RV32IM-NEXT: mul a2, a2, a5
+; RV32IM-NEXT: addi a5, a5, 389
+; RV32IM-NEXT: mulh a2, a4, a5
+; RV32IM-NEXT: add a2, a2, a4
+; RV32IM-NEXT: srli a7, a2, 31
+; RV32IM-NEXT: srli a2, a2, 6
+; RV32IM-NEXT: add a2, a2, a7
+; RV32IM-NEXT: addi a7, zero, 95
+; RV32IM-NEXT: mul a2, a2, a7
; RV32IM-NEXT: sub t0, a4, a2
-; RV32IM-NEXT: mulh a4, a1, a6
+; RV32IM-NEXT: mulh a4, a1, a5
; RV32IM-NEXT: add a4, a4, a1
; RV32IM-NEXT: srli a2, a4, 31
; RV32IM-NEXT: srli a4, a4, 6
; RV32IM-NEXT: add a2, a4, a2
-; RV32IM-NEXT: mul a2, a2, a5
+; RV32IM-NEXT: mul a2, a2, a7
; RV32IM-NEXT: sub a1, a1, a2
-; RV32IM-NEXT: mulh a2, a3, a6
+; RV32IM-NEXT: mulh a2, a3, a5
; RV32IM-NEXT: add a2, a2, a3
; RV32IM-NEXT: srli a4, a2, 31
; RV32IM-NEXT: srli a2, a2, 6
; RV32IM-NEXT: add a2, a2, a4
-; RV32IM-NEXT: mul a2, a2, a5
+; RV32IM-NEXT: mul a2, a2, a7
; RV32IM-NEXT: sub a2, a3, a2
-; RV32IM-NEXT: mulh a3, a7, a6
-; RV32IM-NEXT: add a3, a3, a7
+; RV32IM-NEXT: mulh a3, a6, a5
+; RV32IM-NEXT: add a3, a3, a6
; RV32IM-NEXT: srli a4, a3, 31
; RV32IM-NEXT: srli a3, a3, 6
; RV32IM-NEXT: add a3, a3, a4
-; RV32IM-NEXT: mul a3, a3, a5
-; RV32IM-NEXT: sub a3, a7, a3
+; RV32IM-NEXT: mul a3, a3, a7
+; RV32IM-NEXT: sub a3, a6, a3
; RV32IM-NEXT: sh a3, 6(a0)
; RV32IM-NEXT: sh a2, 4(a0)
; RV32IM-NEXT: sh a1, 2(a0)
;
; RV64IM-LABEL: fold_srem_vec_2:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lh a7, 24(a1)
-; RV64IM-NEXT: lh a3, 16(a1)
+; RV64IM-NEXT: lh a6, 24(a1)
+; RV64IM-NEXT: lh a7, 16(a1)
; RV64IM-NEXT: lh a4, 8(a1)
; RV64IM-NEXT: lh a1, 0(a1)
; RV64IM-NEXT: lui a5, 1045903
; RV64IM-NEXT: slli a5, a5, 12
; RV64IM-NEXT: addi a5, a5, -905
; RV64IM-NEXT: slli a5, a5, 12
-; RV64IM-NEXT: addi a6, a5, -1767
-; RV64IM-NEXT: mulh a5, a1, a6
-; RV64IM-NEXT: add a5, a5, a1
-; RV64IM-NEXT: srli a2, a5, 63
-; RV64IM-NEXT: srli a5, a5, 6
-; RV64IM-NEXT: add a2, a5, a2
-; RV64IM-NEXT: addi a5, zero, 95
-; RV64IM-NEXT: mul a2, a2, a5
+; RV64IM-NEXT: addi a5, a5, -1767
+; RV64IM-NEXT: mulh a2, a1, a5
+; RV64IM-NEXT: add a2, a2, a1
+; RV64IM-NEXT: srli a3, a2, 63
+; RV64IM-NEXT: srli a2, a2, 6
+; RV64IM-NEXT: add a2, a2, a3
+; RV64IM-NEXT: addi a3, zero, 95
+; RV64IM-NEXT: mul a2, a2, a3
; RV64IM-NEXT: sub t0, a1, a2
-; RV64IM-NEXT: mulh a2, a4, a6
+; RV64IM-NEXT: mulh a2, a4, a5
; RV64IM-NEXT: add a2, a2, a4
; RV64IM-NEXT: srli a1, a2, 63
; RV64IM-NEXT: srli a2, a2, 6
; RV64IM-NEXT: add a1, a2, a1
-; RV64IM-NEXT: mul a1, a1, a5
+; RV64IM-NEXT: mul a1, a1, a3
; RV64IM-NEXT: sub a1, a4, a1
-; RV64IM-NEXT: mulh a2, a3, a6
-; RV64IM-NEXT: add a2, a2, a3
+; RV64IM-NEXT: mulh a2, a7, a5
+; RV64IM-NEXT: add a2, a2, a7
; RV64IM-NEXT: srli a4, a2, 63
; RV64IM-NEXT: srli a2, a2, 6
; RV64IM-NEXT: add a2, a2, a4
-; RV64IM-NEXT: mul a2, a2, a5
-; RV64IM-NEXT: sub a2, a3, a2
-; RV64IM-NEXT: mulh a3, a7, a6
-; RV64IM-NEXT: add a3, a3, a7
-; RV64IM-NEXT: srli a4, a3, 63
-; RV64IM-NEXT: srli a3, a3, 6
-; RV64IM-NEXT: add a3, a3, a4
-; RV64IM-NEXT: mul a3, a3, a5
-; RV64IM-NEXT: sub a3, a7, a3
+; RV64IM-NEXT: mul a2, a2, a3
+; RV64IM-NEXT: sub a2, a7, a2
+; RV64IM-NEXT: mulh a4, a6, a5
+; RV64IM-NEXT: add a4, a4, a6
+; RV64IM-NEXT: srli a5, a4, 63
+; RV64IM-NEXT: srli a4, a4, 6
+; RV64IM-NEXT: add a4, a4, a5
+; RV64IM-NEXT: mul a3, a4, a3
+; RV64IM-NEXT: sub a3, a6, a3
; RV64IM-NEXT: sh a3, 6(a0)
; RV64IM-NEXT: sh a2, 4(a0)
; RV64IM-NEXT: sh a1, 2(a0)
;
; RV32IM-LABEL: combine_srem_sdiv:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lh a7, 0(a1)
+; RV32IM-NEXT: lh a6, 0(a1)
; RV32IM-NEXT: lh a3, 4(a1)
; RV32IM-NEXT: lh a4, 12(a1)
; RV32IM-NEXT: lh a1, 8(a1)
; RV32IM-NEXT: lui a5, 706409
-; RV32IM-NEXT: addi a6, a5, 389
-; RV32IM-NEXT: mulh a5, a4, a6
-; RV32IM-NEXT: add a5, a5, a4
-; RV32IM-NEXT: srli a2, a5, 31
-; RV32IM-NEXT: srai a5, a5, 6
-; RV32IM-NEXT: add t3, a5, a2
-; RV32IM-NEXT: addi t0, zero, 95
-; RV32IM-NEXT: mul a5, t3, t0
-; RV32IM-NEXT: sub t1, a4, a5
-; RV32IM-NEXT: mulh a5, a1, a6
-; RV32IM-NEXT: add a5, a5, a1
-; RV32IM-NEXT: srli a4, a5, 31
-; RV32IM-NEXT: srai a5, a5, 6
+; RV32IM-NEXT: addi a5, a5, 389
+; RV32IM-NEXT: mulh a2, a4, a5
+; RV32IM-NEXT: add a2, a2, a4
+; RV32IM-NEXT: srli a7, a2, 31
+; RV32IM-NEXT: srai a2, a2, 6
+; RV32IM-NEXT: add t0, a2, a7
+; RV32IM-NEXT: addi a7, zero, 95
+; RV32IM-NEXT: mul a2, t0, a7
+; RV32IM-NEXT: sub t1, a4, a2
+; RV32IM-NEXT: mulh a4, a1, a5
+; RV32IM-NEXT: add a4, a4, a1
+; RV32IM-NEXT: srli a2, a4, 31
+; RV32IM-NEXT: srai a4, a4, 6
+; RV32IM-NEXT: add a2, a4, a2
+; RV32IM-NEXT: mul a4, a2, a7
+; RV32IM-NEXT: sub t2, a1, a4
+; RV32IM-NEXT: mulh a4, a3, a5
+; RV32IM-NEXT: add a4, a4, a3
+; RV32IM-NEXT: srli a1, a4, 31
+; RV32IM-NEXT: srai a4, a4, 6
+; RV32IM-NEXT: add a1, a4, a1
+; RV32IM-NEXT: mul a4, a1, a7
+; RV32IM-NEXT: sub a3, a3, a4
+; RV32IM-NEXT: mulh a4, a6, a5
+; RV32IM-NEXT: add a4, a4, a6
+; RV32IM-NEXT: srli a5, a4, 31
+; RV32IM-NEXT: srai a4, a4, 6
+; RV32IM-NEXT: add a4, a4, a5
+; RV32IM-NEXT: mul a5, a4, a7
+; RV32IM-NEXT: sub a5, a6, a5
; RV32IM-NEXT: add a4, a5, a4
-; RV32IM-NEXT: mul a5, a4, t0
-; RV32IM-NEXT: sub t2, a1, a5
-; RV32IM-NEXT: mulh a5, a3, a6
-; RV32IM-NEXT: add a5, a5, a3
-; RV32IM-NEXT: srli a1, a5, 31
-; RV32IM-NEXT: srai a5, a5, 6
-; RV32IM-NEXT: add a1, a5, a1
-; RV32IM-NEXT: mul a5, a1, t0
-; RV32IM-NEXT: sub a3, a3, a5
-; RV32IM-NEXT: mulh a5, a7, a6
-; RV32IM-NEXT: add a5, a5, a7
-; RV32IM-NEXT: srli a2, a5, 31
-; RV32IM-NEXT: srai a5, a5, 6
-; RV32IM-NEXT: add a2, a5, a2
-; RV32IM-NEXT: mul a5, a2, t0
-; RV32IM-NEXT: sub a5, a7, a5
-; RV32IM-NEXT: add a2, a5, a2
; RV32IM-NEXT: add a1, a3, a1
-; RV32IM-NEXT: add a3, t2, a4
-; RV32IM-NEXT: add a4, t1, t3
-; RV32IM-NEXT: sh a4, 6(a0)
-; RV32IM-NEXT: sh a3, 4(a0)
+; RV32IM-NEXT: add a2, t2, a2
+; RV32IM-NEXT: add a3, t1, t0
+; RV32IM-NEXT: sh a3, 6(a0)
+; RV32IM-NEXT: sh a2, 4(a0)
; RV32IM-NEXT: sh a1, 2(a0)
-; RV32IM-NEXT: sh a2, 0(a0)
+; RV32IM-NEXT: sh a4, 0(a0)
; RV32IM-NEXT: ret
;
; RV64I-LABEL: combine_srem_sdiv:
;
; RV64IM-LABEL: combine_srem_sdiv:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lh a7, 0(a1)
-; RV64IM-NEXT: lh a3, 8(a1)
+; RV64IM-NEXT: lh a6, 0(a1)
+; RV64IM-NEXT: lh a7, 8(a1)
; RV64IM-NEXT: lh a4, 16(a1)
; RV64IM-NEXT: lh a1, 24(a1)
; RV64IM-NEXT: lui a5, 1045903
; RV64IM-NEXT: slli a5, a5, 12
; RV64IM-NEXT: addi a5, a5, -905
; RV64IM-NEXT: slli a5, a5, 12
-; RV64IM-NEXT: addi a6, a5, -1767
-; RV64IM-NEXT: mulh a5, a1, a6
-; RV64IM-NEXT: add a5, a5, a1
-; RV64IM-NEXT: srli a2, a5, 63
-; RV64IM-NEXT: srai a5, a5, 6
-; RV64IM-NEXT: add t3, a5, a2
+; RV64IM-NEXT: addi a5, a5, -1767
+; RV64IM-NEXT: mulh a2, a1, a5
+; RV64IM-NEXT: add a2, a2, a1
+; RV64IM-NEXT: srli a3, a2, 63
+; RV64IM-NEXT: srai a2, a2, 6
+; RV64IM-NEXT: add t3, a2, a3
; RV64IM-NEXT: addi t0, zero, 95
-; RV64IM-NEXT: mul a5, t3, t0
-; RV64IM-NEXT: sub t1, a1, a5
-; RV64IM-NEXT: mulh a5, a4, a6
-; RV64IM-NEXT: add a5, a5, a4
-; RV64IM-NEXT: srli a1, a5, 63
-; RV64IM-NEXT: srai a5, a5, 6
-; RV64IM-NEXT: add a1, a5, a1
-; RV64IM-NEXT: mul a5, a1, t0
-; RV64IM-NEXT: sub t2, a4, a5
-; RV64IM-NEXT: mulh a5, a3, a6
-; RV64IM-NEXT: add a5, a5, a3
-; RV64IM-NEXT: srli a4, a5, 63
-; RV64IM-NEXT: srai a5, a5, 6
-; RV64IM-NEXT: add a4, a5, a4
-; RV64IM-NEXT: mul a5, a4, t0
-; RV64IM-NEXT: sub a3, a3, a5
-; RV64IM-NEXT: mulh a5, a7, a6
-; RV64IM-NEXT: add a5, a5, a7
+; RV64IM-NEXT: mul a3, t3, t0
+; RV64IM-NEXT: sub t1, a1, a3
+; RV64IM-NEXT: mulh a3, a4, a5
+; RV64IM-NEXT: add a3, a3, a4
+; RV64IM-NEXT: srli a1, a3, 63
+; RV64IM-NEXT: srai a3, a3, 6
+; RV64IM-NEXT: add a1, a3, a1
+; RV64IM-NEXT: mul a3, a1, t0
+; RV64IM-NEXT: sub t2, a4, a3
+; RV64IM-NEXT: mulh a4, a7, a5
+; RV64IM-NEXT: add a4, a4, a7
+; RV64IM-NEXT: srli a3, a4, 63
+; RV64IM-NEXT: srai a4, a4, 6
+; RV64IM-NEXT: add a3, a4, a3
+; RV64IM-NEXT: mul a4, a3, t0
+; RV64IM-NEXT: sub a4, a7, a4
+; RV64IM-NEXT: mulh a5, a6, a5
+; RV64IM-NEXT: add a5, a5, a6
; RV64IM-NEXT: srli a2, a5, 63
; RV64IM-NEXT: srai a5, a5, 6
; RV64IM-NEXT: add a2, a5, a2
; RV64IM-NEXT: mul a5, a2, t0
-; RV64IM-NEXT: sub a5, a7, a5
+; RV64IM-NEXT: sub a5, a6, a5
; RV64IM-NEXT: add a2, a5, a2
-; RV64IM-NEXT: add a3, a3, a4
+; RV64IM-NEXT: add a3, a4, a3
; RV64IM-NEXT: add a1, t2, a1
; RV64IM-NEXT: add a4, t1, t3
; RV64IM-NEXT: sh a4, 6(a0)
;
; RV32IM-LABEL: fold_urem_vec_2:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lhu a7, 12(a1)
-; RV32IM-NEXT: lhu a3, 8(a1)
+; RV32IM-NEXT: lhu a6, 12(a1)
+; RV32IM-NEXT: lhu a7, 8(a1)
; RV32IM-NEXT: lhu a4, 0(a1)
; RV32IM-NEXT: lhu a1, 4(a1)
; RV32IM-NEXT: lui a5, 364242
-; RV32IM-NEXT: addi a6, a5, 777
-; RV32IM-NEXT: mulhu a5, a4, a6
-; RV32IM-NEXT: sub a2, a4, a5
-; RV32IM-NEXT: srli a2, a2, 1
-; RV32IM-NEXT: add a2, a2, a5
+; RV32IM-NEXT: addi a5, a5, 777
+; RV32IM-NEXT: mulhu a2, a4, a5
+; RV32IM-NEXT: sub a3, a4, a2
+; RV32IM-NEXT: srli a3, a3, 1
+; RV32IM-NEXT: add a2, a3, a2
; RV32IM-NEXT: srli a2, a2, 6
-; RV32IM-NEXT: addi a5, zero, 95
-; RV32IM-NEXT: mul a2, a2, a5
+; RV32IM-NEXT: addi a3, zero, 95
+; RV32IM-NEXT: mul a2, a2, a3
; RV32IM-NEXT: sub t0, a4, a2
-; RV32IM-NEXT: mulhu a4, a1, a6
+; RV32IM-NEXT: mulhu a4, a1, a5
; RV32IM-NEXT: sub a2, a1, a4
; RV32IM-NEXT: srli a2, a2, 1
; RV32IM-NEXT: add a2, a2, a4
; RV32IM-NEXT: srli a2, a2, 6
-; RV32IM-NEXT: mul a2, a2, a5
+; RV32IM-NEXT: mul a2, a2, a3
; RV32IM-NEXT: sub a1, a1, a2
-; RV32IM-NEXT: mulhu a2, a3, a6
-; RV32IM-NEXT: sub a4, a3, a2
+; RV32IM-NEXT: mulhu a2, a7, a5
+; RV32IM-NEXT: sub a4, a7, a2
; RV32IM-NEXT: srli a4, a4, 1
; RV32IM-NEXT: add a2, a4, a2
; RV32IM-NEXT: srli a2, a2, 6
-; RV32IM-NEXT: mul a2, a2, a5
-; RV32IM-NEXT: sub a2, a3, a2
-; RV32IM-NEXT: mulhu a3, a7, a6
-; RV32IM-NEXT: sub a4, a7, a3
-; RV32IM-NEXT: srli a4, a4, 1
-; RV32IM-NEXT: add a3, a4, a3
-; RV32IM-NEXT: srli a3, a3, 6
-; RV32IM-NEXT: mul a3, a3, a5
-; RV32IM-NEXT: sub a3, a7, a3
+; RV32IM-NEXT: mul a2, a2, a3
+; RV32IM-NEXT: sub a2, a7, a2
+; RV32IM-NEXT: mulhu a4, a6, a5
+; RV32IM-NEXT: sub a5, a6, a4
+; RV32IM-NEXT: srli a5, a5, 1
+; RV32IM-NEXT: add a4, a5, a4
+; RV32IM-NEXT: srli a4, a4, 6
+; RV32IM-NEXT: mul a3, a4, a3
+; RV32IM-NEXT: sub a3, a6, a3
; RV32IM-NEXT: sh a3, 6(a0)
; RV32IM-NEXT: sh a2, 4(a0)
; RV32IM-NEXT: sh a1, 2(a0)
;
; RV64IM-LABEL: fold_urem_vec_2:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lhu a7, 24(a1)
-; RV64IM-NEXT: lhu a3, 16(a1)
+; RV64IM-NEXT: lhu a6, 24(a1)
+; RV64IM-NEXT: lhu a7, 16(a1)
; RV64IM-NEXT: lhu a4, 8(a1)
; RV64IM-NEXT: lhu a1, 0(a1)
; RV64IM-NEXT: lui a5, 1423
; RV64IM-NEXT: slli a5, a5, 13
; RV64IM-NEXT: addi a5, a5, -1811
; RV64IM-NEXT: slli a5, a5, 12
-; RV64IM-NEXT: addi a6, a5, 561
-; RV64IM-NEXT: mulhu a5, a1, a6
-; RV64IM-NEXT: sub a2, a1, a5
-; RV64IM-NEXT: srli a2, a2, 1
-; RV64IM-NEXT: add a2, a2, a5
+; RV64IM-NEXT: addi a5, a5, 561
+; RV64IM-NEXT: mulhu a2, a1, a5
+; RV64IM-NEXT: sub a3, a1, a2
+; RV64IM-NEXT: srli a3, a3, 1
+; RV64IM-NEXT: add a2, a3, a2
; RV64IM-NEXT: srli a2, a2, 6
-; RV64IM-NEXT: addi a5, zero, 95
-; RV64IM-NEXT: mul a2, a2, a5
+; RV64IM-NEXT: addi a3, zero, 95
+; RV64IM-NEXT: mul a2, a2, a3
; RV64IM-NEXT: sub t0, a1, a2
-; RV64IM-NEXT: mulhu a2, a4, a6
+; RV64IM-NEXT: mulhu a2, a4, a5
; RV64IM-NEXT: sub a1, a4, a2
; RV64IM-NEXT: srli a1, a1, 1
; RV64IM-NEXT: add a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 6
-; RV64IM-NEXT: mul a1, a1, a5
+; RV64IM-NEXT: mul a1, a1, a3
; RV64IM-NEXT: sub a1, a4, a1
-; RV64IM-NEXT: mulhu a2, a3, a6
-; RV64IM-NEXT: sub a4, a3, a2
+; RV64IM-NEXT: mulhu a2, a7, a5
+; RV64IM-NEXT: sub a4, a7, a2
; RV64IM-NEXT: srli a4, a4, 1
; RV64IM-NEXT: add a2, a4, a2
; RV64IM-NEXT: srli a2, a2, 6
-; RV64IM-NEXT: mul a2, a2, a5
-; RV64IM-NEXT: sub a2, a3, a2
-; RV64IM-NEXT: mulhu a3, a7, a6
-; RV64IM-NEXT: sub a4, a7, a3
-; RV64IM-NEXT: srli a4, a4, 1
-; RV64IM-NEXT: add a3, a4, a3
-; RV64IM-NEXT: srli a3, a3, 6
-; RV64IM-NEXT: mul a3, a3, a5
-; RV64IM-NEXT: sub a3, a7, a3
+; RV64IM-NEXT: mul a2, a2, a3
+; RV64IM-NEXT: sub a2, a7, a2
+; RV64IM-NEXT: mulhu a4, a6, a5
+; RV64IM-NEXT: sub a5, a6, a4
+; RV64IM-NEXT: srli a5, a5, 1
+; RV64IM-NEXT: add a4, a5, a4
+; RV64IM-NEXT: srli a4, a4, 6
+; RV64IM-NEXT: mul a3, a4, a3
+; RV64IM-NEXT: sub a3, a6, a3
; RV64IM-NEXT: sh a3, 6(a0)
; RV64IM-NEXT: sh a2, 4(a0)
; RV64IM-NEXT: sh a1, 2(a0)
;
; RV32IM-LABEL: combine_urem_udiv:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lhu a7, 0(a1)
-; RV32IM-NEXT: lhu a3, 4(a1)
+; RV32IM-NEXT: lhu a6, 0(a1)
+; RV32IM-NEXT: lhu a7, 4(a1)
; RV32IM-NEXT: lhu a4, 12(a1)
; RV32IM-NEXT: lhu a1, 8(a1)
; RV32IM-NEXT: lui a5, 364242
-; RV32IM-NEXT: addi a6, a5, 777
-; RV32IM-NEXT: mulhu a5, a4, a6
-; RV32IM-NEXT: sub a2, a4, a5
-; RV32IM-NEXT: srli a2, a2, 1
-; RV32IM-NEXT: add a2, a2, a5
+; RV32IM-NEXT: addi a5, a5, 777
+; RV32IM-NEXT: mulhu a2, a4, a5
+; RV32IM-NEXT: sub a3, a4, a2
+; RV32IM-NEXT: srli a3, a3, 1
+; RV32IM-NEXT: add a2, a3, a2
; RV32IM-NEXT: srli t3, a2, 6
; RV32IM-NEXT: addi t0, zero, 95
-; RV32IM-NEXT: mul a5, t3, t0
-; RV32IM-NEXT: sub t1, a4, a5
-; RV32IM-NEXT: mulhu a5, a1, a6
-; RV32IM-NEXT: sub a4, a1, a5
-; RV32IM-NEXT: srli a4, a4, 1
-; RV32IM-NEXT: add a4, a4, a5
-; RV32IM-NEXT: srli a4, a4, 6
-; RV32IM-NEXT: mul a5, a4, t0
-; RV32IM-NEXT: sub t2, a1, a5
-; RV32IM-NEXT: mulhu a5, a3, a6
-; RV32IM-NEXT: sub a1, a3, a5
+; RV32IM-NEXT: mul a3, t3, t0
+; RV32IM-NEXT: sub t1, a4, a3
+; RV32IM-NEXT: mulhu a4, a1, a5
+; RV32IM-NEXT: sub a3, a1, a4
+; RV32IM-NEXT: srli a3, a3, 1
+; RV32IM-NEXT: add a3, a3, a4
+; RV32IM-NEXT: srli a3, a3, 6
+; RV32IM-NEXT: mul a4, a3, t0
+; RV32IM-NEXT: sub t2, a1, a4
+; RV32IM-NEXT: mulhu a4, a7, a5
+; RV32IM-NEXT: sub a1, a7, a4
; RV32IM-NEXT: srli a1, a1, 1
-; RV32IM-NEXT: add a1, a1, a5
+; RV32IM-NEXT: add a1, a1, a4
; RV32IM-NEXT: srli a1, a1, 6
-; RV32IM-NEXT: mul a5, a1, t0
-; RV32IM-NEXT: sub a3, a3, a5
-; RV32IM-NEXT: mulhu a5, a7, a6
-; RV32IM-NEXT: sub a2, a7, a5
+; RV32IM-NEXT: mul a4, a1, t0
+; RV32IM-NEXT: sub a4, a7, a4
+; RV32IM-NEXT: mulhu a5, a6, a5
+; RV32IM-NEXT: sub a2, a6, a5
; RV32IM-NEXT: srli a2, a2, 1
; RV32IM-NEXT: add a2, a2, a5
; RV32IM-NEXT: srli a2, a2, 6
; RV32IM-NEXT: mul a5, a2, t0
-; RV32IM-NEXT: sub a5, a7, a5
+; RV32IM-NEXT: sub a5, a6, a5
; RV32IM-NEXT: add a2, a5, a2
-; RV32IM-NEXT: add a1, a3, a1
-; RV32IM-NEXT: add a3, t2, a4
+; RV32IM-NEXT: add a1, a4, a1
+; RV32IM-NEXT: add a3, t2, a3
; RV32IM-NEXT: add a4, t1, t3
; RV32IM-NEXT: sh a4, 6(a0)
; RV32IM-NEXT: sh a3, 4(a0)
;
; RV64IM-LABEL: combine_urem_udiv:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lhu a7, 0(a1)
-; RV64IM-NEXT: lhu a3, 8(a1)
+; RV64IM-NEXT: lhu a6, 0(a1)
+; RV64IM-NEXT: lhu a7, 8(a1)
; RV64IM-NEXT: lhu a4, 16(a1)
; RV64IM-NEXT: lhu a1, 24(a1)
; RV64IM-NEXT: lui a5, 1423
; RV64IM-NEXT: slli a5, a5, 13
; RV64IM-NEXT: addi a5, a5, -1811
; RV64IM-NEXT: slli a5, a5, 12
-; RV64IM-NEXT: addi a6, a5, 561
-; RV64IM-NEXT: mulhu a5, a1, a6
-; RV64IM-NEXT: sub a2, a1, a5
-; RV64IM-NEXT: srli a2, a2, 1
-; RV64IM-NEXT: add a2, a2, a5
+; RV64IM-NEXT: addi a5, a5, 561
+; RV64IM-NEXT: mulhu a2, a1, a5
+; RV64IM-NEXT: sub a3, a1, a2
+; RV64IM-NEXT: srli a3, a3, 1
+; RV64IM-NEXT: add a2, a3, a2
; RV64IM-NEXT: srli t3, a2, 6
; RV64IM-NEXT: addi t0, zero, 95
-; RV64IM-NEXT: mul a5, t3, t0
-; RV64IM-NEXT: sub t1, a1, a5
-; RV64IM-NEXT: mulhu a5, a4, a6
-; RV64IM-NEXT: sub a1, a4, a5
+; RV64IM-NEXT: mul a3, t3, t0
+; RV64IM-NEXT: sub t1, a1, a3
+; RV64IM-NEXT: mulhu a3, a4, a5
+; RV64IM-NEXT: sub a1, a4, a3
; RV64IM-NEXT: srli a1, a1, 1
-; RV64IM-NEXT: add a1, a1, a5
+; RV64IM-NEXT: add a1, a1, a3
; RV64IM-NEXT: srli a1, a1, 6
-; RV64IM-NEXT: mul a5, a1, t0
-; RV64IM-NEXT: sub t2, a4, a5
-; RV64IM-NEXT: mulhu a5, a3, a6
-; RV64IM-NEXT: sub a4, a3, a5
-; RV64IM-NEXT: srli a4, a4, 1
-; RV64IM-NEXT: add a4, a4, a5
-; RV64IM-NEXT: srli a4, a4, 6
-; RV64IM-NEXT: mul a5, a4, t0
-; RV64IM-NEXT: sub a3, a3, a5
-; RV64IM-NEXT: mulhu a5, a7, a6
-; RV64IM-NEXT: sub a2, a7, a5
+; RV64IM-NEXT: mul a3, a1, t0
+; RV64IM-NEXT: sub t2, a4, a3
+; RV64IM-NEXT: mulhu a4, a7, a5
+; RV64IM-NEXT: sub a3, a7, a4
+; RV64IM-NEXT: srli a3, a3, 1
+; RV64IM-NEXT: add a3, a3, a4
+; RV64IM-NEXT: srli a3, a3, 6
+; RV64IM-NEXT: mul a4, a3, t0
+; RV64IM-NEXT: sub a4, a7, a4
+; RV64IM-NEXT: mulhu a5, a6, a5
+; RV64IM-NEXT: sub a2, a6, a5
; RV64IM-NEXT: srli a2, a2, 1
; RV64IM-NEXT: add a2, a2, a5
; RV64IM-NEXT: srli a2, a2, 6
; RV64IM-NEXT: mul a5, a2, t0
-; RV64IM-NEXT: sub a5, a7, a5
+; RV64IM-NEXT: sub a5, a6, a5
; RV64IM-NEXT: add a2, a5, a2
-; RV64IM-NEXT: add a3, a3, a4
+; RV64IM-NEXT: add a3, a4, a3
; RV64IM-NEXT: add a1, t2, a1
; RV64IM-NEXT: add a4, t1, t3
; RV64IM-NEXT: sh a4, 6(a0)
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_GREEDY
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_BASIC
%struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* }
%struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
; CHECK: sub sp, #
; CHECK: mov r[[R0:[0-9]+]], sp
; CHECK: str r{{[0-9+]}}, [r[[R0]]
-; CHECK: str r{{[0-9+]}}, [r[[R0]]
+; RA_GREEDY: str r{{[0-9+]}}, [r[[R0]]
+; RA_BASIC: stm r[[R0]]!
; CHECK-NOT: ldr r0, [sp
; CHECK: mov r[[R1:[0-9]+]], sp
; CHECK: subs r[[R2:[0-9]+]], r[[R1]], r{{[0-9]+}}
define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
; ENABLED-LABEL: check_option:
; ENABLED: @ %bb.0: @ %entry
-; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; ENABLED-NEXT: cmp r3, #1
; ENABLED-NEXT: blt .LBB0_4
; ENABLED-NEXT: @ %bb.1: @ %vector.ph.preheader
; ENABLED-NEXT: letp lr, .LBB0_3
; ENABLED-NEXT: b .LBB0_2
; ENABLED-NEXT: .LBB0_4: @ %for.cond.cleanup
-; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
;
; DISABLED-LABEL: check_option:
; DISABLED: @ %bb.0: @ %entry
-; DISABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; DISABLED-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; DISABLED-NEXT: cmp r3, #1
; DISABLED-NEXT: blt .LBB0_4
; DISABLED-NEXT: @ %bb.1: @ %vector.ph.preheader
; DISABLED-NEXT: .LBB0_2: @ %vector.ph
; DISABLED-NEXT: @ =>This Loop Header: Depth=1
; DISABLED-NEXT: @ Child Loop BB0_3 Depth 2
-; DISABLED-NEXT: mov r9, r8
+; DISABLED-NEXT: mov r7, r8
; DISABLED-NEXT: mov r12, r0
; DISABLED-NEXT: mov r4, r2
; DISABLED-NEXT: mov r5, r1
; DISABLED-NEXT: .LBB0_3: @ %vector.body
; DISABLED-NEXT: @ Parent Loop BB0_2 Depth=1
; DISABLED-NEXT: @ => This Inner Loop Header: Depth=2
-; DISABLED-NEXT: mov lr, r9
+; DISABLED-NEXT: mov lr, r7
; DISABLED-NEXT: vctp.32 r6
-; DISABLED-NEXT: sub.w r9, r9, #1
+; DISABLED-NEXT: subs r7, #1
; DISABLED-NEXT: subs r6, #4
; DISABLED-NEXT: vpstt
; DISABLED-NEXT: vldrwt.u32 q0, [r5], #16
; DISABLED-NEXT: le lr, .LBB0_3
; DISABLED-NEXT: b .LBB0_2
; DISABLED-NEXT: .LBB0_4: @ %for.cond.cleanup
-; DISABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+; DISABLED-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr {
; ENABLED-LABEL: varying_outer_2d_reduction:
; ENABLED: @ %bb.0: @ %entry
-; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; ENABLED-NEXT: sub sp, #4
; ENABLED-NEXT: cmp r3, #1
+; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill
; ENABLED-NEXT: blt .LBB0_8
; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph
-; ENABLED-NEXT: mov r11, r0
-; ENABLED-NEXT: ldr r0, [sp, #32]
-; ENABLED-NEXT: add.w r9, r2, #3
-; ENABLED-NEXT: mov.w r12, #0
-; ENABLED-NEXT: mov r10, r11
+; ENABLED-NEXT: ldr r0, [sp, #36]
+; ENABLED-NEXT: add.w r12, r2, #3
+; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload
+; ENABLED-NEXT: mov.w r8, #0
+; ENABLED-NEXT: mov r9, r12
; ENABLED-NEXT: uxth r0, r0
; ENABLED-NEXT: rsbs r5, r0, #0
; ENABLED-NEXT: b .LBB0_4
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: lsrs r0, r0, #16
; ENABLED-NEXT: sub.w r9, r9, #1
-; ENABLED-NEXT: strh.w r0, [r1, r12, lsl #1]
-; ENABLED-NEXT: add.w r12, r12, #1
+; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1]
+; ENABLED-NEXT: add.w r8, r8, #1
; ENABLED-NEXT: add.w r10, r10, #2
-; ENABLED-NEXT: cmp r12, r3
+; ENABLED-NEXT: cmp r8, r3
; ENABLED-NEXT: beq .LBB0_8
; ENABLED-NEXT: .LBB0_4: @ %for.body
; ENABLED-NEXT: @ =>This Loop Header: Depth=1
; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2
-; ENABLED-NEXT: cmp r2, r12
+; ENABLED-NEXT: cmp r2, r8
; ENABLED-NEXT: ble .LBB0_2
; ENABLED-NEXT: @ %bb.5: @ %vector.ph
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: bic r0, r9, #3
; ENABLED-NEXT: movs r7, #1
; ENABLED-NEXT: subs r0, #4
-; ENABLED-NEXT: sub.w r4, r2, r12
+; ENABLED-NEXT: sub.w r4, r2, r8
; ENABLED-NEXT: vmov.i32 q1, #0x0
; ENABLED-NEXT: add.w r6, r7, r0, lsr #2
-; ENABLED-NEXT: adds r0, r2, #3
-; ENABLED-NEXT: sub.w r0, r0, r12
+; ENABLED-NEXT: sub.w r0, r12, r8
; ENABLED-NEXT: bic r0, r0, #3
; ENABLED-NEXT: subs r0, #4
; ENABLED-NEXT: add.w r0, r7, r0, lsr #2
; ENABLED-NEXT: mov r7, r10
; ENABLED-NEXT: dls lr, r0
-; ENABLED-NEXT: mov r0, r11
+; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload
; ENABLED-NEXT: .LBB0_6: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2
; ENABLED-NEXT: vaddv.u32 r0, q0
; ENABLED-NEXT: b .LBB0_3
; ENABLED-NEXT: .LBB0_8: @ %for.end17
-; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, pc}
+; ENABLED-NEXT: add sp, #4
+; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
;
; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
; NOREDUCTIONS: @ %bb.0: @ %entry
-; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; NOREDUCTIONS-NEXT: sub sp, #4
; NOREDUCTIONS-NEXT: cmp r3, #1
+; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill
; NOREDUCTIONS-NEXT: blt .LBB0_8
; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph
-; NOREDUCTIONS-NEXT: mov r11, r0
-; NOREDUCTIONS-NEXT: ldr r0, [sp, #32]
-; NOREDUCTIONS-NEXT: add.w r9, r2, #3
-; NOREDUCTIONS-NEXT: mov.w r12, #0
-; NOREDUCTIONS-NEXT: mov r10, r11
+; NOREDUCTIONS-NEXT: ldr r0, [sp, #36]
+; NOREDUCTIONS-NEXT: add.w r12, r2, #3
+; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload
+; NOREDUCTIONS-NEXT: mov.w r8, #0
+; NOREDUCTIONS-NEXT: mov r9, r12
; NOREDUCTIONS-NEXT: uxth r0, r0
; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
; NOREDUCTIONS-NEXT: b .LBB0_4
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: lsrs r0, r0, #16
; NOREDUCTIONS-NEXT: sub.w r9, r9, #1
-; NOREDUCTIONS-NEXT: strh.w r0, [r1, r12, lsl #1]
-; NOREDUCTIONS-NEXT: add.w r12, r12, #1
+; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1]
+; NOREDUCTIONS-NEXT: add.w r8, r8, #1
; NOREDUCTIONS-NEXT: add.w r10, r10, #2
-; NOREDUCTIONS-NEXT: cmp r12, r3
+; NOREDUCTIONS-NEXT: cmp r8, r3
; NOREDUCTIONS-NEXT: beq .LBB0_8
; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body
; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1
; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2
-; NOREDUCTIONS-NEXT: cmp r2, r12
+; NOREDUCTIONS-NEXT: cmp r2, r8
; NOREDUCTIONS-NEXT: ble .LBB0_2
; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: bic r0, r9, #3
; NOREDUCTIONS-NEXT: movs r7, #1
; NOREDUCTIONS-NEXT: subs r0, #4
-; NOREDUCTIONS-NEXT: sub.w r4, r2, r12
+; NOREDUCTIONS-NEXT: sub.w r4, r2, r8
; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0
; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2
-; NOREDUCTIONS-NEXT: adds r0, r2, #3
-; NOREDUCTIONS-NEXT: sub.w r0, r0, r12
+; NOREDUCTIONS-NEXT: sub.w r0, r12, r8
; NOREDUCTIONS-NEXT: bic r0, r0, #3
; NOREDUCTIONS-NEXT: subs r0, #4
; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2
; NOREDUCTIONS-NEXT: mov r7, r10
; NOREDUCTIONS-NEXT: dls lr, r0
-; NOREDUCTIONS-NEXT: mov r0, r11
+; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload
; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2
; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
; NOREDUCTIONS-NEXT: b .LBB0_3
; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17
-; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, pc}
+; NOREDUCTIONS-NEXT: add sp, #4
+; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
%conv = sext i16 %N to i32
%cmp36 = icmp sgt i16 %N, 0
; CHECK-NEXT: add r7, sp, #12
; CHECK-NEXT: .save {r8, r9, r10, r11}
; CHECK-NEXT: push.w {r8, r9, r10, r11}
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: .pad #12
+; CHECK-NEXT: sub sp, #12
; CHECK-NEXT: wls lr, r1, .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT: adds r4, r3, #4
+; CHECK-NEXT: mov r4, r2
+; CHECK-NEXT: adds r2, r3, #4
; CHECK-NEXT: add.w r9, r0, #4
; CHECK-NEXT: mvn r11, #1
; CHECK-NEXT: @ implicit-def: $r6
; CHECK-NEXT: @ implicit-def: $r12
-; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
+; CHECK-NEXT: str r4, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r9, #-4]
-; CHECK-NEXT: ldr.w r10, [r4]
+; CHECK-NEXT: ldr.w r10, [r2]
; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: muls r1, r3, r1
; CHECK-NEXT: adds.w r8, r1, #-2147483648
; CHECK-NEXT: asr.w r5, r1, #31
; CHECK-NEXT: adc r1, r5, #0
; CHECK-NEXT: mul r5, r10, r0
-; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: ldr.w r2, [r11, #4]
+; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: add.w r5, r5, #-2147483648
; CHECK-NEXT: asrl r8, r1, r5
; CHECK-NEXT: smull r4, r5, r10, r8
; CHECK-NEXT: mov r4, r5
; CHECK-NEXT: lsll r4, r1, r10
; CHECK-NEXT: lsll r4, r1, #30
-; CHECK-NEXT: ldrd r4, r8, [r11]
+; CHECK-NEXT: ldr.w r4, [r11]
; CHECK-NEXT: asrs r5, r1, #31
+; CHECK-NEXT: mov r8, r1
; CHECK-NEXT: muls r4, r6, r4
-; CHECK-NEXT: adds r2, r4, #2
-; CHECK-NEXT: mov r4, r1
-; CHECK-NEXT: lsll r4, r5, r2
-; CHECK-NEXT: add.w r1, r4, #-2147483648
+; CHECK-NEXT: adds r4, #2
+; CHECK-NEXT: lsll r8, r5, r4
; CHECK-NEXT: ldr r4, [r9], #4
; CHECK-NEXT: asr.w r5, r12, #31
+; CHECK-NEXT: add.w r8, r8, #-2147483648
; CHECK-NEXT: muls r4, r3, r4
; CHECK-NEXT: adds r3, #4
-; CHECK-NEXT: adds.w r2, r12, r4
+; CHECK-NEXT: adds.w r1, r12, r4
; CHECK-NEXT: adc.w r5, r5, r4, asr #31
-; CHECK-NEXT: smull r6, r4, r8, r6
-; CHECK-NEXT: adds.w r2, r2, #-2147483648
-; CHECK-NEXT: adc r2, r5, #0
-; CHECK-NEXT: asrs r5, r2, #31
-; CHECK-NEXT: subs r6, r2, r6
+; CHECK-NEXT: smull r6, r4, r2, r6
+; CHECK-NEXT: adds.w r1, r1, #-2147483648
+; CHECK-NEXT: adc r1, r5, #0
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: asrs r5, r1, #31
+; CHECK-NEXT: subs r6, r1, r6
; CHECK-NEXT: sbcs r5, r4
; CHECK-NEXT: adds.w r6, r6, #-2147483648
; CHECK-NEXT: adc r5, r5, #0
-; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: asrl r6, r5, r1
-; CHECK-NEXT: movs r1, #2
+; CHECK-NEXT: asrl r6, r5, r8
; CHECK-NEXT: lsrl r6, r5, #2
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: str r6, [r1]
-; CHECK-NEXT: ldr r1, [r11], #-4
-; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: mls r1, r1, r10, r2
+; CHECK-NEXT: movs r5, #2
+; CHECK-NEXT: str r6, [r5]
+; CHECK-NEXT: ldr r5, [r11], #-4
+; CHECK-NEXT: mls r1, r5, r10, r1
; CHECK-NEXT: adds.w r12, r1, #-2147483648
-; CHECK-NEXT: asr.w r2, r1, #31
-; CHECK-NEXT: adc r1, r2, #0
-; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT: asr.w r4, r1, #31
+; CHECK-NEXT: adc r1, r4, #0
+; CHECK-NEXT: ldrd r4, r0, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: lsrl r12, r1, #2
; CHECK-NEXT: rsb.w r1, r12, #0
-; CHECK-NEXT: str r1, [r2]
-; CHECK-NEXT: str r1, [r4, #-4]
-; CHECK-NEXT: adds r4, #4
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: str r1, [r4]
+; CHECK-NEXT: str r1, [r2, #-4]
+; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: .LBB2_3: @ %while.end
-; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: add sp, #12
; CHECK-NEXT: pop.w {r8, r9, r10, r11}
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
; CHECK-NEXT: ldrd r8, lr, [r7, #20]
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: cmp r5, #0
-; CHECK-NEXT: ldm.w r10, {r4, r6, r10}
-; CHECK-NEXT: ldrd r12, r9, [r7, #28]
+; CHECK-NEXT: ldm.w r10, {r4, r9, r10}
+; CHECK-NEXT: ldr.w r12, [r7, #28]
; CHECK-NEXT: ittt ne
; CHECK-NEXT: addne sp, #292
; CHECK-NEXT: popne.w {r8, r10, r11}
; CHECK-NEXT: @ %bb.3: @ %bb420
; CHECK-NEXT: movw r5, :lower16:(L_zz_hold$non_lazy_ptr-(LPC0_0+4))
; CHECK-NEXT: movt r5, :upper16:(L_zz_hold$non_lazy_ptr-(LPC0_0+4))
+; CHECK-NEXT: movw r11, :lower16:(L_zz_res$non_lazy_ptr-(LPC0_1+4))
; CHECK-NEXT: LPC0_0:
; CHECK-NEXT: add r5, pc
-; CHECK-NEXT: ldr.w r11, [r5]
-; CHECK-NEXT: str.w r11, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: movw r5, :lower16:(L_zz_res$non_lazy_ptr-(LPC0_1+4))
-; CHECK-NEXT: movt r5, :upper16:(L_zz_res$non_lazy_ptr-(LPC0_1+4))
+; CHECK-NEXT: movt r11, :upper16:(L_zz_res$non_lazy_ptr-(LPC0_1+4))
; CHECK-NEXT: LPC0_1:
-; CHECK-NEXT: add r5, pc
+; CHECK-NEXT: add r11, pc
; CHECK-NEXT: ldr r5, [r5]
; CHECK-NEXT: str r5, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT: ldr.w r5, [r11]
+; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: str r5, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: ldr r5, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: str.w r11, [r5]
; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: str.w r5, [r11]
-; CHECK-NEXT: ldr.w r11, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT: str.w r5, [r11]
-; CHECK-NEXT: ldr r5, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: str r5, [r6]
+; CHECK-NEXT: ldr r5, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: str r0, [r5]
-; CHECK-NEXT: stm.w sp, {r4, r6, r10}
+; CHECK-NEXT: ldr r0, [r7, #32]
+; CHECK-NEXT: stm.w sp, {r4, r9, r10}
; CHECK-NEXT: strd r8, lr, [sp, #12]
-; CHECK-NEXT: strd r12, r9, [sp, #20]
+; CHECK-NEXT: str.w r12, [sp, #20]
+; CHECK-NEXT: str r0, [sp, #24]
; CHECK-NEXT: bl _Manifest
; CHECK-NEXT: trap
; CHECK-NEXT: LBB0_4: @ %bb20
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #20
-; CHECK-NEXT: sub sp, #20
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: cmp r3, #8
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: blo.w .LBB16_12
; CHECK-NEXT: @ %bb.1: @ %entry
; CHECK-NEXT: lsrs.w r12, r3, #2
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: subs r1, r7, #2
; CHECK-NEXT: rsbs r7, r4, #0
-; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: add.w r7, r3, #16
+; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_5
; CHECK-NEXT: .LBB16_3: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: wls lr, r0, .LBB16_4
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_4: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #8
-; CHECK-NEXT: add.w r0, r6, r0, lsl #1
+; CHECK-NEXT: add.w r0, r5, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: .LBB16_5: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_7 Depth 2
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: ldrh.w lr, [r3, #14]
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
-; CHECK-NEXT: ldrh.w r10, [r3, #12]
+; CHECK-NEXT: ldrh.w r8, [r3, #12]
; CHECK-NEXT: ldrh r7, [r3, #10]
; CHECK-NEXT: ldrh r4, [r3, #8]
; CHECK-NEXT: ldrh r6, [r3, #6]
; CHECK-NEXT: ldrh.w r9, [r3, #4]
; CHECK-NEXT: ldrh.w r11, [r3, #2]
-; CHECK-NEXT: ldrh.w r8, [r3]
+; CHECK-NEXT: ldrh.w r10, [r3]
; CHECK-NEXT: vstrb.8 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q0, [r5]
-; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: adds r0, r5, #2
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmul.f16 q0, q0, r8
+; CHECK-NEXT: vmul.f16 q0, q0, r10
; CHECK-NEXT: adds r0, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r11
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
; CHECK-NEXT: add.w r0, r5, #10
; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
-; CHECK-NEXT: add.w r6, r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: add.w r0, r5, #14
; CHECK-NEXT: vfma.f16 q0, q1, r7
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
-; CHECK-NEXT: vfma.f16 q0, q1, r10
+; CHECK-NEXT: adds r5, #16
+; CHECK-NEXT: vfma.f16 q0, q1, r8
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vfma.f16 q0, q1, lr
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_8
; CHECK-NEXT: @ %bb.6: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
-; CHECK-NEXT: add.w r5, r3, #16
; CHECK-NEXT: dls lr, r0
+; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB16_7: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r0, [r5], #16
-; CHECK-NEXT: vldrw.u32 q1, [r6]
-; CHECK-NEXT: adds r4, r6, #2
+; CHECK-NEXT: ldrh r0, [r6], #16
+; CHECK-NEXT: vldrw.u32 q1, [r5]
+; CHECK-NEXT: adds r4, r5, #2
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r5, #-14]
-; CHECK-NEXT: adds r4, r6, #6
+; CHECK-NEXT: ldrh r0, [r6, #-14]
+; CHECK-NEXT: adds r4, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r5, #-12]
-; CHECK-NEXT: vldrw.u32 q1, [r6, #4]
+; CHECK-NEXT: ldrh r0, [r6, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r5, #-10]
-; CHECK-NEXT: add.w r4, r6, #10
+; CHECK-NEXT: ldrh r0, [r6, #-10]
+; CHECK-NEXT: add.w r4, r5, #10
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r5, #-8]
-; CHECK-NEXT: vldrw.u32 q1, [r6, #8]
+; CHECK-NEXT: ldrh r0, [r6, #-8]
+; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r5, #-6]
-; CHECK-NEXT: ldrh r4, [r5, #-2]
+; CHECK-NEXT: ldrh r0, [r6, #-6]
+; CHECK-NEXT: ldrh r4, [r6, #-2]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r5, #-4]
-; CHECK-NEXT: vldrw.u32 q1, [r6, #12]
+; CHECK-NEXT: ldrh r0, [r6, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: add.w r0, r6, #14
+; CHECK-NEXT: add.w r0, r5, #14
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: adds r6, #16
+; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_7
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: add.w r5, r3, #16
+; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: .LBB16_10: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r4, [r5], #2
+; CHECK-NEXT: ldrh r4, [r6], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_10
; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: add.w r6, r6, r0, lsl #1
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: add.w r5, r5, r0, lsl #1
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_12: @ %if.end
-; CHECK-NEXT: add sp, #20
+; CHECK-NEXT: add sp, #24
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: cmp r3, #8
; CHECK-NEXT: blo.w .LBB16_12
; CHECK-NEXT: @ %bb.1: @ %entry
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
; CHECK-NEXT: ldrh r6, [r0]
-; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: ldrd r7, r10, [r0, #4]
+; CHECK-NEXT: movs r5, #1
+; CHECK-NEXT: ldrd r4, r10, [r0, #4]
; CHECK-NEXT: sub.w r0, r6, #8
; CHECK-NEXT: add.w r3, r0, r0, lsr #29
; CHECK-NEXT: and r0, r0, #7
-; CHECK-NEXT: asrs r5, r3, #3
-; CHECK-NEXT: cmp r5, #1
+; CHECK-NEXT: asrs r7, r3, #3
+; CHECK-NEXT: cmp r7, #1
; CHECK-NEXT: it gt
-; CHECK-NEXT: asrgt r4, r3, #3
-; CHECK-NEXT: add.w r3, r7, r6, lsl #2
+; CHECK-NEXT: asrgt r5, r3, #3
+; CHECK-NEXT: add.w r3, r4, r6, lsl #2
; CHECK-NEXT: sub.w r9, r3, #4
; CHECK-NEXT: rsbs r3, r6, #0
-; CHECK-NEXT: str r4, [sp] @ 4-byte Spill
-; CHECK-NEXT: str r6, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r3, r10, #32
+; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_5
; CHECK-NEXT: .LBB16_3: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: ldrd r0, r9, [sp, #12] @ 8-byte Folded Reload
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
; CHECK-NEXT: wls lr, r0, .LBB16_4
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_4: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
-; CHECK-NEXT: add.w r0, r7, r0, lsl #2
-; CHECK-NEXT: add.w r7, r0, #16
+; CHECK-NEXT: add.w r0, r4, r0, lsl #2
+; CHECK-NEXT: add.w r4, r0, #16
; CHECK-NEXT: beq .LBB16_12
; CHECK-NEXT: .LBB16_5: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
; CHECK-NEXT: add.w lr, r10, #8
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
-; CHECK-NEXT: ldrd r3, r4, [r10]
+; CHECK-NEXT: ldrd r3, r7, [r10]
; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr}
; CHECK-NEXT: ldrd r11, r8, [r10, #24]
; CHECK-NEXT: vstrb.8 q0, [r9], #16
-; CHECK-NEXT: vldrw.u32 q0, [r7], #32
-; CHECK-NEXT: strd r9, r1, [sp, #16] @ 8-byte Folded Spill
-; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
+; CHECK-NEXT: vldrw.u32 q0, [r4], #32
+; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill
+; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
; CHECK-NEXT: vmul.f32 q0, q0, r3
-; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
-; CHECK-NEXT: vfma.f32 q0, q1, r4
-; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
+; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
+; CHECK-NEXT: vfma.f32 q0, q1, r7
+; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
; CHECK-NEXT: vfma.f32 q0, q6, r0
-; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
+; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
; CHECK-NEXT: vfma.f32 q0, q4, r5
-; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
; CHECK-NEXT: vfma.f32 q0, q5, r6
-; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vfma.f32 q0, q2, lr
-; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
; CHECK-NEXT: vfma.f32 q0, q3, r11
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: vfma.f32 q0, q1, r8
; CHECK-NEXT: blo .LBB16_8
; CHECK-NEXT: @ %bb.6: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
-; CHECK-NEXT: add.w r4, r10, #32
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: dls lr, r0
+; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: .LBB16_7: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldm.w r4, {r0, r3, r5, r6, r8, r11}
-; CHECK-NEXT: vldrw.u32 q1, [r7], #32
-; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
+; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
+; CHECK-NEXT: vldrw.u32 q1, [r4], #32
+; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
; CHECK-NEXT: vfma.f32 q0, q1, r0
-; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
-; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
-; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
+; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
+; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
; CHECK-NEXT: vfma.f32 q0, q1, r3
-; CHECK-NEXT: ldrd r9, r1, [r4, #24]
+; CHECK-NEXT: ldrd r9, r1, [r7, #24]
; CHECK-NEXT: vfma.f32 q0, q6, r5
-; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
; CHECK-NEXT: vfma.f32 q0, q4, r6
-; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
; CHECK-NEXT: vfma.f32 q0, q5, r8
-; CHECK-NEXT: adds r4, #32
+; CHECK-NEXT: adds r7, #32
; CHECK-NEXT: vfma.f32 q0, q2, r11
; CHECK-NEXT: vfma.f32 q0, q3, r9
; CHECK-NEXT: vfma.f32 q0, q1, r1
; CHECK-NEXT: le lr, .LBB16_7
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: add.w r4, r10, #32
+; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: mov r3, r7
+; CHECK-NEXT: mov r3, r4
; CHECK-NEXT: .LBB16_10: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldr r0, [r4], #4
+; CHECK-NEXT: ldr r0, [r7], #4
; CHECK-NEXT: vldrw.u32 q1, [r3], #4
; CHECK-NEXT: vfma.f32 q0, q1, r0
; CHECK-NEXT: le lr, .LBB16_10
; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: add.w r7, r7, r0, lsl #2
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r4, r0, lsl #2
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_12: @ %if.end
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: ldr r3, [r0, #4]
-; CHECK-NEXT: subs r3, #2
-; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
-; CHECK-NEXT: cmp r3, #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [r0, #4]
+; CHECK-NEXT: subs r1, #2
+; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB1_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr.w r12, [r0, #8]
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r11, r3, r12, lsl #2
-; CHECK-NEXT: add.w r6, r3, r12, lsl #3
-; CHECK-NEXT: lsl.w r10, r12, #3
+; CHECK-NEXT: add.w r7, r3, r12, lsl #3
+; CHECK-NEXT: lsl.w r9, r12, #3
; CHECK-NEXT: .LBB1_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
+; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: add.w r9, r4, #1
-; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: add.w r10, r4, #1
; CHECK-NEXT: mov r3, r11
-; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB1_3: @ %vector.body
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add.w r0, r2, r9, lsl #2
+; CHECK-NEXT: add.w r0, r2, r10, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
-; CHECK-NEXT: add r11, r10
+; CHECK-NEXT: add r11, r9
; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add r6, r10
+; CHECK-NEXT: add r7, r9
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vadd.f32 s2, s4, s6
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: add.w r0, r2, r4, lsl #2
; CHECK-NEXT: adds r4, #2
+; CHECK-NEXT: cmp r4, r1
; CHECK-NEXT: vstr s2, [r0]
-; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
-; CHECK-NEXT: cmp r4, r0
; CHECK-NEXT: blo .LBB1_2
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
+; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: subs r1, #3
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB2_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr.w r9, [r0, #8]
+; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: add.w r0, r9, r9, lsl #1
-; CHECK-NEXT: add.w r10, r1, r9, lsl #2
-; CHECK-NEXT: add.w r12, r1, r9, lsl #3
-; CHECK-NEXT: add.w r8, r1, r0, lsl #2
-; CHECK-NEXT: add.w r1, r9, #3
-; CHECK-NEXT: bic r1, r1, #3
-; CHECK-NEXT: lsl.w r11, r0, #2
-; CHECK-NEXT: subs r1, #4
-; CHECK-NEXT: add.w r1, r5, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r3, r3, lsl #1
+; CHECK-NEXT: add.w r9, r1, r3, lsl #2
+; CHECK-NEXT: add.w r12, r1, r3, lsl #3
+; CHECK-NEXT: adds r3, #3
+; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add.w r10, r1, r0, lsl #2
+; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: lsl.w r11, r0, #2
+; CHECK-NEXT: add.w r1, r5, r3, lsr #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
-; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: mov r3, r10
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r5, #2
+; CHECK-NEXT: adds r2, r5, #1
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: mov r4, r8
+; CHECK-NEXT: mov r4, r10
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: dlstp.32 lr, r9
+; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB2_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: vadd.f32 s10, s10, s11
-; CHECK-NEXT: adds r0, r5, #1
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s8, s8, s9
-; CHECK-NEXT: add r10, r11
+; CHECK-NEXT: add r9, r11
; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add.w r0, r2, r0, lsl #2
+; CHECK-NEXT: add.w r0, r1, r2, lsl #2
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: add r12, r11
; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add r8, r11
+; CHECK-NEXT: add r10, r11
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: add.w r0, r2, r5, lsl #2
-; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: adds r0, r5, #2
+; CHECK-NEXT: add.w r0, r1, r5, lsl #2
; CHECK-NEXT: adds r5, #3
-; CHECK-NEXT: add.w r0, r2, r0, lsl #2
+; CHECK-NEXT: vstr s4, [r0]
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: blo .LBB2_2
; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #40
+; CHECK-NEXT: sub sp, #40
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
-; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: subs r1, #4
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
-; CHECK-NEXT: blo .LBB3_5
+; CHECK-NEXT: blo.w .LBB3_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r2, [r0, #8]
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: add.w r0, r2, r2, lsl #1
; CHECK-NEXT: add.w r12, r1, r2, lsl #2
; CHECK-NEXT: add.w r8, r1, r2, lsl #3
-; CHECK-NEXT: add.w r10, r1, r2, lsl #4
-; CHECK-NEXT: add.w r9, r1, r0, lsl #2
+; CHECK-NEXT: add.w r9, r1, r2, lsl #4
+; CHECK-NEXT: add.w r11, r1, r0, lsl #2
; CHECK-NEXT: adds r0, r2, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w r0, r6, r0, lsr #2
-; CHECK-NEXT: strd r0, r2, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill
; CHECK-NEXT: lsls r0, r2, #4
-; CHECK-NEXT: ldrd r2, r7, [sp, #4] @ 8-byte Folded Reload
-; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
+; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB3_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #3
+; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT: adds r0, r6, #2
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT: adds r0, r6, #1
+; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r5, r9
-; CHECK-NEXT: mov r4, r10
+; CHECK-NEXT: mov r5, r11
+; CHECK-NEXT: mov r4, r9
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s12, s12, s13
-; CHECK-NEXT: adds r0, r6, #1
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vstr s12, [r0]
; CHECK-NEXT: add.w r0, r1, r6, lsl #2
+; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: adds r0, r6, #2
+; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: adds r0, r6, #3
-; CHECK-NEXT: adds r6, #4
+; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r12, r0
; CHECK-NEXT: add r8, r0
+; CHECK-NEXT: add r11, r0
; CHECK-NEXT: add r9, r0
-; CHECK-NEXT: add r10, r0
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: blo .LBB3_2
; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #40
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #5
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB4_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r1, [r0, #8]
-; CHECK-NEXT: ldr r3, [r0]
-; CHECK-NEXT: adds r0, r1, #3
+; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: adds r0, r3, #3
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r12, r3, r1, lsl #2
-; CHECK-NEXT: subs r3, r0, #4
+; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r1, #2
-; CHECK-NEXT: add.w r3, r0, r3, lsr #2
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r3, r1, r1, lsl #2
-; CHECK-NEXT: lsls r3, r3, #2
-; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
+; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: add.w r1, r0, r1, lsr #2
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: add.w r1, r3, r3, lsl #2
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB4_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
-; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: add.w r10, r0, #2
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: add.w r11, r0, #1
-; CHECK-NEXT: mov r3, r12
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: dlstp.32 lr, r1
+; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB4_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q5, [r4], #16
; CHECK-NEXT: vldrw.u32 q6, [r3], #16
; CHECK-NEXT: vfma.f32 q3, q6, q5
-; CHECK-NEXT: add.w r10, r9, r5
+; CHECK-NEXT: add.w r12, r9, r5
; CHECK-NEXT: vldrw.u32 q6, [r9]
; CHECK-NEXT: vfma.f32 q4, q6, q5
-; CHECK-NEXT: add.w r6, r10, r5
-; CHECK-NEXT: vldrw.u32 q6, [r10]
+; CHECK-NEXT: add.w r6, r12, r5
+; CHECK-NEXT: vldrw.u32 q6, [r12]
; CHECK-NEXT: vfma.f32 q2, q6, q5
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vldrw.u32 q6, [r6]
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: vadd.f32 s18, s18, s19
-; CHECK-NEXT: add.w r3, r2, r11, lsl #2
+; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s1, s16, s18
-; CHECK-NEXT: vadd.f32 s12, s12, s14
; CHECK-NEXT: vadd.f32 s2, s2, s3
+; CHECK-NEXT: vadd.f32 s12, s12, s14
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vadd.f32 s6, s8, s10
-; CHECK-NEXT: vstr s1, [r3]
-; CHECK-NEXT: add.w r3, r2, r0, lsl #2
-; CHECK-NEXT: vstr s12, [r3]
-; CHECK-NEXT: adds r3, r0, #2
+; CHECK-NEXT: vstr s1, [r1]
+; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: add.w r3, r2, r3, lsl #2
-; CHECK-NEXT: vstr s6, [r3]
-; CHECK-NEXT: adds r3, r0, #3
-; CHECK-NEXT: add.w r3, r2, r3, lsl #2
-; CHECK-NEXT: vstr s0, [r3]
-; CHECK-NEXT: adds r3, r0, #4
; CHECK-NEXT: adds r0, #5
-; CHECK-NEXT: add.w r3, r2, r3, lsl #2
-; CHECK-NEXT: vstr s4, [r3]
-; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
-; CHECK-NEXT: add r12, r3
-; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: cmp r0, r3
-; CHECK-NEXT: blo .LBB4_2
+; CHECK-NEXT: vstr s12, [r1]
+; CHECK-NEXT: add.w r1, r2, r10, lsl #2
+; CHECK-NEXT: vstr s6, [r1]
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add r8, r1
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: blo.w .LBB4_2
; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #6
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB5_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr.w r9, [r0, #8]
+; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: add.w r0, r9, #3
+; CHECK-NEXT: adds r0, r3, #3
+; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r12, r1, r9, lsl #2
+; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsl.w r5, r9, #2
+; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r9, r9, lsl #1
+; CHECK-NEXT: add.w r1, r3, r3, lsl #1
; CHECK-NEXT: lsls r1, r1, #3
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB5_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: add.w r11, r0, #2
+; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #1
-; CHECK-NEXT: mov r3, r12
+; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q5, q1
; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: dlstp.32 lr, r9
+; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB5_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: add.w r10, r3, r5
+; CHECK-NEXT: add.w r12, r3, r5
; CHECK-NEXT: vldrw.u32 q6, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [r3], #16
; CHECK-NEXT: vfma.f32 q4, q7, q6
-; CHECK-NEXT: add.w r11, r10, r5
-; CHECK-NEXT: vldrw.u32 q7, [r10]
+; CHECK-NEXT: add.w r10, r12, r5
+; CHECK-NEXT: vldrw.u32 q7, [r12]
; CHECK-NEXT: vfma.f32 q5, q7, q6
-; CHECK-NEXT: add.w r6, r11, r5
-; CHECK-NEXT: vldrw.u32 q7, [r11]
+; CHECK-NEXT: add.w r6, r10, r5
+; CHECK-NEXT: vldrw.u32 q7, [r10]
; CHECK-NEXT: vfma.f32 q2, q7, q6
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vldrw.u32 q7, [r6]
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: vstr s3, [r1]
-; CHECK-NEXT: adds r1, r0, #2
+; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vadd.f32 s4, s4, s6
-; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s6, s12, s14
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: adds r0, #6
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: add r12, r1
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: add r8, r1
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB5_2
; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #48
-; CHECK-NEXT: sub sp, #48
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: .pad #72
+; CHECK-NEXT: sub sp, #72
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #7
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB6_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr.w r10, [r0, #8]
+; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: add.w r0, r10, #3
+; CHECK-NEXT: adds r0, r3, #3
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r8, r1, r10, lsl #2
+; CHECK-NEXT: add.w r9, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsl.w r5, r10, #2
+; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: rsb r1, r10, r10, lsl #3
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: rsb r1, r3, r3, lsl #3
; CHECK-NEXT: lsls r1, r1, #2
-; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB6_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: vmov.i32 q2, #0x0
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #2
-; CHECK-NEXT: add.w r12, r0, #1
-; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: add.w r8, r0, #1
+; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vmov q5, q2
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov q6, q2
; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: mov r9, r10
-; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r7
+; CHECK-NEXT: mov r12, r7
+; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB6_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: add.w r11, r3, r5
-; CHECK-NEXT: vctp.32 r9
+; CHECK-NEXT: add.w r10, r3, r5
+; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
; CHECK-NEXT: vfmat.f32 q5, q0, q7
-; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: add.w r11, r10, r5
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q0, [r11]
+; CHECK-NEXT: vldrwt.u32 q0, [r10]
; CHECK-NEXT: vfmat.f32 q6, q0, q7
-; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vldrwt.u32 q0, [r11]
; CHECK-NEXT: vfmat.f32 q1, q0, q7
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q4
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q1, q0, q7
-; CHECK-NEXT: adds r6, r7, r5
-; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill
; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vmov q4, q5
; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: sub.w r9, r9, #4
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vfmat.f32 q3, q0, q7
-; CHECK-NEXT: adds r6, r7, r5
+; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstttt
-; CHECK-NEXT: vldrwt.u32 q0, [r7]
-; CHECK-NEXT: vfmat.f32 q4, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vfmat.f32 q4, q0, q7
+; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vfmat.f32 q2, q0, q7
; CHECK-NEXT: le lr, .LBB6_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s26, s27
-; CHECK-NEXT: add.w r1, r2, r12, lsl #2
+; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s2, s24, s25
; CHECK-NEXT: vadd.f32 s1, s22, s23
; CHECK-NEXT: vadd.f32 s3, s20, s21
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
+; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s9, s18, s19
; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vadd.f32 s0, s2, s0
+; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
+; CHECK-NEXT: vadd.f32 s2, s3, s1
; CHECK-NEXT: vadd.f32 s5, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
-; CHECK-NEXT: vadd.f32 s2, s3, s1
; CHECK-NEXT: vadd.f32 s4, s4, s6
-; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
-; CHECK-NEXT: vadd.f32 s8, s8, s10
-; CHECK-NEXT: vadd.f32 s6, s7, s5
+; CHECK-NEXT: vadd.f32 s14, s14, s15
+; CHECK-NEXT: adds r0, #7
+; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
-; CHECK-NEXT: vadd.f32 s10, s11, s9
+; CHECK-NEXT: vadd.f32 s8, s8, s10
+; CHECK-NEXT: vadd.f32 s6, s7, s5
; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: vadd.f32 s10, s11, s9
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s12, s12, s14
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s10, [r1]
-; CHECK-NEXT: adds r1, r0, #6
-; CHECK-NEXT: adds r0, #7
+; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: add r8, r1
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add r9, r1
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB6_2
; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #48
+; CHECK-NEXT: add sp, #72
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #64
-; CHECK-NEXT: sub sp, #64
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: .pad #88
+; CHECK-NEXT: sub sp, #88
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #8
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB7_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr.w r11, [r0, #8]
+; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: add.w r0, r11, #3
+; CHECK-NEXT: adds r0, r3, #3
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r9, r1, r11, lsl #2
+; CHECK-NEXT: add.w r12, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsl.w r5, r11, #2
+; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: lsls r1, r3, #5
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: lsl.w r1, r11, #5
-; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB7_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #7
+; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q3, #0x0
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #3
-; CHECK-NEXT: add.w r12, r0, #2
-; CHECK-NEXT: add.w r8, r0, #1
-; CHECK-NEXT: mov r3, r9
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: add.w r8, r0, #2
+; CHECK-NEXT: adds r1, r0, #1
+; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q6, q3
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q7, q3
; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: mov r10, r11
-; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r7
+; CHECK-NEXT: mov r10, r7
+; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB7_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: adds r6, r3, r5
+; CHECK-NEXT: add.w r11, r3, r5
; CHECK-NEXT: vctp.32 r10
; CHECK-NEXT: vpsttt
-; CHECK-NEXT: vldrwt.u32 q0, [r1], #16
+; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vfmat.f32 q6, q1, q0
-; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q1, [r6]
+; CHECK-NEXT: vldrwt.u32 q1, [r11]
; CHECK-NEXT: vfmat.f32 q7, q1, q0
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vmov q4, q2
-; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: adds r6, r7, r5
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q1, [r6]
-; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: adds r6, r7, r5
+; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q3, q5
; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: adds r6, r7, r5
+; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: sub.w r10, r10, #4
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstttt
-; CHECK-NEXT: vldrwt.u32 q1, [r6]
-; CHECK-NEXT: vfmat.f32 q4, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q5, q1, q0
-; CHECK-NEXT: adds r6, r7, r5
+; CHECK-NEXT: add r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q3, q1, q0
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s30, s31
-; CHECK-NEXT: add.w r1, r2, r8, lsl #2
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vadd.f32 s2, s28, s29
; CHECK-NEXT: vadd.f32 s4, s26, s27
; CHECK-NEXT: vadd.f32 s6, s24, s25
; CHECK-NEXT: vadd.f32 s5, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s9, s18, s19
; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s13, s18, s19
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s3, s20, s21
+; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vstr s2, [r1]
-; CHECK-NEXT: add.w r1, r2, r12, lsl #2
+; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s12, s7, s5
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vstr s14, [r1]
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s4, s3, s1
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: adds r1, r0, #7
-; CHECK-NEXT: adds r0, #8
+; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: add r9, r1
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add r12, r1
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB7_2
; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #64
+; CHECK-NEXT: add sp, #88
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: adcq %rdx, 8(%rdi)
; CHECK-NEXT: movq %rax, %rdx
; CHECK-NEXT: adcq %rcx, %rdx
-; CHECK-NEXT: movq 24(%rdi), %r14
-; CHECK-NEXT: leaq (%r8,%r14), %r11
+; CHECK-NEXT: movq 24(%rdi), %r11
+; CHECK-NEXT: leaq (%r8,%r11), %r14
; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: cmpq %r10, %rdx
; CHECK-NEXT: setb %bl
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: adcq %r11, %rbx
-; CHECK-NEXT: movq 32(%rdi), %rcx
-; CHECK-NEXT: leaq (%r9,%rcx), %r10
+; CHECK-NEXT: adcq %r14, %rbx
+; CHECK-NEXT: movq 32(%rdi), %r10
+; CHECK-NEXT: leaq (%r9,%r10), %rcx
; CHECK-NEXT: xorl %esi, %esi
-; CHECK-NEXT: cmpq %r11, %rbx
+; CHECK-NEXT: cmpq %r14, %rbx
; CHECK-NEXT: setb %sil
-; CHECK-NEXT: addq %r14, %r8
-; CHECK-NEXT: adcq %r10, %rsi
+; CHECK-NEXT: addq %r11, %r8
+; CHECK-NEXT: adcq %rcx, %rsi
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: cmpq %r10, %rsi
+; CHECK-NEXT: cmpq %rcx, %rsi
; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq %rcx, %r9
+; CHECK-NEXT: addq %r10, %r9
; CHECK-NEXT: movq %rdx, 16(%rdi)
; CHECK-NEXT: movq %rbx, 24(%rdi)
; CHECK-NEXT: movq %rsi, 32(%rdi)
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_5
; CHECK-NEXT: # %bb.1: # %bb5
-; CHECK-NEXT: movq %rsi, %r12
+; CHECK-NEXT: movq %rsi, %r14
; CHECK-NEXT: movslq %edi, %rbp
; CHECK-NEXT: leaq (,%rbp,8), %rax
-; CHECK-NEXT: leaq global(%rax,%rax,2), %r14
-; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r15
+; CHECK-NEXT: leaq global(%rax,%rax,2), %r15
+; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r12
; CHECK-NEXT: xorl %r13d, %r13d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %bb8
; CHECK-NEXT: callq bar@PLT
; CHECK-NEXT: movq %rax, %rbx
; CHECK-NEXT: movq %rax, %rdi
-; CHECK-NEXT: callq *%r12
-; CHECK-NEXT: movq %r14, %rdi
-; CHECK-NEXT: callq hoge@PLT
+; CHECK-NEXT: callq *%r14
; CHECK-NEXT: movq %r15, %rdi
; CHECK-NEXT: callq hoge@PLT
+; CHECK-NEXT: movq %r12, %rdi
+; CHECK-NEXT: callq hoge@PLT
; CHECK-NEXT: testb %r13b, %r13b
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.3: # %bb15
; CHECK-NEXT: movl (%r15), %eax
; CHECK-NEXT: leal 8(,%rcx,8), %ecx
; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: leaq 8(%r12), %rcx
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: leaq 32(%r12), %rbx
; CHECK-NEXT: shlq $3, %r13
; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: jae .LBB1_7
; CHECK-NEXT: # %bb.6: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: leaq 8(%r12), %rax
-; CHECK-NEXT: addq %rax, %r10
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
; CHECK-NEXT: leaq (%r10,%r11,8), %rax
; CHECK-NEXT: cmpq %rcx, %rax
; CHECK-NEXT: ja .LBB1_14
; CHECK-NEXT: .LBB1_7: # %vector.body.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: leaq -4(%r8), %r10
-; CHECK-NEXT: movq %r10, %rax
-; CHECK-NEXT: shrq $2, %rax
-; CHECK-NEXT: btl $2, %r10d
+; CHECK-NEXT: leaq -4(%r8), %rax
+; CHECK-NEXT: movq %rax, %r10
+; CHECK-NEXT: shrq $2, %r10
+; CHECK-NEXT: btl $2, %eax
; CHECK-NEXT: jb .LBB1_8
; CHECK-NEXT: # %bb.9: # %vector.body.prol.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movdqu %xmm0, (%rdi,%r9,8)
; CHECK-NEXT: movdqu %xmm0, 16(%rdi,%r9,8)
; CHECK-NEXT: movl $4, %r11d
-; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: testq %r10, %r10
; CHECK-NEXT: jne .LBB1_11
; CHECK-NEXT: jmp .LBB1_13
; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: xorl %r11d, %r11d
-; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: testq %r10, %r10
; CHECK-NEXT: je .LBB1_13
; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK: pushl %eax
; CHECK: subl $20, %esp
; CHECK: movl %esp, %[[beg:[^ ]*]]
+; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
call void @begin(%Iter* sret(%Iter) %temp.lvalue)
; CHECK: calll _begin
to label %invoke.cont unwind label %lpad
; Uses end as sret param.
-; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
; CHECK: pushl %[[end]]
; CHECK: calll _plus
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-linux -stop-after=early-machinelicm -o - | FileCheck %s -check-prefix=MIR
-
-; This tests should fail as MachineLICM does not compute register pressure
+; This test currently fails as MachineLICM does not compute register pressure
; correctly. More details: llvm.org/PR23143
-
-; It however does not show any spills because leaq is rematerialized instead
-; of spilling.
-
-; Stopping after MachineLICM however exposes all ADD64ri8 instructions
-; to be hoisted which still has to be avoided.
-
; XFAIL: *
; MachineLICM should take register pressure into account.
-; CHECK-LABEL: {{^}}test:
-; CHECK-NOT: Spill
-; CHECK-COUNT-4: leaq
-; CHECK-NOT: Spill
-; CHECK: [[LOOP:\.LBB[0-9_]+]]:
-; CHECK-NOT: Reload
-; CHECK-COUNT-2: leaq
-; CHECK-NOT: Reload
-; CHECK: jne [[LOOP]]
-
-; MIR-LABEL: name: test
-; MIR: bb.0.entry:
-; MIR-COUNT-4: ADD64ri8
-; MIR: bb.1.loop-body:
-; MIR-COUNT-2: ADD64ri8
-; MIR: JCC_1 %bb.1
+; CHECK-NOT: Spill
%struct.A = type { i32, i32, i32, i32, i32, i32, i32 }
; CHECK-NEXT: ## %bb.10: ## %do.end
; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: xorl %r12d, %r12d
-; CHECK-NEXT: testb %r12b, %r12b
+; CHECK-NEXT: xorl %r13d, %r13d
+; CHECK-NEXT: testb %r13b, %r13b
; CHECK-NEXT: jne LBB0_11
; CHECK-NEXT: ## %bb.12: ## %while.body200.preheader
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx
-; CHECK-NEXT: leaq LJTI0_1(%rip), %r13
+; CHECK-NEXT: leaq LJTI0_1(%rip), %rbx
; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: jmp LBB0_13
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_20: ## %sw.bb256
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl %r12d, %r14d
+; CHECK-NEXT: movl %r13d, %r14d
; CHECK-NEXT: LBB0_21: ## %while.cond197.backedge
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: decl %r15d
; CHECK-NEXT: testl %r15d, %r15d
-; CHECK-NEXT: movl %r14d, %r12d
+; CHECK-NEXT: movl %r14d, %r13d
; CHECK-NEXT: jle LBB0_22
; CHECK-NEXT: LBB0_13: ## %while.body200
; CHECK-NEXT: ## =>This Loop Header: Depth=1
; CHECK-NEXT: ## Child Loop BB0_29 Depth 2
; CHECK-NEXT: ## Child Loop BB0_38 Depth 2
-; CHECK-NEXT: leal -268(%r12), %eax
+; CHECK-NEXT: leal -268(%r13), %eax
; CHECK-NEXT: cmpl $105, %eax
; CHECK-NEXT: ja LBB0_14
; CHECK-NEXT: ## %bb.56: ## %while.body200
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movslq (%r13,%rax,4), %rax
-; CHECK-NEXT: addq %r13, %rax
+; CHECK-NEXT: movslq (%rbx,%rax,4), %rax
+; CHECK-NEXT: addq %rbx, %rax
; CHECK-NEXT: jmpq *%rax
; CHECK-NEXT: LBB0_44: ## %while.cond1037.preheader
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %bl, %bl
-; CHECK-NEXT: movl %r12d, %r14d
+; CHECK-NEXT: testb %r12b, %r12b
+; CHECK-NEXT: movl %r13d, %r14d
; CHECK-NEXT: jne LBB0_21
; CHECK-NEXT: jmp LBB0_55
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_14: ## %while.body200
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: leal 1(%r12), %eax
+; CHECK-NEXT: leal 1(%r13), %eax
; CHECK-NEXT: cmpl $21, %eax
; CHECK-NEXT: ja LBB0_20
; CHECK-NEXT: ## %bb.15: ## %while.body200
; CHECK-NEXT: jmp LBB0_21
; CHECK-NEXT: LBB0_26: ## %sw.bb474
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
; CHECK-NEXT: ## implicit-def: $rbp
; CHECK-NEXT: jne LBB0_34
; CHECK-NEXT: ## %bb.27: ## %do.body479.preheader
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
; CHECK-NEXT: ## implicit-def: $rbp
; CHECK-NEXT: jne LBB0_34
; CHECK-NEXT: ## %bb.28: ## %land.rhs485.preheader
; CHECK-NEXT: LBB0_32: ## %do.body479.backedge
; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2
; CHECK-NEXT: leaq 1(%rbp), %rax
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
; CHECK-NEXT: je LBB0_33
; CHECK-NEXT: LBB0_29: ## %land.rhs485
; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1
; CHECK-NEXT: ## %bb.30: ## %cond.true.i.i2780
; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2
; CHECK-NEXT: movq %rax, %rbp
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
; CHECK-NEXT: jne LBB0_32
; CHECK-NEXT: ## %bb.31: ## %lor.rhs500
; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2
; CHECK-NEXT: movl $256, %esi ## imm = 0x100
; CHECK-NEXT: callq ___maskrune
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
; CHECK-NEXT: jne LBB0_32
; CHECK-NEXT: jmp LBB0_34
; CHECK-NEXT: LBB0_45: ## %sw.bb1134
; CHECK-NEXT: LBB0_38: ## %for.cond534
; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1
; CHECK-NEXT: ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
; CHECK-NEXT: jne LBB0_38
; CHECK-NEXT: ## %bb.39: ## %for.cond542.preheader
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
; CHECK-NEXT: movb $0, (%rbp)
-; CHECK-NEXT: movl %r12d, %r14d
+; CHECK-NEXT: movl %r13d, %r14d
; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx
; CHECK-NEXT: jmp LBB0_21
; CHECK-NEXT: .p2align 4, 0x90
; X64-NEXT: movq %r12, %rcx
; X64-NEXT: callq __divti3@PLT
; X64-NEXT: movq %rax, %r13
+; X64-NEXT: decq %rax
+; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
; X64-NEXT: testq %rbx, %rbx
; X64-NEXT: sets %al
; X64-NEXT: testq %r12, %r12
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
; X64-NEXT: testb %bpl, %al
-; X64-NEXT: leaq -1(%r13), %rax
-; X64-NEXT: cmovneq %rax, %r13
+; X64-NEXT: cmovneq (%rsp), %r13 # 8-byte Folded Reload
; X64-NEXT: movq %r13, %rax
; X64-NEXT: addq $8, %rsp
; X64-NEXT: popq %rbx