From 08d7eec06e8cf5c15a96ce11f311f1480291a441 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Fri, 24 Sep 2021 09:53:51 -0700
Subject: [PATCH] Revert "Allow rematerialization of virtual reg uses"

Reverted due to two distinct performance regression reports.

This reverts commit 92c1fd19abb15bc68b1127a26137a69e033cdb39.
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h        |   12 +-
 llvm/lib/CodeGen/TargetInstrInfo.cpp               |    9 +-
 llvm/test/CodeGen/AMDGPU/remat-sop.mir             |   60 -
 llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll |   28 +-
 llvm/test/CodeGen/ARM/funnel-shift-rot.ll          |   32 +-
 llvm/test/CodeGen/ARM/funnel-shift.ll              |   30 +-
 .../test/CodeGen/ARM/illegal-bitfield-loadstore.ll |   30 +-
 llvm/test/CodeGen/ARM/neon-copy.ll                 |   10 +-
 llvm/test/CodeGen/Mips/llvm-ir/ashr.ll             |  227 +-
 llvm/test/CodeGen/Mips/llvm-ir/lshr.ll             |  206 +-
 llvm/test/CodeGen/Mips/llvm-ir/shl.ll              |   95 +-
 llvm/test/CodeGen/Mips/llvm-ir/sub.ll              |   31 +-
 llvm/test/CodeGen/Mips/tls.ll                      |    4 +-
 llvm/test/CodeGen/RISCV/atomic-rmw.ll              |  120 +-
 llvm/test/CodeGen/RISCV/atomic-signext.ll          |   24 +-
 llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll   |   96 +-
 llvm/test/CodeGen/RISCV/mul.ll                     |   72 +-
 llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll        |   12 +-
 llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll             |  270 +-
 llvm/test/CodeGen/RISCV/rv32zbb.ll                 |   94 +-
 llvm/test/CodeGen/RISCV/rv32zbp.ll                 |  262 +-
 llvm/test/CodeGen/RISCV/rv32zbt.ll                 |  206 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll  |  150 +-
 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll |  146 +-
 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll  | 3584 ++++++++++----------
 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll  |  664 ++--
 llvm/test/CodeGen/RISCV/shifts.ll                  |  308 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll         |  208 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll         |  190 +-
 llvm/test/CodeGen/Thumb/dyn-stackalloc.ll          |    7 +-
 .../tail-pred-disabled-in-loloops.ll               |   14 +-
 .../LowOverheadLoops/varying-outer-2d-reduction.ll |   64 +-
 .../CodeGen/Thumb2/LowOverheadLoops/while-loops.ll |   67 +-
 llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll          |   30 +-
 llvm/test/CodeGen/Thumb2/mve-float16regloops.ll    |   82 +-
 llvm/test/CodeGen/Thumb2/mve-float32regloops.ll    |   98 +-
 llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll        |  529 +--
 llvm/test/CodeGen/X86/addcarry.ll                  |   20 +-
 llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll |   12 +-
 llvm/test/CodeGen/X86/dag-update-nodetomatch.ll    |   17 +-
 .../X86/delete-dead-instrs-with-live-uses.mir      |    4 +-
 llvm/test/CodeGen/X86/inalloca-invoke.ll           |    2 +-
 llvm/test/CodeGen/X86/licm-regpressure.ll          |   28 +-
 llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll      |   40 +-
 llvm/test/CodeGen/X86/sdiv_fix.ll                  |    5 +-
 45 files changed, 4093 insertions(+), 4106 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index a0c52e2..c394ac9 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -117,11 +117,10 @@ public:
                                              const MachineFunction &MF) const;
 
   /// Return true if the instruction is trivially rematerializable, meaning it
-  /// has no side effects. Uses of constants and unallocatable physical
-  /// registers are always trivial to rematerialize so that the instructions
-  /// result is independent of the place in the function. Uses of virtual
-  /// registers are allowed but it is caller's responsility to ensure these
-  /// operands are valid at the point the instruction is beeing moved.
+  /// has no side effects and requires no operands that aren't always available.
+  /// This means the only allowed uses are constants and unallocatable physical
+  /// registers so that the instruction's result is independent of the place
+  /// in the function.
   bool isTriviallyReMaterializable(const MachineInstr &MI,
                                    AAResults *AA = nullptr) const {
     return MI.getOpcode() == TargetOpcode::IMPLICIT_DEF ||
@@ -141,7 +140,8 @@ protected:
   /// set, this hook lets the target specify whether the instruction is actually
   /// trivially rematerializable, taking into consideration its operands. This
   /// predicate must return false if the instruction has any side effects other
-  /// than producing a value.
+  /// than producing a value, or if it requires any address registers that are
+  /// not always available.
   /// Requirements must be check as stated in isTriviallyReMaterializable() .
   virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                  AAResults *AA) const {
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index fe7d60e..1eab8e7 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -921,8 +921,7 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
   const MachineRegisterInfo &MRI = MF.getRegInfo();
 
   // Remat clients assume operand 0 is the defined register.
-  if (!MI.getNumOperands() || !MI.getOperand(0).isReg() ||
-      MI.getOperand(0).isTied())
+  if (!MI.getNumOperands() || !MI.getOperand(0).isReg())
     return false;
   Register DefReg = MI.getOperand(0).getReg();
 
@@ -984,6 +983,12 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
     // same virtual register, though.
     if (MO.isDef() && Reg != DefReg)
       return false;
+
+    // Don't allow any virtual-register uses. Rematting an instruction with
+    // virtual register uses would lengthen the live ranges of the uses, which
+    // is not necessarily a good idea, certainly not "trivial".
+    if (MO.isUse())
+      return false;
   }
 
   // Everything checked out.
diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
index c9915aa..ed799bf 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
@@ -51,66 +51,6 @@ body: |
     S_NOP 0, implicit %2
     S_ENDPGM 0
 ...
-# The liverange of %0 covers a point of rematerialization, source value is
-# availabe.
----
-name: test_remat_s_mov_b32_vreg_src_long_lr
-tracksRegLiveness: true
-machineFunctionInfo:
-  stackPtrOffsetReg: $sgpr32
-body: |
-  bb.0:
-    ; GCN-LABEL: name: test_remat_s_mov_b32_vreg_src_long_lr
-    ; GCN: renamable $sgpr0 = IMPLICIT_DEF
-    ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
-    ; GCN: S_NOP 0, implicit killed renamable $sgpr1
-    ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
-    ; GCN: S_NOP 0, implicit killed renamable $sgpr1
-    ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
-    ; GCN: S_NOP 0, implicit killed renamable $sgpr1
-    ; GCN: S_NOP 0, implicit killed renamable $sgpr0
-    ; GCN: S_ENDPGM 0
-    %0:sreg_32 = IMPLICIT_DEF
-    %1:sreg_32 = S_MOV_B32 %0:sreg_32
-    %2:sreg_32 = S_MOV_B32 %0:sreg_32
-    %3:sreg_32 = S_MOV_B32 %0:sreg_32
-    S_NOP 0, implicit %1
-    S_NOP 0, implicit %2
-    S_NOP 0, implicit %3
-    S_NOP 0, implicit %0
-    S_ENDPGM 0
-...
-# The liverange of %0 does not cover a point of rematerialization, source value is
-# unavailabe and we do not want to artificially extend the liverange.
---- -name: test_no_remat_s_mov_b32_vreg_src_short_lr -tracksRegLiveness: true -machineFunctionInfo: - stackPtrOffsetReg: $sgpr32 -body: | - bb.0: - ; GCN-LABEL: name: test_no_remat_s_mov_b32_vreg_src_short_lr - ; GCN: renamable $sgpr0 = IMPLICIT_DEF - ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 - ; GCN: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5) - ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 - ; GCN: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) - ; GCN: renamable $sgpr0 = S_MOV_B32 killed renamable $sgpr0 - ; GCN: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) - ; GCN: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) - ; GCN: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN: S_NOP 0, implicit killed renamable $sgpr0 - ; GCN: S_ENDPGM 0 - %0:sreg_32 = IMPLICIT_DEF - %1:sreg_32 = S_MOV_B32 %0:sreg_32 - %2:sreg_32 = S_MOV_B32 %0:sreg_32 - %3:sreg_32 = S_MOV_B32 %0:sreg_32 - S_NOP 0, implicit %1 - S_NOP 0, implicit %2 - S_NOP 0, implicit %3 - S_ENDPGM 0 -... --- name: test_remat_s_mov_b64 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll index 175a206..a424327 100644 --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll @@ -29,20 +29,20 @@ define fastcc i8* @wrongUseOfPostDominate(i8* readonly %s, i32 %off, i8* readnon ; ENABLE-NEXT: pophs {r11, pc} ; ENABLE-NEXT: .LBB0_3: @ %while.body.preheader ; ENABLE-NEXT: movw r12, :lower16:skip -; ENABLE-NEXT: sub r3, r1, #1 +; ENABLE-NEXT: sub r1, r1, #1 ; ENABLE-NEXT: movt r12, :upper16:skip ; ENABLE-NEXT: .LBB0_4: @ %while.body ; ENABLE-NEXT: @ =>This Inner Loop Header: Depth=1 -; ENABLE-NEXT: ldrb r1, [r0] -; ENABLE-NEXT: ldrb r1, [r12, r1] -; ENABLE-NEXT: add r0, r0, r1 -; ENABLE-NEXT: sub r1, r3, #1 -; ENABLE-NEXT: cmp r1, r3 +; ENABLE-NEXT: ldrb r3, [r0] +; ENABLE-NEXT: ldrb r3, [r12, r3] +; ENABLE-NEXT: add r0, r0, r3 +; ENABLE-NEXT: sub r3, r1, #1 +; ENABLE-NEXT: cmp r3, r1 ; ENABLE-NEXT: bhs .LBB0_6 ; ENABLE-NEXT: @ %bb.5: @ %while.body ; ENABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLE-NEXT: cmp r0, r2 -; ENABLE-NEXT: mov r3, r1 +; ENABLE-NEXT: mov r1, r3 ; ENABLE-NEXT: blo .LBB0_4 ; ENABLE-NEXT: .LBB0_6: @ %if.end29 ; ENABLE-NEXT: pop {r11, pc} @@ -119,20 +119,20 @@ define fastcc i8* @wrongUseOfPostDominate(i8* readonly %s, i32 %off, i8* readnon ; DISABLE-NEXT: pophs {r11, pc} ; DISABLE-NEXT: .LBB0_3: @ %while.body.preheader ; DISABLE-NEXT: movw r12, :lower16:skip -; DISABLE-NEXT: sub r3, r1, #1 +; DISABLE-NEXT: sub r1, r1, #1 ; DISABLE-NEXT: movt r12, :upper16:skip ; DISABLE-NEXT: .LBB0_4: @ %while.body ; DISABLE-NEXT: @ =>This Inner Loop Header: Depth=1 -; DISABLE-NEXT: ldrb r1, [r0] -; DISABLE-NEXT: ldrb r1, [r12, r1] -; DISABLE-NEXT: add r0, r0, r1 -; DISABLE-NEXT: sub r1, r3, #1 -; DISABLE-NEXT: cmp r1, r3 +; DISABLE-NEXT: ldrb r3, [r0] +; DISABLE-NEXT: ldrb r3, [r12, r3] +; DISABLE-NEXT: add r0, r0, r3 +; DISABLE-NEXT: sub r3, r1, #1 +; DISABLE-NEXT: cmp r3, r1 ; DISABLE-NEXT: bhs .LBB0_6 ; DISABLE-NEXT: @ %bb.5: @ %while.body ; DISABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; DISABLE-NEXT: 
cmp r0, r2 -; DISABLE-NEXT: mov r3, r1 +; DISABLE-NEXT: mov r1, r3 ; DISABLE-NEXT: blo .LBB0_4 ; DISABLE-NEXT: .LBB0_6: @ %if.end29 ; DISABLE-NEXT: pop {r11, pc} diff --git a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll index ea15fcc..5515787 100644 --- a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll @@ -73,13 +73,13 @@ define i64 @rotl_i64(i64 %x, i64 %z) { ; SCALAR-NEXT: push {r4, r5, r11, lr} ; SCALAR-NEXT: rsb r3, r2, #0 ; SCALAR-NEXT: and r4, r2, #63 -; SCALAR-NEXT: and r12, r3, #63 -; SCALAR-NEXT: rsb r3, r12, #32 +; SCALAR-NEXT: and lr, r3, #63 +; SCALAR-NEXT: rsb r3, lr, #32 ; SCALAR-NEXT: lsl r2, r0, r4 -; SCALAR-NEXT: lsr lr, r0, r12 -; SCALAR-NEXT: orr r3, lr, r1, lsl r3 -; SCALAR-NEXT: subs lr, r12, #32 -; SCALAR-NEXT: lsrpl r3, r1, lr +; SCALAR-NEXT: lsr r12, r0, lr +; SCALAR-NEXT: orr r3, r12, r1, lsl r3 +; SCALAR-NEXT: subs r12, lr, #32 +; SCALAR-NEXT: lsrpl r3, r1, r12 ; SCALAR-NEXT: subs r5, r4, #32 ; SCALAR-NEXT: movwpl r2, #0 ; SCALAR-NEXT: cmp r5, #0 @@ -88,8 +88,8 @@ define i64 @rotl_i64(i64 %x, i64 %z) { ; SCALAR-NEXT: lsr r3, r0, r3 ; SCALAR-NEXT: orr r3, r3, r1, lsl r4 ; SCALAR-NEXT: lslpl r3, r0, r5 -; SCALAR-NEXT: lsr r0, r1, r12 -; SCALAR-NEXT: cmp lr, #0 +; SCALAR-NEXT: lsr r0, r1, lr +; SCALAR-NEXT: cmp r12, #0 ; SCALAR-NEXT: movwpl r0, #0 ; SCALAR-NEXT: orr r1, r3, r0 ; SCALAR-NEXT: mov r0, r2 @@ -245,15 +245,15 @@ define i64 @rotr_i64(i64 %x, i64 %z) { ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r11, lr} ; CHECK-NEXT: push {r4, r5, r11, lr} -; CHECK-NEXT: and r12, r2, #63 +; CHECK-NEXT: and lr, r2, #63 ; CHECK-NEXT: rsb r2, r2, #0 -; CHECK-NEXT: rsb r3, r12, #32 +; CHECK-NEXT: rsb r3, lr, #32 ; CHECK-NEXT: and r4, r2, #63 -; CHECK-NEXT: lsr lr, r0, r12 -; CHECK-NEXT: orr r3, lr, r1, lsl r3 -; CHECK-NEXT: subs lr, r12, #32 +; CHECK-NEXT: lsr r12, r0, lr +; CHECK-NEXT: orr r3, r12, r1, lsl r3 +; CHECK-NEXT: subs r12, lr, #32 ; CHECK-NEXT: lsl r2, r0, r4 -; CHECK-NEXT: lsrpl r3, r1, lr +; CHECK-NEXT: lsrpl r3, r1, r12 ; CHECK-NEXT: subs r5, r4, #32 ; CHECK-NEXT: movwpl r2, #0 ; CHECK-NEXT: cmp r5, #0 @@ -262,8 +262,8 @@ define i64 @rotr_i64(i64 %x, i64 %z) { ; CHECK-NEXT: lsr r3, r0, r3 ; CHECK-NEXT: orr r3, r3, r1, lsl r4 ; CHECK-NEXT: lslpl r3, r0, r5 -; CHECK-NEXT: lsr r0, r1, r12 -; CHECK-NEXT: cmp lr, #0 +; CHECK-NEXT: lsr r0, r1, lr +; CHECK-NEXT: cmp r12, #0 ; CHECK-NEXT: movwpl r0, #0 ; CHECK-NEXT: orr r1, r0, r3 ; CHECK-NEXT: mov r0, r2 diff --git a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll index 6372f9b..54c93b4 100644 --- a/llvm/test/CodeGen/ARM/funnel-shift.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift.ll @@ -224,31 +224,31 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-NEXT: mov r3, #0 ; CHECK-NEXT: bl __aeabi_uldivmod ; CHECK-NEXT: add r0, r2, #27 -; CHECK-NEXT: lsl r2, r7, #27 -; CHECK-NEXT: and r12, r0, #63 ; CHECK-NEXT: lsl r6, r6, #27 +; CHECK-NEXT: and r1, r0, #63 +; CHECK-NEXT: lsl r2, r7, #27 ; CHECK-NEXT: orr r7, r6, r7, lsr #5 -; CHECK-NEXT: rsb r3, r12, #32 -; CHECK-NEXT: lsr r2, r2, r12 ; CHECK-NEXT: mov r6, #63 -; CHECK-NEXT: orr r2, r2, r7, lsl r3 -; CHECK-NEXT: subs r3, r12, #32 +; CHECK-NEXT: rsb r3, r1, #32 +; CHECK-NEXT: lsr r2, r2, r1 +; CHECK-NEXT: subs r12, r1, #32 ; CHECK-NEXT: bic r6, r6, r0 +; CHECK-NEXT: orr r2, r2, r7, lsl r3 ; CHECK-NEXT: lsl r5, r9, #1 -; CHECK-NEXT: lsrpl r2, r7, r3 -; CHECK-NEXT: subs r1, r6, #32 +; CHECK-NEXT: lsrpl r2, r7, r12 ; CHECK-NEXT: lsl r0, r5, r6 -; CHECK-NEXT: 
lsl r4, r8, #1 +; CHECK-NEXT: subs r4, r6, #32 +; CHECK-NEXT: lsl r3, r8, #1 ; CHECK-NEXT: movwpl r0, #0 -; CHECK-NEXT: orr r4, r4, r9, lsr #31 +; CHECK-NEXT: orr r3, r3, r9, lsr #31 ; CHECK-NEXT: orr r0, r0, r2 ; CHECK-NEXT: rsb r2, r6, #32 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: lsr r1, r7, r1 ; CHECK-NEXT: lsr r2, r5, r2 -; CHECK-NEXT: orr r2, r2, r4, lsl r6 -; CHECK-NEXT: lslpl r2, r5, r1 -; CHECK-NEXT: lsr r1, r7, r12 -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: orr r2, r2, r3, lsl r6 +; CHECK-NEXT: lslpl r2, r5, r4 +; CHECK-NEXT: cmp r12, #0 ; CHECK-NEXT: movwpl r1, #0 ; CHECK-NEXT: orr r1, r2, r1 ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} diff --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll index 0a0bb62..2922e0e 100644 --- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -91,17 +91,17 @@ define void @i56_or(i56* %a) { ; BE-LABEL: i56_or: ; BE: @ %bb.0: ; BE-NEXT: mov r1, r0 +; BE-NEXT: ldr r12, [r0] ; BE-NEXT: ldrh r2, [r1, #4]! ; BE-NEXT: ldrb r3, [r1, #2] ; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: ldr r3, [r0] -; BE-NEXT: orr r2, r2, r3, lsl #24 -; BE-NEXT: orr r12, r2, #384 -; BE-NEXT: strb r12, [r1, #2] -; BE-NEXT: lsr r2, r12, #8 -; BE-NEXT: strh r2, [r1] -; BE-NEXT: bic r1, r3, #255 -; BE-NEXT: orr r1, r1, r12, lsr #24 +; BE-NEXT: orr r2, r2, r12, lsl #24 +; BE-NEXT: orr r2, r2, #384 +; BE-NEXT: strb r2, [r1, #2] +; BE-NEXT: lsr r3, r2, #8 +; BE-NEXT: strh r3, [r1] +; BE-NEXT: bic r1, r12, #255 +; BE-NEXT: orr r1, r1, r2, lsr #24 ; BE-NEXT: str r1, [r0] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a @@ -127,13 +127,13 @@ define void @i56_and_or(i56* %a) { ; BE-NEXT: ldrb r3, [r1, #2] ; BE-NEXT: strb r2, [r1, #2] ; BE-NEXT: orr r2, r3, r12, lsl #8 -; BE-NEXT: ldr r3, [r0] -; BE-NEXT: orr r2, r2, r3, lsl #24 -; BE-NEXT: orr r12, r2, #384 -; BE-NEXT: lsr r2, r12, #8 -; BE-NEXT: strh r2, [r1] -; BE-NEXT: bic r1, r3, #255 -; BE-NEXT: orr r1, r1, r12, lsr #24 +; BE-NEXT: ldr r12, [r0] +; BE-NEXT: orr r2, r2, r12, lsl #24 +; BE-NEXT: orr r2, r2, #384 +; BE-NEXT: lsr r3, r2, #8 +; BE-NEXT: strh r3, [r1] +; BE-NEXT: bic r1, r12, #255 +; BE-NEXT: orr r1, r1, r2, lsr #24 ; BE-NEXT: str r1, [r0] ; BE-NEXT: mov pc, lr diff --git a/llvm/test/CodeGen/ARM/neon-copy.ll b/llvm/test/CodeGen/ARM/neon-copy.ll index 46490ef..09a991d 100644 --- a/llvm/test/CodeGen/ARM/neon-copy.ll +++ b/llvm/test/CodeGen/ARM/neon-copy.ll @@ -1340,16 +1340,16 @@ define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) { ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, sp, #8 ; CHECK-NEXT: vmov.u16 r1, d0[1] -; CHECK-NEXT: and r12, r0, #3 +; CHECK-NEXT: and r0, r0, #3 ; CHECK-NEXT: vmov.u16 r2, d0[2] -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vmov.u16 r3, d0[3] -; CHECK-NEXT: orr r0, r0, r12, lsl #1 +; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: vmov.u16 r12, d0[3] +; CHECK-NEXT: orr r0, r3, r0, lsl #1 ; CHECK-NEXT: vst1.16 {d0[0]}, [r0:16] ; CHECK-NEXT: vldr d0, [sp] ; CHECK-NEXT: vmov.16 d0[1], r1 ; CHECK-NEXT: vmov.16 d0[2], r2 -; CHECK-NEXT: vmov.16 d0[3], r3 +; CHECK-NEXT: vmov.16 d0[3], r12 ; CHECK-NEXT: add sp, sp, #8 ; CHECK-NEXT: bx lr %tmp = extractelement <8 x i16> %x, i32 0 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll index a125446..8be7100 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll @@ -766,85 +766,79 @@ define signext i128 
@ashr_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 ; MMR3-NEXT: move $8, $7 -; MMR3-NEXT: move $2, $6 -; MMR3-NEXT: sw $5, 0($sp) # 4-byte Folded Spill -; MMR3-NEXT: sw $4, 12($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $6, 32($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $5, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill ; MMR3-NEXT: lw $16, 76($sp) -; MMR3-NEXT: srlv $3, $7, $16 -; MMR3-NEXT: not16 $6, $16 -; MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded Spill -; MMR3-NEXT: move $4, $2 -; MMR3-NEXT: sw $2, 32($sp) # 4-byte Folded Spill -; MMR3-NEXT: sll16 $2, $2, 1 -; MMR3-NEXT: sllv $2, $2, $6 -; MMR3-NEXT: li16 $6, 64 -; MMR3-NEXT: or16 $2, $3 -; MMR3-NEXT: srlv $4, $4, $16 -; MMR3-NEXT: sw $4, 16($sp) # 4-byte Folded Spill -; MMR3-NEXT: subu16 $7, $6, $16 +; MMR3-NEXT: srlv $4, $7, $16 +; MMR3-NEXT: not16 $3, $16 +; MMR3-NEXT: sw $3, 24($sp) # 4-byte Folded Spill +; MMR3-NEXT: sll16 $2, $6, 1 +; MMR3-NEXT: sllv $3, $2, $3 +; MMR3-NEXT: li16 $2, 64 +; MMR3-NEXT: or16 $3, $4 +; MMR3-NEXT: srlv $6, $6, $16 +; MMR3-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MMR3-NEXT: subu16 $7, $2, $16 ; MMR3-NEXT: sllv $9, $5, $7 -; MMR3-NEXT: andi16 $5, $7, 32 -; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill -; MMR3-NEXT: andi16 $6, $16, 32 -; MMR3-NEXT: sw $6, 36($sp) # 4-byte Folded Spill -; MMR3-NEXT: move $3, $9 +; MMR3-NEXT: andi16 $2, $7, 32 +; MMR3-NEXT: sw $2, 28($sp) # 4-byte Folded Spill +; MMR3-NEXT: andi16 $5, $16, 32 +; MMR3-NEXT: sw $5, 16($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $4, $9 ; MMR3-NEXT: li16 $17, 0 -; MMR3-NEXT: movn $3, $17, $5 -; MMR3-NEXT: movn $2, $4, $6 -; MMR3-NEXT: addiu $4, $16, -64 -; MMR3-NEXT: lw $17, 0($sp) # 4-byte Folded Reload -; MMR3-NEXT: srlv $4, $17, $4 -; MMR3-NEXT: sw $4, 20($sp) # 4-byte Folded Spill -; MMR3-NEXT: lw $6, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: sll16 $4, $6, 1 -; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: addiu $5, $16, -64 -; MMR3-NEXT: not16 $5, $5 -; MMR3-NEXT: sllv $5, $4, $5 -; MMR3-NEXT: or16 $2, $3 -; MMR3-NEXT: lw $3, 20($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $5, $3 -; MMR3-NEXT: addiu $3, $16, -64 -; MMR3-NEXT: srav $1, $6, $3 -; MMR3-NEXT: andi16 $3, $3, 32 -; MMR3-NEXT: sw $3, 20($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $5, $1, $3 -; MMR3-NEXT: sllv $3, $6, $7 -; MMR3-NEXT: sw $3, 4($sp) # 4-byte Folded Spill -; MMR3-NEXT: not16 $3, $7 -; MMR3-NEXT: srl16 $4, $17, 1 -; MMR3-NEXT: srlv $3, $4, $3 +; MMR3-NEXT: movn $4, $17, $2 +; MMR3-NEXT: movn $3, $6, $5 +; MMR3-NEXT: addiu $2, $16, -64 +; MMR3-NEXT: lw $5, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: srlv $5, $5, $2 +; MMR3-NEXT: sw $5, 20($sp) # 4-byte Folded Spill +; MMR3-NEXT: lw $17, 8($sp) # 4-byte Folded Reload +; MMR3-NEXT: sll16 $6, $17, 1 +; MMR3-NEXT: sw $6, 4($sp) # 4-byte Folded Spill +; MMR3-NEXT: not16 $5, $2 +; MMR3-NEXT: sllv $5, $6, $5 +; MMR3-NEXT: or16 $3, $4 +; MMR3-NEXT: lw $4, 20($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $5, $4 +; MMR3-NEXT: srav $1, $17, $2 +; MMR3-NEXT: andi16 $2, $2, 32 +; MMR3-NEXT: sw $2, 20($sp) # 4-byte Folded Spill +; MMR3-NEXT: movn $5, $1, $2 +; MMR3-NEXT: sllv $2, $17, $7 +; MMR3-NEXT: not16 $4, $7 +; MMR3-NEXT: lw $7, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: srl16 $6, $7, 1 +; MMR3-NEXT: srlv $6, $6, $4 ; MMR3-NEXT: sltiu $10, $16, 64 -; MMR3-NEXT: movn $5, $2, $10 -; MMR3-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $5, $3, $10 +; MMR3-NEXT: or16 $6, $2 +; MMR3-NEXT: 
srlv $2, $7, $16 +; MMR3-NEXT: lw $3, 24($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $4, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: sllv $3, $4, $3 ; MMR3-NEXT: or16 $3, $2 -; MMR3-NEXT: srlv $2, $17, $16 -; MMR3-NEXT: lw $4, 24($sp) # 4-byte Folded Reload -; MMR3-NEXT: lw $7, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: sllv $17, $7, $4 -; MMR3-NEXT: or16 $17, $2 -; MMR3-NEXT: srav $11, $6, $16 -; MMR3-NEXT: lw $2, 36($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $17, $11, $2 -; MMR3-NEXT: sra $2, $6, 31 +; MMR3-NEXT: srav $11, $17, $16 +; MMR3-NEXT: lw $4, 16($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $3, $11, $4 +; MMR3-NEXT: sra $2, $17, 31 ; MMR3-NEXT: movz $5, $8, $16 -; MMR3-NEXT: move $4, $2 -; MMR3-NEXT: movn $4, $17, $10 -; MMR3-NEXT: lw $6, 28($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $3, $9, $6 -; MMR3-NEXT: lw $6, 36($sp) # 4-byte Folded Reload -; MMR3-NEXT: li16 $17, 0 -; MMR3-NEXT: lw $7, 16($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $7, $17, $6 -; MMR3-NEXT: or16 $7, $3 +; MMR3-NEXT: move $8, $2 +; MMR3-NEXT: movn $8, $3, $10 +; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $6, $9, $3 +; MMR3-NEXT: li16 $3, 0 +; MMR3-NEXT: lw $7, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $7, $3, $4 +; MMR3-NEXT: or16 $7, $6 ; MMR3-NEXT: lw $3, 20($sp) # 4-byte Folded Reload ; MMR3-NEXT: movn $1, $2, $3 ; MMR3-NEXT: movn $1, $7, $10 ; MMR3-NEXT: lw $3, 32($sp) # 4-byte Folded Reload ; MMR3-NEXT: movz $1, $3, $16 -; MMR3-NEXT: movn $11, $2, $6 +; MMR3-NEXT: movn $11, $2, $4 ; MMR3-NEXT: movn $2, $11, $10 -; MMR3-NEXT: move $3, $4 +; MMR3-NEXT: move $3, $8 ; MMR3-NEXT: move $4, $1 ; MMR3-NEXT: lwp $16, 40($sp) ; MMR3-NEXT: addiusp 48 @@ -858,80 +852,79 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; MMR6-NEXT: sw $16, 8($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 17, -4 ; MMR6-NEXT: .cfi_offset 16, -8 -; MMR6-NEXT: move $12, $7 +; MMR6-NEXT: move $1, $7 ; MMR6-NEXT: lw $3, 44($sp) ; MMR6-NEXT: li16 $2, 64 -; MMR6-NEXT: subu16 $16, $2, $3 -; MMR6-NEXT: sllv $1, $5, $16 -; MMR6-NEXT: andi16 $2, $16, 32 -; MMR6-NEXT: selnez $8, $1, $2 -; MMR6-NEXT: sllv $9, $4, $16 -; MMR6-NEXT: not16 $16, $16 -; MMR6-NEXT: srl16 $17, $5, 1 -; MMR6-NEXT: srlv $10, $17, $16 -; MMR6-NEXT: or $9, $9, $10 -; MMR6-NEXT: seleqz $9, $9, $2 -; MMR6-NEXT: or $8, $8, $9 -; MMR6-NEXT: srlv $9, $7, $3 -; MMR6-NEXT: not16 $7, $3 -; MMR6-NEXT: sw $7, 4($sp) # 4-byte Folded Spill +; MMR6-NEXT: subu16 $7, $2, $3 +; MMR6-NEXT: sllv $8, $5, $7 +; MMR6-NEXT: andi16 $2, $7, 32 +; MMR6-NEXT: selnez $9, $8, $2 +; MMR6-NEXT: sllv $10, $4, $7 +; MMR6-NEXT: not16 $7, $7 +; MMR6-NEXT: srl16 $16, $5, 1 +; MMR6-NEXT: srlv $7, $16, $7 +; MMR6-NEXT: or $7, $10, $7 +; MMR6-NEXT: seleqz $7, $7, $2 +; MMR6-NEXT: or $7, $9, $7 +; MMR6-NEXT: srlv $9, $1, $3 +; MMR6-NEXT: not16 $16, $3 +; MMR6-NEXT: sw $16, 4($sp) # 4-byte Folded Spill ; MMR6-NEXT: sll16 $17, $6, 1 -; MMR6-NEXT: sllv $10, $17, $7 +; MMR6-NEXT: sllv $10, $17, $16 ; MMR6-NEXT: or $9, $10, $9 ; MMR6-NEXT: andi16 $17, $3, 32 ; MMR6-NEXT: seleqz $9, $9, $17 ; MMR6-NEXT: srlv $10, $6, $3 ; MMR6-NEXT: selnez $11, $10, $17 ; MMR6-NEXT: seleqz $10, $10, $17 -; MMR6-NEXT: or $8, $10, $8 -; MMR6-NEXT: seleqz $1, $1, $2 -; MMR6-NEXT: or $9, $11, $9 +; MMR6-NEXT: or $10, $10, $7 +; MMR6-NEXT: seleqz $12, $8, $2 +; MMR6-NEXT: or $8, $11, $9 ; MMR6-NEXT: addiu $2, $3, -64 -; MMR6-NEXT: srlv $10, $5, $2 +; MMR6-NEXT: srlv $9, $5, $2 ; MMR6-NEXT: sll16 $7, $4, 1 ; MMR6-NEXT: not16 $16, $2 ; MMR6-NEXT: sllv $11, $7, $16 ; 
MMR6-NEXT: sltiu $13, $3, 64 -; MMR6-NEXT: or $1, $9, $1 -; MMR6-NEXT: selnez $8, $8, $13 -; MMR6-NEXT: or $9, $11, $10 -; MMR6-NEXT: srav $10, $4, $2 +; MMR6-NEXT: or $8, $8, $12 +; MMR6-NEXT: selnez $10, $10, $13 +; MMR6-NEXT: or $9, $11, $9 +; MMR6-NEXT: srav $11, $4, $2 ; MMR6-NEXT: andi16 $2, $2, 32 -; MMR6-NEXT: seleqz $11, $10, $2 +; MMR6-NEXT: seleqz $12, $11, $2 ; MMR6-NEXT: sra $14, $4, 31 ; MMR6-NEXT: selnez $15, $14, $2 ; MMR6-NEXT: seleqz $9, $9, $2 -; MMR6-NEXT: or $11, $15, $11 -; MMR6-NEXT: seleqz $11, $11, $13 -; MMR6-NEXT: selnez $2, $10, $2 -; MMR6-NEXT: seleqz $10, $14, $13 -; MMR6-NEXT: or $8, $8, $11 -; MMR6-NEXT: selnez $8, $8, $3 -; MMR6-NEXT: selnez $1, $1, $13 +; MMR6-NEXT: or $12, $15, $12 +; MMR6-NEXT: seleqz $12, $12, $13 +; MMR6-NEXT: selnez $2, $11, $2 +; MMR6-NEXT: seleqz $11, $14, $13 +; MMR6-NEXT: or $10, $10, $12 +; MMR6-NEXT: selnez $10, $10, $3 +; MMR6-NEXT: selnez $8, $8, $13 ; MMR6-NEXT: or $2, $2, $9 ; MMR6-NEXT: srav $9, $4, $3 ; MMR6-NEXT: seleqz $4, $9, $17 -; MMR6-NEXT: selnez $11, $14, $17 -; MMR6-NEXT: or $4, $11, $4 -; MMR6-NEXT: selnez $11, $4, $13 +; MMR6-NEXT: selnez $12, $14, $17 +; MMR6-NEXT: or $4, $12, $4 +; MMR6-NEXT: selnez $12, $4, $13 ; MMR6-NEXT: seleqz $2, $2, $13 ; MMR6-NEXT: seleqz $4, $6, $3 -; MMR6-NEXT: seleqz $6, $12, $3 +; MMR6-NEXT: seleqz $1, $1, $3 +; MMR6-NEXT: or $2, $8, $2 +; MMR6-NEXT: selnez $2, $2, $3 ; MMR6-NEXT: or $1, $1, $2 -; MMR6-NEXT: selnez $1, $1, $3 -; MMR6-NEXT: or $1, $6, $1 -; MMR6-NEXT: or $4, $4, $8 -; MMR6-NEXT: or $6, $11, $10 -; MMR6-NEXT: srlv $2, $5, $3 -; MMR6-NEXT: lw $3, 4($sp) # 4-byte Folded Reload -; MMR6-NEXT: sllv $3, $7, $3 -; MMR6-NEXT: or $2, $3, $2 -; MMR6-NEXT: seleqz $2, $2, $17 -; MMR6-NEXT: selnez $3, $9, $17 -; MMR6-NEXT: or $2, $3, $2 -; MMR6-NEXT: selnez $2, $2, $13 -; MMR6-NEXT: or $3, $2, $10 -; MMR6-NEXT: move $2, $6 +; MMR6-NEXT: or $4, $4, $10 +; MMR6-NEXT: or $2, $12, $11 +; MMR6-NEXT: srlv $3, $5, $3 +; MMR6-NEXT: lw $5, 4($sp) # 4-byte Folded Reload +; MMR6-NEXT: sllv $5, $7, $5 +; MMR6-NEXT: or $3, $5, $3 +; MMR6-NEXT: seleqz $3, $3, $17 +; MMR6-NEXT: selnez $5, $9, $17 +; MMR6-NEXT: or $3, $5, $3 +; MMR6-NEXT: selnez $3, $3, $13 +; MMR6-NEXT: or $3, $3, $11 ; MMR6-NEXT: move $5, $1 ; MMR6-NEXT: lw $16, 8($sp) # 4-byte Folded Reload ; MMR6-NEXT: lw $17, 12($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll index e4b4b3a..ed2bfc9 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll @@ -776,77 +776,76 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 ; MMR3-NEXT: move $8, $7 -; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded Spill ; MMR3-NEXT: sw $4, 28($sp) # 4-byte Folded Spill ; MMR3-NEXT: lw $16, 68($sp) ; MMR3-NEXT: li16 $2, 64 -; MMR3-NEXT: subu16 $17, $2, $16 -; MMR3-NEXT: sllv $9, $5, $17 -; MMR3-NEXT: andi16 $3, $17, 32 +; MMR3-NEXT: subu16 $7, $2, $16 +; MMR3-NEXT: sllv $9, $5, $7 +; MMR3-NEXT: move $17, $5 +; MMR3-NEXT: sw $5, 0($sp) # 4-byte Folded Spill +; MMR3-NEXT: andi16 $3, $7, 32 ; MMR3-NEXT: sw $3, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: li16 $2, 0 ; MMR3-NEXT: move $4, $9 ; MMR3-NEXT: movn $4, $2, $3 -; MMR3-NEXT: srlv $5, $7, $16 +; MMR3-NEXT: srlv $5, $8, $16 ; MMR3-NEXT: not16 $3, $16 ; MMR3-NEXT: sw $3, 16($sp) # 4-byte Folded Spill ; MMR3-NEXT: sll16 $2, $6, 1 -; MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded 
Spill ; MMR3-NEXT: sllv $2, $2, $3 ; MMR3-NEXT: or16 $2, $5 -; MMR3-NEXT: srlv $7, $6, $16 +; MMR3-NEXT: srlv $5, $6, $16 +; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill ; MMR3-NEXT: andi16 $3, $16, 32 ; MMR3-NEXT: sw $3, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $2, $7, $3 +; MMR3-NEXT: movn $2, $5, $3 ; MMR3-NEXT: addiu $3, $16, -64 ; MMR3-NEXT: or16 $2, $4 -; MMR3-NEXT: lw $6, 4($sp) # 4-byte Folded Reload -; MMR3-NEXT: srlv $3, $6, $3 -; MMR3-NEXT: sw $3, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload -; MMR3-NEXT: sll16 $4, $3, 1 -; MMR3-NEXT: sw $4, 0($sp) # 4-byte Folded Spill -; MMR3-NEXT: addiu $5, $16, -64 -; MMR3-NEXT: not16 $5, $5 -; MMR3-NEXT: sllv $5, $4, $5 -; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $5, $4 -; MMR3-NEXT: addiu $4, $16, -64 -; MMR3-NEXT: srlv $1, $3, $4 -; MMR3-NEXT: andi16 $4, $4, 32 +; MMR3-NEXT: srlv $4, $17, $3 ; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $5, $1, $4 +; MMR3-NEXT: lw $4, 28($sp) # 4-byte Folded Reload +; MMR3-NEXT: sll16 $6, $4, 1 +; MMR3-NEXT: not16 $5, $3 +; MMR3-NEXT: sllv $5, $6, $5 +; MMR3-NEXT: lw $17, 8($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $5, $17 +; MMR3-NEXT: srlv $1, $4, $3 +; MMR3-NEXT: andi16 $3, $3, 32 +; MMR3-NEXT: sw $3, 8($sp) # 4-byte Folded Spill +; MMR3-NEXT: movn $5, $1, $3 ; MMR3-NEXT: sltiu $10, $16, 64 ; MMR3-NEXT: movn $5, $2, $10 -; MMR3-NEXT: sllv $2, $3, $17 -; MMR3-NEXT: not16 $3, $17 -; MMR3-NEXT: srl16 $4, $6, 1 +; MMR3-NEXT: sllv $2, $4, $7 +; MMR3-NEXT: not16 $3, $7 +; MMR3-NEXT: lw $7, 0($sp) # 4-byte Folded Reload +; MMR3-NEXT: srl16 $4, $7, 1 ; MMR3-NEXT: srlv $4, $4, $3 ; MMR3-NEXT: or16 $4, $2 -; MMR3-NEXT: srlv $2, $6, $16 +; MMR3-NEXT: srlv $2, $7, $16 ; MMR3-NEXT: lw $3, 16($sp) # 4-byte Folded Reload -; MMR3-NEXT: lw $6, 0($sp) # 4-byte Folded Reload ; MMR3-NEXT: sllv $3, $6, $3 ; MMR3-NEXT: or16 $3, $2 ; MMR3-NEXT: lw $2, 28($sp) # 4-byte Folded Reload ; MMR3-NEXT: srlv $2, $2, $16 -; MMR3-NEXT: lw $6, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $3, $2, $6 +; MMR3-NEXT: lw $17, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $3, $2, $17 ; MMR3-NEXT: movz $5, $8, $16 -; MMR3-NEXT: li16 $17, 0 -; MMR3-NEXT: movz $3, $17, $10 -; MMR3-NEXT: lw $17, 20($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $4, $9, $17 -; MMR3-NEXT: li16 $17, 0 -; MMR3-NEXT: movn $7, $17, $6 -; MMR3-NEXT: or16 $7, $4 +; MMR3-NEXT: li16 $6, 0 +; MMR3-NEXT: movz $3, $6, $10 +; MMR3-NEXT: lw $7, 20($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $4, $9, $7 +; MMR3-NEXT: lw $6, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: li16 $7, 0 +; MMR3-NEXT: movn $6, $7, $17 +; MMR3-NEXT: or16 $6, $4 ; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $1, $17, $4 -; MMR3-NEXT: li16 $17, 0 -; MMR3-NEXT: movn $1, $7, $10 +; MMR3-NEXT: movn $1, $7, $4 +; MMR3-NEXT: li16 $7, 0 +; MMR3-NEXT: movn $1, $6, $10 ; MMR3-NEXT: lw $4, 24($sp) # 4-byte Folded Reload ; MMR3-NEXT: movz $1, $4, $16 -; MMR3-NEXT: movn $2, $17, $6 +; MMR3-NEXT: movn $2, $7, $17 ; MMR3-NEXT: li16 $4, 0 ; MMR3-NEXT: movz $2, $4, $10 ; MMR3-NEXT: move $4, $1 @@ -856,91 +855,98 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; ; MMR6-LABEL: lshr_i128: ; MMR6: # %bb.0: # %entry -; MMR6-NEXT: addiu $sp, $sp, -24 -; MMR6-NEXT: .cfi_def_cfa_offset 24 -; MMR6-NEXT: sw $17, 20($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 16($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -32 +; MMR6-NEXT: .cfi_def_cfa_offset 32 +; MMR6-NEXT: sw 
$17, 28($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $16, 24($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 17, -4 ; MMR6-NEXT: .cfi_offset 16, -8 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: move $7, $4 -; MMR6-NEXT: lw $3, 52($sp) +; MMR6-NEXT: move $7, $5 +; MMR6-NEXT: lw $3, 60($sp) ; MMR6-NEXT: srlv $2, $1, $3 -; MMR6-NEXT: not16 $16, $3 -; MMR6-NEXT: sw $16, 8($sp) # 4-byte Folded Spill -; MMR6-NEXT: move $4, $6 -; MMR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MMR6-NEXT: not16 $5, $3 +; MMR6-NEXT: sw $5, 12($sp) # 4-byte Folded Spill +; MMR6-NEXT: move $17, $6 +; MMR6-NEXT: sw $6, 16($sp) # 4-byte Folded Spill ; MMR6-NEXT: sll16 $6, $6, 1 -; MMR6-NEXT: sllv $6, $6, $16 +; MMR6-NEXT: sllv $6, $6, $5 ; MMR6-NEXT: or $8, $6, $2 -; MMR6-NEXT: addiu $6, $3, -64 -; MMR6-NEXT: srlv $9, $5, $6 -; MMR6-NEXT: sll16 $2, $7, 1 -; MMR6-NEXT: sw $2, 4($sp) # 4-byte Folded Spill -; MMR6-NEXT: not16 $16, $6 +; MMR6-NEXT: addiu $5, $3, -64 +; MMR6-NEXT: srlv $9, $7, $5 +; MMR6-NEXT: move $6, $4 +; MMR6-NEXT: sll16 $2, $4, 1 +; MMR6-NEXT: sw $2, 8($sp) # 4-byte Folded Spill +; MMR6-NEXT: not16 $16, $5 ; MMR6-NEXT: sllv $10, $2, $16 ; MMR6-NEXT: andi16 $16, $3, 32 ; MMR6-NEXT: seleqz $8, $8, $16 ; MMR6-NEXT: or $9, $10, $9 -; MMR6-NEXT: srlv $10, $4, $3 +; MMR6-NEXT: srlv $10, $17, $3 ; MMR6-NEXT: selnez $11, $10, $16 ; MMR6-NEXT: li16 $17, 64 ; MMR6-NEXT: subu16 $2, $17, $3 -; MMR6-NEXT: sllv $12, $5, $2 +; MMR6-NEXT: sllv $12, $7, $2 +; MMR6-NEXT: move $17, $7 ; MMR6-NEXT: andi16 $4, $2, 32 -; MMR6-NEXT: andi16 $17, $6, 32 -; MMR6-NEXT: seleqz $9, $9, $17 +; MMR6-NEXT: andi16 $7, $5, 32 +; MMR6-NEXT: sw $7, 20($sp) # 4-byte Folded Spill +; MMR6-NEXT: seleqz $9, $9, $7 ; MMR6-NEXT: seleqz $13, $12, $4 ; MMR6-NEXT: or $8, $11, $8 ; MMR6-NEXT: selnez $11, $12, $4 -; MMR6-NEXT: sllv $12, $7, $2 +; MMR6-NEXT: sllv $12, $6, $2 +; MMR6-NEXT: move $7, $6 +; MMR6-NEXT: sw $6, 4($sp) # 4-byte Folded Spill ; MMR6-NEXT: not16 $2, $2 -; MMR6-NEXT: srl16 $6, $5, 1 +; MMR6-NEXT: srl16 $6, $17, 1 ; MMR6-NEXT: srlv $2, $6, $2 ; MMR6-NEXT: or $2, $12, $2 ; MMR6-NEXT: seleqz $2, $2, $4 -; MMR6-NEXT: addiu $4, $3, -64 -; MMR6-NEXT: srlv $4, $7, $4 -; MMR6-NEXT: or $12, $11, $2 -; MMR6-NEXT: or $6, $8, $13 -; MMR6-NEXT: srlv $5, $5, $3 -; MMR6-NEXT: selnez $8, $4, $17 -; MMR6-NEXT: sltiu $11, $3, 64 -; MMR6-NEXT: selnez $13, $6, $11 -; MMR6-NEXT: or $8, $8, $9 +; MMR6-NEXT: srlv $4, $7, $5 +; MMR6-NEXT: or $11, $11, $2 +; MMR6-NEXT: or $5, $8, $13 +; MMR6-NEXT: srlv $6, $17, $3 +; MMR6-NEXT: lw $2, 20($sp) # 4-byte Folded Reload +; MMR6-NEXT: selnez $7, $4, $2 +; MMR6-NEXT: sltiu $8, $3, 64 +; MMR6-NEXT: selnez $12, $5, $8 +; MMR6-NEXT: or $7, $7, $9 +; MMR6-NEXT: lw $5, 12($sp) # 4-byte Folded Reload ; MMR6-NEXT: lw $2, 8($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $6, 4($sp) # 4-byte Folded Reload -; MMR6-NEXT: sllv $9, $6, $2 +; MMR6-NEXT: sllv $9, $2, $5 ; MMR6-NEXT: seleqz $10, $10, $16 -; MMR6-NEXT: li16 $2, 0 -; MMR6-NEXT: or $10, $10, $12 -; MMR6-NEXT: or $9, $9, $5 -; MMR6-NEXT: seleqz $5, $8, $11 -; MMR6-NEXT: seleqz $8, $2, $11 -; MMR6-NEXT: srlv $7, $7, $3 -; MMR6-NEXT: seleqz $2, $7, $16 -; MMR6-NEXT: selnez $2, $2, $11 +; MMR6-NEXT: li16 $5, 0 +; MMR6-NEXT: or $10, $10, $11 +; MMR6-NEXT: or $6, $9, $6 +; MMR6-NEXT: seleqz $2, $7, $8 +; MMR6-NEXT: seleqz $7, $5, $8 +; MMR6-NEXT: lw $5, 4($sp) # 4-byte Folded Reload +; MMR6-NEXT: srlv $9, $5, $3 +; MMR6-NEXT: seleqz $11, $9, $16 +; MMR6-NEXT: selnez $11, $11, $8 ; MMR6-NEXT: seleqz $1, $1, $3 -; MMR6-NEXT: or $5, $13, $5 -; MMR6-NEXT: selnez $5, $5, $3 -; 
MMR6-NEXT: or $5, $1, $5 -; MMR6-NEXT: or $2, $8, $2 -; MMR6-NEXT: seleqz $1, $9, $16 -; MMR6-NEXT: selnez $6, $7, $16 -; MMR6-NEXT: lw $7, 12($sp) # 4-byte Folded Reload -; MMR6-NEXT: seleqz $7, $7, $3 -; MMR6-NEXT: selnez $9, $10, $11 -; MMR6-NEXT: seleqz $4, $4, $17 -; MMR6-NEXT: seleqz $4, $4, $11 -; MMR6-NEXT: or $4, $9, $4 +; MMR6-NEXT: or $2, $12, $2 +; MMR6-NEXT: selnez $2, $2, $3 +; MMR6-NEXT: or $5, $1, $2 +; MMR6-NEXT: or $2, $7, $11 +; MMR6-NEXT: seleqz $1, $6, $16 +; MMR6-NEXT: selnez $6, $9, $16 +; MMR6-NEXT: lw $16, 16($sp) # 4-byte Folded Reload +; MMR6-NEXT: seleqz $9, $16, $3 +; MMR6-NEXT: selnez $10, $10, $8 +; MMR6-NEXT: lw $16, 20($sp) # 4-byte Folded Reload +; MMR6-NEXT: seleqz $4, $4, $16 +; MMR6-NEXT: seleqz $4, $4, $8 +; MMR6-NEXT: or $4, $10, $4 ; MMR6-NEXT: selnez $3, $4, $3 -; MMR6-NEXT: or $4, $7, $3 +; MMR6-NEXT: or $4, $9, $3 ; MMR6-NEXT: or $1, $6, $1 -; MMR6-NEXT: selnez $1, $1, $11 -; MMR6-NEXT: or $3, $8, $1 -; MMR6-NEXT: lw $16, 16($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 20($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 24 +; MMR6-NEXT: selnez $1, $1, $8 +; MMR6-NEXT: or $3, $7, $1 +; MMR6-NEXT: lw $16, 24($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $17, 28($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 32 ; MMR6-NEXT: jrc $ra entry: diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll index 5050cf4..a8d829b 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll @@ -849,78 +849,77 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: swp $16, 32($sp) ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 -; MMR3-NEXT: sw $7, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: move $17, $6 -; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $17, $7 +; MMR3-NEXT: sw $7, 4($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $7, $6 ; MMR3-NEXT: move $1, $4 ; MMR3-NEXT: lw $16, 68($sp) ; MMR3-NEXT: li16 $2, 64 ; MMR3-NEXT: subu16 $6, $2, $16 -; MMR3-NEXT: srlv $9, $17, $6 -; MMR3-NEXT: andi16 $7, $6, 32 -; MMR3-NEXT: sw $7, 24($sp) # 4-byte Folded Spill +; MMR3-NEXT: srlv $9, $7, $6 +; MMR3-NEXT: andi16 $4, $6, 32 +; MMR3-NEXT: sw $4, 24($sp) # 4-byte Folded Spill ; MMR3-NEXT: li16 $3, 0 -; MMR3-NEXT: move $4, $9 -; MMR3-NEXT: movn $4, $3, $7 -; MMR3-NEXT: sllv $7, $1, $16 -; MMR3-NEXT: not16 $2, $16 -; MMR3-NEXT: sw $2, 20($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $2, $9 +; MMR3-NEXT: movn $2, $3, $4 +; MMR3-NEXT: sllv $3, $1, $16 +; MMR3-NEXT: sw $3, 16($sp) # 4-byte Folded Spill +; MMR3-NEXT: not16 $4, $16 +; MMR3-NEXT: sw $4, 20($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill ; MMR3-NEXT: srl16 $3, $5, 1 -; MMR3-NEXT: srlv $3, $3, $2 -; MMR3-NEXT: or16 $3, $7 -; MMR3-NEXT: sllv $5, $5, $16 -; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill -; MMR3-NEXT: andi16 $2, $16, 32 -; MMR3-NEXT: sw $2, 16($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $3, $5, $2 -; MMR3-NEXT: addiu $7, $16, -64 +; MMR3-NEXT: srlv $3, $3, $4 +; MMR3-NEXT: lw $4, 16($sp) # 4-byte Folded Reload ; MMR3-NEXT: or16 $3, $4 -; MMR3-NEXT: sllv $2, $17, $7 +; MMR3-NEXT: sllv $5, $5, $16 +; MMR3-NEXT: sw $5, 8($sp) # 4-byte Folded Spill +; MMR3-NEXT: andi16 $4, $16, 32 +; MMR3-NEXT: sw $4, 16($sp) # 4-byte Folded Spill +; MMR3-NEXT: movn $3, $5, $4 +; MMR3-NEXT: addiu $4, $16, -64 +; MMR3-NEXT: or16 $3, $2 +; MMR3-NEXT: sllv $2, $7, $4 ; MMR3-NEXT: sw $2, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: lw $4, 8($sp) 
# 4-byte Folded Reload -; MMR3-NEXT: srl16 $5, $4, 1 -; MMR3-NEXT: not16 $2, $7 +; MMR3-NEXT: srl16 $5, $17, 1 +; MMR3-NEXT: not16 $2, $4 ; MMR3-NEXT: srlv $2, $5, $2 -; MMR3-NEXT: lw $7, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $2, $7 -; MMR3-NEXT: addiu $7, $16, -64 -; MMR3-NEXT: sllv $8, $4, $7 -; MMR3-NEXT: andi16 $7, $7, 32 -; MMR3-NEXT: sw $7, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $2, $8, $7 +; MMR3-NEXT: lw $17, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $2, $17 +; MMR3-NEXT: lw $17, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: sllv $8, $17, $4 +; MMR3-NEXT: andi16 $4, $4, 32 +; MMR3-NEXT: sw $4, 12($sp) # 4-byte Folded Spill +; MMR3-NEXT: movn $2, $8, $4 ; MMR3-NEXT: sltiu $10, $16, 64 ; MMR3-NEXT: movn $2, $3, $10 -; MMR3-NEXT: srlv $3, $4, $6 -; MMR3-NEXT: sw $3, 0($sp) # 4-byte Folded Spill -; MMR3-NEXT: move $7, $4 +; MMR3-NEXT: srlv $4, $17, $6 ; MMR3-NEXT: not16 $3, $6 -; MMR3-NEXT: sll16 $4, $17, 1 -; MMR3-NEXT: sllv $3, $4, $3 -; MMR3-NEXT: lw $4, 0($sp) # 4-byte Folded Reload +; MMR3-NEXT: sll16 $6, $7, 1 +; MMR3-NEXT: sllv $3, $6, $3 ; MMR3-NEXT: or16 $3, $4 -; MMR3-NEXT: sllv $6, $17, $16 +; MMR3-NEXT: sllv $6, $7, $16 ; MMR3-NEXT: lw $4, 20($sp) # 4-byte Folded Reload ; MMR3-NEXT: srlv $4, $5, $4 ; MMR3-NEXT: or16 $4, $6 -; MMR3-NEXT: sllv $6, $7, $16 -; MMR3-NEXT: lw $7, 16($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $4, $6, $7 +; MMR3-NEXT: sllv $6, $17, $16 +; MMR3-NEXT: lw $17, 16($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $4, $6, $17 ; MMR3-NEXT: movz $2, $1, $16 ; MMR3-NEXT: li16 $5, 0 ; MMR3-NEXT: movz $4, $5, $10 -; MMR3-NEXT: lw $17, 24($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $3, $9, $17 -; MMR3-NEXT: lw $5, 4($sp) # 4-byte Folded Reload -; MMR3-NEXT: li16 $17, 0 -; MMR3-NEXT: movn $5, $17, $7 +; MMR3-NEXT: lw $7, 24($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $3, $9, $7 +; MMR3-NEXT: lw $5, 8($sp) # 4-byte Folded Reload +; MMR3-NEXT: li16 $7, 0 +; MMR3-NEXT: movn $5, $7, $17 ; MMR3-NEXT: or16 $5, $3 ; MMR3-NEXT: lw $3, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $8, $17, $3 -; MMR3-NEXT: li16 $17, 0 +; MMR3-NEXT: movn $8, $7, $3 +; MMR3-NEXT: li16 $7, 0 ; MMR3-NEXT: movn $8, $5, $10 ; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload ; MMR3-NEXT: movz $8, $3, $16 -; MMR3-NEXT: movn $6, $17, $7 +; MMR3-NEXT: movn $6, $7, $17 ; MMR3-NEXT: li16 $3, 0 ; MMR3-NEXT: movz $6, $3, $10 ; MMR3-NEXT: move $3, $8 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/sub.ll b/llvm/test/CodeGen/Mips/llvm-ir/sub.ll index bc9ce44..51dccce 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/sub.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/sub.ll @@ -162,32 +162,35 @@ entry: ; MMR3: lw $[[T20:[0-9]+]], 0($sp) ; MMR3: subu16 $5, $[[T19]], $[[T20]] -; MMR6: sw $7, 4($sp) -; MMR6: sw $4, 8($sp) +; MMR6: move $[[T0:[0-9]+]], $7 +; MMR6: sw $7, 8($sp) +; MMR6: move $[[T1:[0-9]+]], $5 +; MMR6: sw $4, 12($sp) ; MMR6: lw $[[T2:[0-9]+]], 48($sp) ; MMR6: sltu $[[T3:[0-9]+]], $6, $[[T2]] ; MMR6: xor $[[T4:[0-9]+]], $6, $[[T2]] ; MMR6: sltiu $[[T5:[0-9]+]], $[[T4]], 1 ; MMR6: seleqz $[[T6:[0-9]+]], $[[T3]], $[[T5]] ; MMR6: lw $[[T7:[0-9]+]], 52($sp) -; MMR6: sltu $[[T8:[0-9]+]], $7, $[[T7]] +; MMR6: sltu $[[T8:[0-9]+]], $[[T0]], $[[T7]] ; MMR6: selnez $[[T9:[0-9]+]], $[[T8]], $[[T5]] ; MMR6: or $[[T10:[0-9]+]], $[[T9]], $[[T6]] ; MMR6: lw $[[T11:[0-9]+]], 44($sp) -; MMR6: subu16 $[[T12:[0-9]+]], $5, $[[T11]] -; MMR6: lw $[[T1:[0-9]+]], 12($sp) -; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T1]] -; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T1]] -; MMR6: sltu 
$[[T17:[0-9]+]], $5, $[[T11]] -; MMR6: lw $[[T19:[0-9]+]], 8($sp) -; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $5 +; MMR6: subu16 $[[T12:[0-9]+]], $[[T1]], $[[T11]] +; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T7]] +; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T7]] +; MMR6: sltu $[[T17:[0-9]+]], $[[T1]], $[[T11]] +; MMR6: lw $[[T18:[0-9]+]], 40($sp) +; MMR6: lw $[[T19:[0-9]+]], 12($sp) +; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $[[T18]] ; MMR6: subu16 $[[T21:[0-9]+]], $[[T20]], $[[T17]] ; MMR6: subu16 $[[T22:[0-9]+]], $[[T21]], $[[T16]] ; MMR6: subu16 $[[T23:[0-9]+]], $6, $[[T2]] -; MMR6: subu16 $4, $[[T23]], $[[T8]] -; MMR6: lw $[[T24:[0-9]+]], 4($sp) -; MMR6: subu16 $5, $[[T24]], $[[T7]] -; MMR6: lw $3, 0($sp) +; MMR6: subu16 $4, $[[T23]], $5 +; MMR6: lw $[[T24:[0-9]+]], 8($sp) +; MMR6: lw $[[T25:[0-9]+]], 0($sp) +; MMR6: subu16 $5, $[[T24]], $[[T25]] +; MMR6: lw $3, 4($sp) ; FIXME: The sltu, dsll, dsrl pattern here occurs when an i32 is zero ; extended to 64 bits. Fortunately slt(i)(u) actually gives an i1. diff --git a/llvm/test/CodeGen/Mips/tls.ll b/llvm/test/CodeGen/Mips/tls.ll index 39bd856..4ef885e 100644 --- a/llvm/test/CodeGen/Mips/tls.ll +++ b/llvm/test/CodeGen/Mips/tls.ll @@ -71,8 +71,8 @@ define dso_preemptable i32 @f3() nounwind { entry: ; PIC32-LABEL: f3: ; PIC32: addu $[[R0:[a-z0-9]+]], $2, $25 -; PIC32: lw $25, %call16(__tls_get_addr)($[[R0]]) ; PIC32: addiu $4, $[[R0]], %tlsldm(f3.i) +; PIC32: lw $25, %call16(__tls_get_addr)($[[R0]]) ; PIC32: jalr $25 ; PIC32: lui $[[R0:[0-9]+]], %dtprel_hi(f3.i) ; PIC32: addu $[[R1:[0-9]+]], $[[R0]], $2 @@ -84,8 +84,8 @@ entry: ; PIC64: lui $[[R0:[a-z0-9]+]], %hi(%neg(%gp_rel(f3))) ; PIC64: daddu $[[R0]], $[[R0]], $25 ; PIC64: daddiu $[[R1:[a-z0-9]+]], $[[R0]], %lo(%neg(%gp_rel(f3))) -; PIC64: ld $25, %call16(__tls_get_addr)($[[R1]]) ; PIC64: daddiu $4, $[[R1]], %tlsldm(f3.i) +; PIC64: ld $25, %call16(__tls_get_addr)($[[R1]]) ; PIC64: jalr $25 ; PIC64: lui $[[R0:[0-9]+]], %dtprel_hi(f3.i) ; PIC64: daddu $[[R1:[0-9]+]], $[[R0]], $2 diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll index ee144e1..eaf650a 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll @@ -8388,17 +8388,17 @@ define i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and s0, s2, s3 +; RV32I-NEXT: addi s0, a0, -1 +; RV32I-NEXT: and s1, s2, s0 ; RV32I-NEXT: j .LBB100_2 ; RV32I-NEXT: .LBB100_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB100_2 Depth=1 ; RV32I-NEXT: sh a1, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2@plt @@ -8406,9 +8406,9 @@ define i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: bnez a0, .LBB100_4 ; RV32I-NEXT: .LBB100_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: and a0, a1, s0 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s0, a0, .LBB100_1 +; RV32I-NEXT: bltu s1, a0, .LBB100_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB100_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -8530,11 +8530,11 @@ define i16 @atomicrmw_umax_i16_acquire(i16 
*%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and s0, s2, s3 +; RV32I-NEXT: addi s0, a0, -1 +; RV32I-NEXT: and s1, s2, s0 ; RV32I-NEXT: j .LBB101_2 ; RV32I-NEXT: .LBB101_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB101_2 Depth=1 @@ -8542,15 +8542,15 @@ define i16 @atomicrmw_umax_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB101_4 ; RV32I-NEXT: .LBB101_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: and a0, a1, s0 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s0, a0, .LBB101_1 +; RV32I-NEXT: bltu s1, a0, .LBB101_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB101_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -8672,27 +8672,27 @@ define i16 @atomicrmw_umax_i16_release(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and s0, s2, s3 +; RV32I-NEXT: addi s0, a0, -1 +; RV32I-NEXT: and s1, s2, s0 ; RV32I-NEXT: j .LBB102_2 ; RV32I-NEXT: .LBB102_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB102_2 Depth=1 ; RV32I-NEXT: sh a1, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 3 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB102_4 ; RV32I-NEXT: .LBB102_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: and a0, a1, s0 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s0, a0, .LBB102_1 +; RV32I-NEXT: bltu s1, a0, .LBB102_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB102_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -8814,11 +8814,11 @@ define i16 @atomicrmw_umax_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and s0, s2, s3 +; RV32I-NEXT: addi s0, a0, -1 +; RV32I-NEXT: and s1, s2, s0 ; RV32I-NEXT: j .LBB103_2 ; RV32I-NEXT: .LBB103_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB103_2 Depth=1 @@ -8826,15 +8826,15 @@ define i16 @atomicrmw_umax_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB103_4 ; RV32I-NEXT: .LBB103_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: and a0, a1, s0 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s0, a0, .LBB103_1 +; RV32I-NEXT: bltu s1, a0, 
.LBB103_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB103_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -8956,11 +8956,11 @@ define i16 @atomicrmw_umax_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and s0, s2, s3 +; RV32I-NEXT: addi s0, a0, -1 +; RV32I-NEXT: and s1, s2, s0 ; RV32I-NEXT: j .LBB104_2 ; RV32I-NEXT: .LBB104_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB104_2 Depth=1 @@ -8968,15 +8968,15 @@ define i16 @atomicrmw_umax_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB104_4 ; RV32I-NEXT: .LBB104_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: and a0, a1, s0 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s0, a0, .LBB104_1 +; RV32I-NEXT: bltu s1, a0, .LBB104_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB104_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -9098,17 +9098,17 @@ define i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and s0, s2, s3 +; RV32I-NEXT: addi s0, a0, -1 +; RV32I-NEXT: and s1, s2, s0 ; RV32I-NEXT: j .LBB105_2 ; RV32I-NEXT: .LBB105_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB105_2 Depth=1 ; RV32I-NEXT: sh a1, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2@plt @@ -9116,9 +9116,9 @@ define i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: bnez a0, .LBB105_4 ; RV32I-NEXT: .LBB105_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: and a0, a1, s0 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s0, a0, .LBB105_1 +; RV32I-NEXT: bgeu s1, a0, .LBB105_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB105_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -9240,11 +9240,11 @@ define i16 @atomicrmw_umin_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and s0, s2, s3 +; RV32I-NEXT: addi s0, a0, -1 +; RV32I-NEXT: and s1, s2, s0 ; RV32I-NEXT: j .LBB106_2 ; RV32I-NEXT: .LBB106_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB106_2 Depth=1 @@ -9252,15 +9252,15 @@ define i16 @atomicrmw_umin_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, 
.LBB106_4
 ; RV32I-NEXT: .LBB106_2: # %atomicrmw.start
 ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
 ; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB106_1
+; RV32I-NEXT: bgeu s1, a0, .LBB106_1
 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB106_2 Depth=1
 ; RV32I-NEXT: mv a2, s2
@@ -9382,27 +9382,27 @@ define i16 @atomicrmw_umin_i16_release(i16 *%a, i16 %b) nounwind {
 ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
 ; RV32I-NEXT: lhu a1, 0(a0)
 ; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
 ; RV32I-NEXT: j .LBB107_2
 ; RV32I-NEXT: .LBB107_1: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB107_2 Depth=1
 ; RV32I-NEXT: sh a1, 10(sp)
 ; RV32I-NEXT: addi a1, sp, 10
 ; RV32I-NEXT: addi a3, zero, 3
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
 ; RV32I-NEXT: mv a4, zero
 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt
 ; RV32I-NEXT: lh a1, 10(sp)
 ; RV32I-NEXT: bnez a0, .LBB107_4
 ; RV32I-NEXT: .LBB107_2: # %atomicrmw.start
 ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
 ; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB107_1
+; RV32I-NEXT: bgeu s1, a0, .LBB107_1
 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB107_2 Depth=1
 ; RV32I-NEXT: mv a2, s2
@@ -9524,11 +9524,11 @@ define i16 @atomicrmw_umin_i16_acq_rel(i16 *%a, i16 %b) nounwind {
 ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
 ; RV32I-NEXT: lhu a1, 0(a0)
 ; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
 ; RV32I-NEXT: j .LBB108_2
 ; RV32I-NEXT: .LBB108_1: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB108_2 Depth=1
@@ -9536,15 +9536,15 @@ define i16 @atomicrmw_umin_i16_acq_rel(i16 *%a, i16 %b) nounwind {
 ; RV32I-NEXT: addi a1, sp, 10
 ; RV32I-NEXT: addi a3, zero, 4
 ; RV32I-NEXT: addi a4, zero, 2
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt
 ; RV32I-NEXT: lh a1, 10(sp)
 ; RV32I-NEXT: bnez a0, .LBB108_4
 ; RV32I-NEXT: .LBB108_2: # %atomicrmw.start
 ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
 ; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB108_1
+; RV32I-NEXT: bgeu s1, a0, .LBB108_1
 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB108_2 Depth=1
 ; RV32I-NEXT: mv a2, s2
@@ -9666,11 +9666,11 @@ define i16 @atomicrmw_umin_i16_seq_cst(i16 *%a, i16 %b) nounwind {
 ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
 ; RV32I-NEXT: lhu a1, 0(a0)
 ; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
 ; RV32I-NEXT: j .LBB109_2
 ; RV32I-NEXT: .LBB109_1: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB109_2 Depth=1
@@ -9678,15 +9678,15 @@ define i16 @atomicrmw_umin_i16_seq_cst(i16 *%a, i16 %b) nounwind {
 ; RV32I-NEXT: addi a1, sp, 10
 ; RV32I-NEXT: addi a3, zero, 5
 ; RV32I-NEXT: addi a4, zero, 5
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt
 ; RV32I-NEXT: lh a1, 10(sp)
 ; RV32I-NEXT: bnez a0, .LBB109_4
 ; RV32I-NEXT: .LBB109_2: # %atomicrmw.start
 ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
 ; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB109_1
+; RV32I-NEXT: bgeu s1, a0, .LBB109_1
 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB109_2 Depth=1
 ; RV32I-NEXT: mv a2, s2
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index 8fbaa84..e7ec2bf 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -1952,17 +1952,17 @@ define signext i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind {
 ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
 ; RV32I-NEXT: lhu a1, 0(a0)
 ; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
 ; RV32I-NEXT: j .LBB23_2
 ; RV32I-NEXT: .LBB23_1: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB23_2 Depth=1
 ; RV32I-NEXT: sh a1, 10(sp)
 ; RV32I-NEXT: addi a1, sp, 10
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
 ; RV32I-NEXT: mv a3, zero
 ; RV32I-NEXT: mv a4, zero
 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt
@@ -1970,9 +1970,9 @@ define signext i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind {
 ; RV32I-NEXT: bnez a0, .LBB23_4
 ; RV32I-NEXT: .LBB23_2: # %atomicrmw.start
 ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
 ; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bltu s0, a0, .LBB23_1
+; RV32I-NEXT: bltu s1, a0, .LBB23_1
 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB23_2 Depth=1
 ; RV32I-NEXT: mv a2, s2
@@ -2100,17 +2100,17 @@ define signext i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind {
 ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a0
 ; RV32I-NEXT: lhu a1, 0(a0)
 ; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and s0, s2, s3
+; RV32I-NEXT: addi s0, a0, -1
+; RV32I-NEXT: and s1, s2, s0
 ; RV32I-NEXT: j .LBB24_2
 ; RV32I-NEXT: .LBB24_1: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB24_2 Depth=1
 ; RV32I-NEXT: sh a1, 10(sp)
 ; RV32I-NEXT: addi a1, sp, 10
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s3
 ; RV32I-NEXT: mv a3, zero
 ; RV32I-NEXT: mv a4, zero
 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt
@@ -2118,9 +2118,9 @@ define signext i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind {
 ; RV32I-NEXT: bnez a0, .LBB24_4
 ; RV32I-NEXT: .LBB24_2: # %atomicrmw.start
 ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: and a0, a1, s0
 ; RV32I-NEXT: mv a2, a1
-; RV32I-NEXT: bgeu s0, a0, .LBB24_1
+; RV32I-NEXT: bgeu s1, a0, .LBB24_1
 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT: # in Loop: Header=BB24_2 Depth=1
 ; RV32I-NEXT: mv a2, s2
diff --git a/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll
index 8db5859..d3a332c 100644
--- a/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll
@@ -569,21 +569,21 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s1, a1
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: mv s4, a0
 ; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: not a1, s0
+; RV32I-NEXT: not a1, s4
 ; RV32I-NEXT: and a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 1
 ; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: addi s5, a2, 1365
+; RV32I-NEXT: and a1, a1, s5
 ; RV32I-NEXT: sub a0, a0, a1
 ; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s5, a1, 819
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
@@ -591,26 +591,26 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT: addi s6, a1, -241
 ; RV32I-NEXT: and a0, a0, s6
 ; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
 ; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: addi a0, s1, -1
-; RV32I-NEXT: not a1, s1
+; RV32I-NEXT: addi a0, s3, -1
+; RV32I-NEXT: not a1, s3
 ; RV32I-NEXT: and a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, s5
 ; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
 ; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
-; RV32I-NEXT: bnez s0, .LBB7_2
+; RV32I-NEXT: bnez s4, .LBB7_2
 ; RV32I-NEXT: # %bb.1:
 ; RV32I-NEXT: srli a0, a0, 24
 ; RV32I-NEXT: addi a0, a0, 32
@@ -967,21 +967,21 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
 ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s1, a1
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: mv s4, a0
 ; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: not a1, s0
+; RV32I-NEXT: not a1, s4
 ; RV32I-NEXT: and a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 1
 ; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: addi s5, a2, 1365
+; RV32I-NEXT: and a1, a1, s5
 ; RV32I-NEXT: sub a0, a0, a1
 ; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s5, a1, 819
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
@@ -989,26 +989,26 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
 ; RV32I-NEXT: addi s6, a1, -241
 ; RV32I-NEXT: and a0, a0, s6
 ; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
 ; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: addi a0, s1, -1
-; RV32I-NEXT: not a1, s1
+; RV32I-NEXT: addi a0, s3, -1
+; RV32I-NEXT: not a1, s3
 ; RV32I-NEXT: and a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, s5
 ; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
 ; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
-; RV32I-NEXT: bnez s0, .LBB11_2
+; RV32I-NEXT: bnez s4, .LBB11_2
 ; RV32I-NEXT: # %bb.1:
 ; RV32I-NEXT: srli a0, a0, 24
 ; RV32I-NEXT: addi a0, a0, 32
@@ -1173,17 +1173,17 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s2, a0
 ; RV32I-NEXT: srli a0, a1, 1
 ; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s2, a2, 1365
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: addi s3, a2, 1365
+; RV32I-NEXT: and a0, a0, s3
 ; RV32I-NEXT: sub a0, a1, a0
 ; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s1, a1, 819
-; RV32I-NEXT: and a1, a0, s1
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s1
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
@@ -1191,21 +1191,21 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV32I-NEXT: addi s4, a1, -241
 ; RV32I-NEXT: and a0, a0, s4
 ; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
 ; RV32I-NEXT: srli s5, a0, 24
-; RV32I-NEXT: srli a0, s0, 1
-; RV32I-NEXT: and a0, a0, s2
-; RV32I-NEXT: sub a0, s0, a0
-; RV32I-NEXT: and a1, a0, s1
+; RV32I-NEXT: srli a0, s2, 1
+; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: sub a0, s2, a0
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s1
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
 ; RV32I-NEXT: and a0, a0, s4
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
 ; RV32I-NEXT: srli a0, a0, 24
 ; RV32I-NEXT: add a0, a0, s5
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 2616592..7a6cf4a7 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1192,52 +1192,54 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32IM-NEXT: addi sp, sp, -16
 ; RV32IM-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT: lw a6, 12(a1)
-; RV32IM-NEXT: lw t0, 8(a1)
+; RV32IM-NEXT: lw a7, 8(a1)
 ; RV32IM-NEXT: lw a4, 0(a1)
 ; RV32IM-NEXT: lw a1, 4(a1)
 ; RV32IM-NEXT: lui a5, 1048575
-; RV32IM-NEXT: addi a7, a5, 256
-; RV32IM-NEXT: mulhu a2, a4, a7
-; RV32IM-NEXT: mul a5, a1, a7
-; RV32IM-NEXT: add a2, a5, a2
-; RV32IM-NEXT: sltu a5, a2, a5
-; RV32IM-NEXT: mulhu a3, a1, a7
-; RV32IM-NEXT: add t5, a3, a5
-; RV32IM-NEXT: sub t1, a2, a4
+; RV32IM-NEXT: addi a5, a5, 256
+; RV32IM-NEXT: mulhu a2, a4, a5
+; RV32IM-NEXT: mul a3, a1, a5
+; RV32IM-NEXT: add a2, a3, a2
+; RV32IM-NEXT: sltu t0, a2, a3
+; RV32IM-NEXT: mulhu a3, a1, a5
+; RV32IM-NEXT: add t5, a3, t0
+; RV32IM-NEXT: sub t0, a2, a4
 ; RV32IM-NEXT: neg t4, a4
-; RV32IM-NEXT: sltu a2, t1, t4
+; RV32IM-NEXT: sltu t1, t0, t4
 ; RV32IM-NEXT: addi t2, zero, -1
 ; RV32IM-NEXT: mulhu t3, a4, t2
-; RV32IM-NEXT: add a2, t3, a2
-; RV32IM-NEXT: add a2, t5, a2
-; RV32IM-NEXT: sub a5, a2, a1
-; RV32IM-NEXT: mul a3, t0, a7
-; RV32IM-NEXT: sub a3, a3, a4
-; RV32IM-NEXT: add t6, a5, a3
-; RV32IM-NEXT: sltu s0, t6, a5
+; RV32IM-NEXT: add a2, t3, t1
+; RV32IM-NEXT: add t1, t5, a2
+; RV32IM-NEXT: sub a3, t1, a1
+; RV32IM-NEXT: mul a2, a7, a5
+; RV32IM-NEXT: sub a2, a2, a4
+; RV32IM-NEXT: add t6, a3, a2
+; RV32IM-NEXT: sltu s2, t6, a3
 ; RV32IM-NEXT: neg s1, a1
-; RV32IM-NEXT: sltu a5, a5, s1
-; RV32IM-NEXT: sltu a2, a2, t5
-; RV32IM-NEXT: mulhu s1, a1, t2
-; RV32IM-NEXT: add a2, s1, a2
-; RV32IM-NEXT: add a2, a2, a5
-; RV32IM-NEXT: sltu a3, a3, t4
-; RV32IM-NEXT: mul a5, a6, a7
-; RV32IM-NEXT: mulhu s1, t0, a7
-; RV32IM-NEXT: sub s1, s1, t0
-; RV32IM-NEXT: add a5, s1, a5
-; RV32IM-NEXT: sub s1, t3, a4
-; RV32IM-NEXT: sub a1, s1, a1
-; RV32IM-NEXT: add a1, a1, a5
-; RV32IM-NEXT: add a1, a1, a3
-; RV32IM-NEXT: add a1, a2, a1
-; RV32IM-NEXT: add a1, a1, s0
-; RV32IM-NEXT: mul a2, a4, a7
+; RV32IM-NEXT: sltu a3, a3, s1
+; RV32IM-NEXT: sltu s1, t1, t5
+; RV32IM-NEXT: mulhu s0, a1, t2
+; RV32IM-NEXT: add s1, s0, s1
+; RV32IM-NEXT: add a3, s1, a3
+; RV32IM-NEXT: sltu a2, a2, t4
+; RV32IM-NEXT: mul s1, a6, a5
+; RV32IM-NEXT: mulhu s0, a7, a5
+; RV32IM-NEXT: sub s0, s0, a7
+; RV32IM-NEXT: add s1, s0, s1
+; RV32IM-NEXT: sub s0, t3, a4
+; RV32IM-NEXT: sub a1, s0, a1
+; RV32IM-NEXT: add a1, a1, s1
+; RV32IM-NEXT: add a1, a1, a2
+; RV32IM-NEXT: add a1, a3, a1
+; RV32IM-NEXT: add a1, a1, s2
+; RV32IM-NEXT: mul a2, a4, a5
 ; RV32IM-NEXT: sw a2, 0(a0)
-; RV32IM-NEXT: sw t1, 4(a0)
+; RV32IM-NEXT: sw t0, 4(a0)
 ; RV32IM-NEXT: sw t6, 8(a0)
 ; RV32IM-NEXT: sw a1, 12(a0)
+; RV32IM-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll b/llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll
index 32cae6f..7f7cf04 100644
--- a/llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll
+++ b/llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll
@@ -19,18 +19,18 @@ define half @half_test(half %a, half %b) nounwind {
 ; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: mv s0, a1
 ; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s2, a1, -1
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: addi s1, a1, -1
+; RV32I-NEXT: and a0, a0, s1
 ; RV32I-NEXT: call __gnu_h2f_ieee@plt
-; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s2
+; RV32I-NEXT: mv s2, a0
+; RV32I-NEXT: and a0, s0, s1
 ; RV32I-NEXT: call __gnu_h2f_ieee@plt
 ; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a0, s2
 ; RV32I-NEXT: mv a1, s0
 ; RV32I-NEXT: call __addsf3@plt
 ; RV32I-NEXT: call __gnu_f2h_ieee@plt
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: and a0, a0, s1
 ; RV32I-NEXT: call __gnu_h2f_ieee@plt
 ; RV32I-NEXT: mv a1, s0
 ; RV32I-NEXT: call __divsf3@plt
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll
index 9f87f10..e30edbd 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll
@@ -218,38 +218,38 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
 define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: rol_i64:
 ; RV32I: # %bb.0:
-; RV32I-NEXT: mv t1, a1
+; RV32I-NEXT: mv a7, a1
 ; RV32I-NEXT: andi a1, a2, 63
-; RV32I-NEXT: addi a7, a1, -32
+; RV32I-NEXT: addi t0, a1, -32
 ; RV32I-NEXT: addi a6, zero, 31
-; RV32I-NEXT: bltz a7, .LBB7_2
+; RV32I-NEXT: bltz t0, .LBB7_2
 ; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sll a1, a0, a7
+; RV32I-NEXT: sll a1, a0, t0
 ; RV32I-NEXT: j .LBB7_3
 ; RV32I-NEXT: .LBB7_2:
-; RV32I-NEXT: sll a4, t1, a2
+; RV32I-NEXT: sll a3, a7, a2
 ; RV32I-NEXT: sub a1, a6, a1
-; RV32I-NEXT: srli a5, a0, 1
-; RV32I-NEXT: srl a1, a5, a1
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: srli a4, a0, 1
+; RV32I-NEXT: srl a1, a4, a1
+; RV32I-NEXT: or a1, a3, a1
 ; RV32I-NEXT: .LBB7_3:
 ; RV32I-NEXT: neg a5, a2
-; RV32I-NEXT: andi a4, a5, 63
-; RV32I-NEXT: addi t0, a4, -32
-; RV32I-NEXT: bltz t0, .LBB7_5
+; RV32I-NEXT: andi a3, a5, 63
+; RV32I-NEXT: addi a4, a3, -32
+; RV32I-NEXT: bltz a4, .LBB7_5
 ; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: srl a3, t1, t0
-; RV32I-NEXT: bltz a7, .LBB7_6
+; RV32I-NEXT: srl a3, a7, a4
+; RV32I-NEXT: bltz t0, .LBB7_6
 ; RV32I-NEXT: j .LBB7_7
 ; RV32I-NEXT: .LBB7_5:
-; RV32I-NEXT: srl a3, t1, a5
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: srl a3, a0, a5
-; RV32I-NEXT: sub a4, a6, a4
-; RV32I-NEXT: slli a5, t1, 1
-; RV32I-NEXT: sll a4, a5, a4
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: bgez a7, .LBB7_7
+; RV32I-NEXT: srl a4, a7, a5
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: srl a4, a0, a5
+; RV32I-NEXT: sub a3, a6, a3
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: sll a3, a5, a3
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: bgez t0, .LBB7_7
 ; RV32I-NEXT: .LBB7_6:
 ; RV32I-NEXT: sll a0, a0, a2
 ; RV32I-NEXT: or a3, a3, a0
@@ -265,67 +265,67 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; RV32B-NEXT: sub a5, a6, a4
 ; RV32B-NEXT: srli a3, a0, 1
 ; RV32B-NEXT: srl a3, a3, a5
-; RV32B-NEXT: or a3, a7, a3
-; RV32B-NEXT: addi a7, a4, -32
-; RV32B-NEXT: sll a5, a0, a7
-; RV32B-NEXT: slti a4, a7, 0
-; RV32B-NEXT: cmov t0, a4, a3, a5
-; RV32B-NEXT: neg a4, a2
-; RV32B-NEXT: srl t2, a1, a4
-; RV32B-NEXT: andi a3, a4, 63
-; RV32B-NEXT: addi t1, a3, -32
-; RV32B-NEXT: srai a5, t1, 31
-; RV32B-NEXT: and a5, a5, t2
-; RV32B-NEXT: or t0, t0, a5
-; RV32B-NEXT: srl a4, a0, a4
-; RV32B-NEXT: sub a3, a6, a3
-; RV32B-NEXT: slli a5, a1, 1
-; RV32B-NEXT: sll a3, a5, a3
-; RV32B-NEXT: or a3, a4, a3
-; RV32B-NEXT: srl a1, a1, t1
-; RV32B-NEXT: slti a4, t1, 0
+; RV32B-NEXT: or a7, a7, a3
+; RV32B-NEXT: addi t1, a4, -32
+; RV32B-NEXT: sll a5, a0, t1
+; RV32B-NEXT: slti a3, t1, 0
+; RV32B-NEXT: cmov a7, a3, a7, a5
+; RV32B-NEXT: neg a5, a2
+; RV32B-NEXT: srl t0, a1, a5
+; RV32B-NEXT: andi t2, a5, 63
+; RV32B-NEXT: addi a4, t2, -32
+; RV32B-NEXT: srai a3, a4, 31
+; RV32B-NEXT: and a3, a3, t0
+; RV32B-NEXT: or a7, a7, a3
+; RV32B-NEXT: srl t0, a0, a5
+; RV32B-NEXT: sub a5, a6, t2
+; RV32B-NEXT: slli a3, a1, 1
+; RV32B-NEXT: sll a3, a3, a5
+; RV32B-NEXT: or a3, t0, a3
+; RV32B-NEXT: srl a1, a1, a4
+; RV32B-NEXT: slti a4, a4, 0
 ; RV32B-NEXT: cmov a1, a4, a3, a1
 ; RV32B-NEXT: sll a0, a0, a2
-; RV32B-NEXT: srai a2, a7, 31
+; RV32B-NEXT: srai a2, t1, 31
 ; RV32B-NEXT: and a0, a2, a0
 ; RV32B-NEXT: or a0, a0, a1
-; RV32B-NEXT: mv a1, t0
+; RV32B-NEXT: mv a1, a7
 ; RV32B-NEXT: ret
 ;
 ; RV32ZBB-LABEL: rol_i64:
 ; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: mv t1, a1
+; RV32ZBB-NEXT: mv a7, a1
 ; RV32ZBB-NEXT: andi a1, a2, 63
-; RV32ZBB-NEXT: addi a7, a1, -32
+; RV32ZBB-NEXT: addi t0, a1, -32
 ; RV32ZBB-NEXT: addi a6, zero, 31
-; RV32ZBB-NEXT: bltz a7, .LBB7_2
+; RV32ZBB-NEXT: bltz t0, .LBB7_2
 ; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: sll a1, a0, a7
+; RV32ZBB-NEXT: sll a1, a0, t0
 ; RV32ZBB-NEXT: j .LBB7_3
 ; RV32ZBB-NEXT: .LBB7_2:
-; RV32ZBB-NEXT: sll a4, t1, a2
+; RV32ZBB-NEXT: sll a3, a7, a2
 ; RV32ZBB-NEXT: sub a1, a6, a1
-; RV32ZBB-NEXT: srli a5, a0, 1
-; RV32ZBB-NEXT: srl a1, a5, a1
-; RV32ZBB-NEXT: or a1, a4, a1
+; RV32ZBB-NEXT: srli a4, a0, 1
+; RV32ZBB-NEXT: srl a1, a4, a1
+; RV32ZBB-NEXT: or a1, a3, a1
 ; RV32ZBB-NEXT: .LBB7_3:
 ; RV32ZBB-NEXT: neg a5, a2
-; RV32ZBB-NEXT: andi a4, a5, 63
-; RV32ZBB-NEXT: addi t0, a4, -32
-; RV32ZBB-NEXT: bltz t0, .LBB7_5
+; RV32ZBB-NEXT: andi a3, a5, 63
+; RV32ZBB-NEXT: addi a4, a3, -32
+; RV32ZBB-NEXT: bltz a4, .LBB7_5
 ; RV32ZBB-NEXT: # %bb.4:
-; RV32ZBB-NEXT: srl a3, t1, t0
-; RV32ZBB-NEXT: bltz a7, .LBB7_6
+; RV32ZBB-NEXT: srl a3, a7, a4
+; RV32ZBB-NEXT: bltz t0, .LBB7_6
 ; RV32ZBB-NEXT: j .LBB7_7
 ; RV32ZBB-NEXT: .LBB7_5:
-; RV32ZBB-NEXT: srl a3, t1, a5
-; RV32ZBB-NEXT: or a1, a1, a3
-; RV32ZBB-NEXT: srl a3, a0, a5
-; RV32ZBB-NEXT: sub a4, a6, a4
-; RV32ZBB-NEXT: slli a5, t1, 1
-; RV32ZBB-NEXT: sll a4, a5, a4
-; RV32ZBB-NEXT: or a3, a3, a4
-; RV32ZBB-NEXT: bgez a7, .LBB7_7
+; RV32ZBB-NEXT: srl a4, a7, a5
+; RV32ZBB-NEXT: or a1, a1, a4
+; RV32ZBB-NEXT: srl a4, a0, a5
+; RV32ZBB-NEXT: sub a3, a6, a3
+; RV32ZBB-NEXT: slli a5, a7, 1
+; RV32ZBB-NEXT: sll a3, a5, a3
+; RV32ZBB-NEXT: or a3, a4, a3
+; RV32ZBB-NEXT: bgez t0, .LBB7_7
 ; RV32ZBB-NEXT: .LBB7_6:
 ; RV32ZBB-NEXT: sll a0, a0, a2
 ; RV32ZBB-NEXT: or a3, a3, a0
@@ -335,38 +335,38 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ;
 ; RV32ZBP-LABEL: rol_i64:
 ; RV32ZBP: # %bb.0:
-; RV32ZBP-NEXT: mv t1, a1
+; RV32ZBP-NEXT: mv a7, a1
 ; RV32ZBP-NEXT: andi a1, a2, 63
-; RV32ZBP-NEXT: addi a7, a1, -32
+; RV32ZBP-NEXT: addi t0, a1, -32
 ; RV32ZBP-NEXT: addi a6, zero, 31
-; RV32ZBP-NEXT: bltz a7, .LBB7_2
+; RV32ZBP-NEXT: bltz t0, .LBB7_2
 ; RV32ZBP-NEXT: # %bb.1:
-; RV32ZBP-NEXT: sll a1, a0, a7
+; RV32ZBP-NEXT: sll a1, a0, t0
 ; RV32ZBP-NEXT: j .LBB7_3
 ; RV32ZBP-NEXT: .LBB7_2:
-; RV32ZBP-NEXT: sll a4, t1, a2
+; RV32ZBP-NEXT: sll a3, a7, a2
 ; RV32ZBP-NEXT: sub a1, a6, a1
-; RV32ZBP-NEXT: srli a5, a0, 1
-; RV32ZBP-NEXT: srl a1, a5, a1
-; RV32ZBP-NEXT: or a1, a4, a1
+; RV32ZBP-NEXT: srli a4, a0, 1
+; RV32ZBP-NEXT: srl a1, a4, a1
+; RV32ZBP-NEXT: or a1, a3, a1
 ; RV32ZBP-NEXT: .LBB7_3:
 ; RV32ZBP-NEXT: neg a5, a2
-; RV32ZBP-NEXT: andi a4, a5, 63
-; RV32ZBP-NEXT: addi t0, a4, -32
-; RV32ZBP-NEXT: bltz t0, .LBB7_5
+; RV32ZBP-NEXT: andi a3, a5, 63
+; RV32ZBP-NEXT: addi a4, a3, -32
+; RV32ZBP-NEXT: bltz a4, .LBB7_5
 ; RV32ZBP-NEXT: # %bb.4:
-; RV32ZBP-NEXT: srl a3, t1, t0
-; RV32ZBP-NEXT: bltz a7, .LBB7_6
+; RV32ZBP-NEXT: srl a3, a7, a4
+; RV32ZBP-NEXT: bltz t0, .LBB7_6
 ; RV32ZBP-NEXT: j .LBB7_7
 ; RV32ZBP-NEXT: .LBB7_5:
-; RV32ZBP-NEXT: srl a3, t1, a5
-; RV32ZBP-NEXT: or a1, a1, a3
-; RV32ZBP-NEXT: srl a3, a0, a5
-; RV32ZBP-NEXT: sub a4, a6, a4
-; RV32ZBP-NEXT: slli a5, t1, 1
-; RV32ZBP-NEXT: sll a4, a5, a4
-; RV32ZBP-NEXT: or a3, a3, a4
-; RV32ZBP-NEXT: bgez a7, .LBB7_7
+; RV32ZBP-NEXT: srl a4, a7, a5
+; RV32ZBP-NEXT: or a1, a1, a4
+; RV32ZBP-NEXT: srl a4, a0, a5
+; RV32ZBP-NEXT: sub a3, a6, a3
+; RV32ZBP-NEXT: slli a5, a7, 1
+; RV32ZBP-NEXT: sll a3, a5, a3
+; RV32ZBP-NEXT: or a3, a4, a3
+; RV32ZBP-NEXT: bgez t0, .LBB7_7
 ; RV32ZBP-NEXT: .LBB7_6:
 ; RV32ZBP-NEXT: sll a0, a0, a2
 ; RV32ZBP-NEXT: or a3, a3, a0
@@ -416,7 +416,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64)
 define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: ror_i64:
 ; RV32I: # %bb.0:
-; RV32I-NEXT: mv t1, a0
+; RV32I-NEXT: mv t0, a0
 ; RV32I-NEXT: andi a0, a2, 63
 ; RV32I-NEXT: addi a7, a0, -32
 ; RV32I-NEXT: addi a6, zero, 31
@@ -425,26 +425,26 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT: srl a0, a1, a7
 ; RV32I-NEXT: j .LBB9_3
 ; RV32I-NEXT: .LBB9_2:
-; RV32I-NEXT: srl a4, t1, a2
+; RV32I-NEXT: srl a3, t0, a2
 ; RV32I-NEXT: sub a0, a6, a0
-; RV32I-NEXT: slli a5, a1, 1
-; RV32I-NEXT: sll a0, a5, a0
-; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: slli a4, a1, 1
+; RV32I-NEXT: sll a0, a4, a0
+; RV32I-NEXT: or a0, a3, a0
 ; RV32I-NEXT: .LBB9_3:
 ; RV32I-NEXT: neg a5, a2
 ; RV32I-NEXT: andi a4, a5, 63
-; RV32I-NEXT: addi t0, a4, -32
-; RV32I-NEXT: bltz t0, .LBB9_5
+; RV32I-NEXT: addi a3, a4, -32
+; RV32I-NEXT: bltz a3, .LBB9_5
 ; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: sll a3, t1, t0
+; RV32I-NEXT: sll a3, t0, a3
 ; RV32I-NEXT: bltz a7, .LBB9_6
 ; RV32I-NEXT: j .LBB9_7
 ; RV32I-NEXT: .LBB9_5:
-; RV32I-NEXT: sll a3, t1, a5
+; RV32I-NEXT: sll a3, t0, a5
 ; RV32I-NEXT: or a0, a0, a3
 ; RV32I-NEXT: sll a3, a1, a5
 ; RV32I-NEXT: sub a4, a6, a4
-; RV32I-NEXT: srli a5, t1, 1
+; RV32I-NEXT: srli a5, t0, 1
 ; RV32I-NEXT: srl a4, a5, a4
 ; RV32I-NEXT: or a3, a3, a4
 ; RV32I-NEXT: bgez a7, .LBB9_7
@@ -463,36 +463,36 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; RV32B-NEXT: sub a5, a6, a4
 ; RV32B-NEXT: slli a3, a1, 1
 ; RV32B-NEXT: sll a3, a3, a5
-; RV32B-NEXT: or a3, a7, a3
-; RV32B-NEXT: addi a7, a4, -32
-; RV32B-NEXT: srl a5, a1, a7
-; RV32B-NEXT: slti a4, a7, 0
-; RV32B-NEXT: cmov t0, a4, a3, a5
-; RV32B-NEXT: neg a4, a2
-; RV32B-NEXT: sll t2, a0, a4
-; RV32B-NEXT: andi a3, a4, 63
-; RV32B-NEXT: addi t1, a3, -32
-; RV32B-NEXT: srai a5, t1, 31
-; RV32B-NEXT: and a5, a5, t2
-; RV32B-NEXT: or t0, t0, a5
-; RV32B-NEXT: sll a4, a1, a4
-; RV32B-NEXT: sub a3, a6, a3
-; RV32B-NEXT: srli a5, a0, 1
-; RV32B-NEXT: srl a3, a5, a3
-; RV32B-NEXT: or a3, a4, a3
-; RV32B-NEXT: sll a0, a0, t1
-; RV32B-NEXT: slti a4, t1, 0
+; RV32B-NEXT: or a7, a7, a3
+; RV32B-NEXT: addi t1, a4, -32
+; RV32B-NEXT: srl a5, a1, t1
+; RV32B-NEXT: slti a3, t1, 0
+; RV32B-NEXT: cmov a7, a3, a7, a5
+; RV32B-NEXT: neg a5, a2
+; RV32B-NEXT: sll t0, a0, a5
+; RV32B-NEXT: andi t2, a5, 63
+; RV32B-NEXT: addi a4, t2, -32
+; RV32B-NEXT: srai a3, a4, 31
+; RV32B-NEXT: and a3, a3, t0
+; RV32B-NEXT: or a7, a7, a3
+; RV32B-NEXT: sll t0, a1, a5
+; RV32B-NEXT: sub a5, a6, t2
+; RV32B-NEXT: srli a3, a0, 1
+; RV32B-NEXT: srl a3, a3, a5
+; RV32B-NEXT: or a3, t0, a3
+; RV32B-NEXT: sll a0, a0, a4
+; RV32B-NEXT: slti a4, a4, 0
 ; RV32B-NEXT: cmov a0, a4, a3, a0
 ; RV32B-NEXT: srl a1, a1, a2
-; RV32B-NEXT: srai a2, a7, 31
+; RV32B-NEXT: srai a2, t1, 31
 ; RV32B-NEXT: and a1, a2, a1
 ; RV32B-NEXT: or a1, a1, a0
-; RV32B-NEXT: mv a0, t0
+; RV32B-NEXT: mv a0, a7
 ; RV32B-NEXT: ret
 ;
 ; RV32ZBB-LABEL: ror_i64:
 ; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: mv t1, a0
+; RV32ZBB-NEXT: mv t0, a0
 ; RV32ZBB-NEXT: andi a0, a2, 63
 ; RV32ZBB-NEXT: addi a7, a0, -32
 ; RV32ZBB-NEXT: addi a6, zero, 31
@@ -501,26 +501,26 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT: srl a0, a1, a7
 ; RV32ZBB-NEXT: j .LBB9_3
 ; RV32ZBB-NEXT: .LBB9_2:
-; RV32ZBB-NEXT: srl a4, t1, a2
+; RV32ZBB-NEXT: srl a3, t0, a2
 ; RV32ZBB-NEXT: sub a0, a6, a0
-; RV32ZBB-NEXT: slli a5, a1, 1
-; RV32ZBB-NEXT: sll a0, a5, a0
-; RV32ZBB-NEXT: or a0, a4, a0
+; RV32ZBB-NEXT: slli a4, a1, 1
+; RV32ZBB-NEXT: sll a0, a4, a0
+; RV32ZBB-NEXT: or a0, a3, a0
 ; RV32ZBB-NEXT: .LBB9_3:
 ; RV32ZBB-NEXT: neg a5, a2
 ; RV32ZBB-NEXT: andi a4, a5, 63
-; RV32ZBB-NEXT: addi t0, a4, -32
-; RV32ZBB-NEXT: bltz t0, .LBB9_5
+; RV32ZBB-NEXT: addi a3, a4, -32
+; RV32ZBB-NEXT: bltz a3, .LBB9_5
 ; RV32ZBB-NEXT: # %bb.4:
-; RV32ZBB-NEXT: sll a3, t1, t0
+; RV32ZBB-NEXT: sll a3, t0, a3
 ; RV32ZBB-NEXT: bltz a7, .LBB9_6
 ; RV32ZBB-NEXT: j .LBB9_7
 ; RV32ZBB-NEXT: .LBB9_5:
-; RV32ZBB-NEXT: sll a3, t1, a5
+; RV32ZBB-NEXT: sll a3, t0, a5
 ; RV32ZBB-NEXT: or a0, a0, a3
 ; RV32ZBB-NEXT: sll a3, a1, a5
 ; RV32ZBB-NEXT: sub a4, a6, a4
-; RV32ZBB-NEXT: srli a5, t1, 1
+; RV32ZBB-NEXT: srli a5, t0, 1
 ; RV32ZBB-NEXT: srl a4, a5, a4
 ; RV32ZBB-NEXT: or a3, a3, a4
 ; RV32ZBB-NEXT: bgez a7, .LBB9_7
@@ -533,7 +533,7 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ;
 ; RV32ZBP-LABEL: ror_i64:
 ; RV32ZBP: # %bb.0:
-; RV32ZBP-NEXT: mv t1, a0
+; RV32ZBP-NEXT: mv t0, a0
 ; RV32ZBP-NEXT: andi a0, a2, 63
 ; RV32ZBP-NEXT: addi a7, a0, -32
 ; RV32ZBP-NEXT: addi a6, zero, 31
@@ -542,26 +542,26 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBP-NEXT: srl a0, a1, a7
 ; RV32ZBP-NEXT: j .LBB9_3
 ; RV32ZBP-NEXT: .LBB9_2:
-; RV32ZBP-NEXT: srl a4, t1, a2
+; RV32ZBP-NEXT: srl a3, t0, a2
 ; RV32ZBP-NEXT: sub a0, a6, a0
-; RV32ZBP-NEXT: slli a5, a1, 1
-; RV32ZBP-NEXT: sll a0, a5, a0
-; RV32ZBP-NEXT: or a0, a4, a0
+; RV32ZBP-NEXT: slli a4, a1, 1
+; RV32ZBP-NEXT: sll a0, a4, a0
+; RV32ZBP-NEXT: or a0, a3, a0
 ; RV32ZBP-NEXT: .LBB9_3:
 ; RV32ZBP-NEXT: neg a5, a2
 ; RV32ZBP-NEXT: andi a4, a5, 63
-; RV32ZBP-NEXT: addi t0, a4, -32
-; RV32ZBP-NEXT: bltz t0, .LBB9_5
+; RV32ZBP-NEXT: addi a3, a4, -32
+; RV32ZBP-NEXT: bltz a3, .LBB9_5
 ; RV32ZBP-NEXT: # %bb.4:
-; RV32ZBP-NEXT: sll a3, t1, t0
+; RV32ZBP-NEXT: sll a3, t0, a3
 ; RV32ZBP-NEXT: bltz a7, .LBB9_6
 ; RV32ZBP-NEXT: j .LBB9_7
 ; RV32ZBP-NEXT: .LBB9_5:
-; RV32ZBP-NEXT: sll a3, t1, a5
+; RV32ZBP-NEXT: sll a3, t0, a5
 ; RV32ZBP-NEXT: or a0, a0, a3
 ; RV32ZBP-NEXT: sll a3, a1, a5
 ; RV32ZBP-NEXT: sub a4, a6, a4
-; RV32ZBP-NEXT: srli a5, t1, 1
+; RV32ZBP-NEXT: srli a5, t0, 1
 ; RV32ZBP-NEXT: srl a4, a5, a4
 ; RV32ZBP-NEXT: or a3, a3, a4
 ; RV32ZBP-NEXT: bgez a7, .LBB9_7
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index c8ac9e6..9cf2c3a 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -80,8 +80,8 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s0, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: mv s4, a0
 ; RV32I-NEXT: srli a0, a1, 1
 ; RV32I-NEXT: or a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 2
@@ -95,14 +95,14 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT: not a0, a0
 ; RV32I-NEXT: srli a1, a0, 1
 ; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: addi s5, a2, 1365
+; RV32I-NEXT: and a1, a1, s5
 ; RV32I-NEXT: sub a0, a0, a1
 ; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s5, a1, 819
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: addi s1, a1, 819
+; RV32I-NEXT: and a1, a0, s1
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s1
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
@@ -110,12 +110,12 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT: addi s6, a1, -241
 ; RV32I-NEXT: and a0, a0, s6
 ; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s0, a1, 257
+; RV32I-NEXT: mv a1, s0
 ; RV32I-NEXT: call __mulsi3@plt
 ; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: srli a0, s1, 1
-; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: srli a0, s4, 1
+; RV32I-NEXT: or a0, s4, a0
 ; RV32I-NEXT: srli a1, a0, 2
 ; RV32I-NEXT: or a0, a0, a1
 ; RV32I-NEXT: srli a1, a0, 4
@@ -126,18 +126,18 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT: or a0, a0, a1
 ; RV32I-NEXT: not a0, a0
 ; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, s5
 ; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, s1
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s1
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
 ; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s0
 ; RV32I-NEXT: call __mulsi3@plt
-; RV32I-NEXT: bnez s0, .LBB1_2
+; RV32I-NEXT: bnez s3, .LBB1_2
 ; RV32I-NEXT: # %bb.1:
 ; RV32I-NEXT: srli a0, a0, 24
 ; RV32I-NEXT: addi a0, a0, 32
@@ -248,21 +248,21 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s1, a1
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: mv s4, a0
 ; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: not a1, s0
+; RV32I-NEXT: not a1, s4
 ; RV32I-NEXT: and a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 1
 ; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: addi s5, a2, 1365
+; RV32I-NEXT: and a1, a1, s5
 ; RV32I-NEXT: sub a0, a0, a1
 ; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s5, a1, 819
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
@@ -270,26 +270,26 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT: addi s6, a1, -241
 ; RV32I-NEXT: and a0, a0, s6
 ; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
 ; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: addi a0, s1, -1
-; RV32I-NEXT: not a1, s1
+; RV32I-NEXT: addi a0, s3, -1
+; RV32I-NEXT: not a1, s3
 ; RV32I-NEXT: and a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, s5
 ; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
 ; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
-; RV32I-NEXT: bnez s0, .LBB3_2
+; RV32I-NEXT: bnez s4, .LBB3_2
 ; RV32I-NEXT: # %bb.1:
 ; RV32I-NEXT: srli a0, a0, 24
 ; RV32I-NEXT: addi a0, a0, 32
@@ -391,17 +391,17 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s2, a0
 ; RV32I-NEXT: srli a0, a1, 1
 ; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s2, a2, 1365
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: addi s3, a2, 1365
+; RV32I-NEXT: and a0, a0, s3
 ; RV32I-NEXT: sub a0, a1, a0
 ; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s1, a1, 819
-; RV32I-NEXT: and a1, a0, s1
+; RV32I-NEXT: addi s0, a1, 819
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s1
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
@@ -409,21 +409,21 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV32I-NEXT: addi s4, a1, -241
 ; RV32I-NEXT: and a0, a0, s4
 ; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
 ; RV32I-NEXT: srli s5, a0, 24
-; RV32I-NEXT: srli a0, s0, 1
-; RV32I-NEXT: and a0, a0, s2
-; RV32I-NEXT: sub a0, s0, a0
-; RV32I-NEXT: and a1, a0, s1
+; RV32I-NEXT: srli a0, s2, 1
+; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: sub a0, s2, a0
+; RV32I-NEXT: and a1, a0, s0
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s1
+; RV32I-NEXT: and a0, a0, s0
 ; RV32I-NEXT: add a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 4
 ; RV32I-NEXT: add a0, a0, a1
 ; RV32I-NEXT: and a0, a0, s4
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __mulsi3@plt
 ; RV32I-NEXT: srli a0, a0, 24
 ; RV32I-NEXT: add a0, a0, s5
diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll
index 32ef963..1d5702a 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll
@@ -1067,47 +1067,47 @@ define i64 @gorc3b_i64(i64 %a) nounwind {
 ; RV32I-NEXT: slli a2, a0, 1
 ; RV32I-NEXT: slli a3, a1, 1
 ; RV32I-NEXT: lui a4, 699051
-; RV32I-NEXT: addi a6, a4, -1366
-; RV32I-NEXT: and a7, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: addi a4, a4, -1366
+; RV32I-NEXT: and a6, a3, a4
+; RV32I-NEXT: and a7, a2, a4
 ; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: srli a4, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: addi t0, a3, 1365
-; RV32I-NEXT: and a4, a4, t0
-; RV32I-NEXT: and a5, a5, t0
+; RV32I-NEXT: srli a3, a0, 1
+; RV32I-NEXT: lui a2, 349525
+; RV32I-NEXT: addi a2, a2, 1365
+; RV32I-NEXT: and a3, a3, a2
+; RV32I-NEXT: and a5, a5, a2
 ; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: or a0, a0, a2
-; RV32I-NEXT: or a1, a1, a7
-; RV32I-NEXT: slli a2, a1, 2
-; RV32I-NEXT: slli a4, a0, 2
-; RV32I-NEXT: lui a5, 838861
-; RV32I-NEXT: addi a5, a5, -820
-; RV32I-NEXT: and a7, a4, a5
-; RV32I-NEXT: and a2, a2, a5
-; RV32I-NEXT: srli a5, a0, 2
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: slli a6, a1, 2
+; RV32I-NEXT: slli a5, a0, 2
+; RV32I-NEXT: lui a3, 838861
+; RV32I-NEXT: addi a3, a3, -820
+; RV32I-NEXT: and a7, a5, a3
+; RV32I-NEXT: and a6, a6, a3
+; RV32I-NEXT: srli t0, a0, 2
 ; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: and a4, a5, a4
-; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: lui a5, 209715
+; RV32I-NEXT: addi a5, a5, 819
+; RV32I-NEXT: and a3, a3, a5
+; RV32I-NEXT: and a5, t0, a5
+; RV32I-NEXT: or a0, a5, a0
 ; RV32I-NEXT: or a1, a3, a1
-; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: or a1, a1, a6
 ; RV32I-NEXT: or a0, a0, a7
-; RV32I-NEXT: slli a2, a0, 1
-; RV32I-NEXT: slli a3, a1, 1
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: slli a3, a0, 1
+; RV32I-NEXT: slli a5, a1, 1
+; RV32I-NEXT: and a6, a5, a4
+; RV32I-NEXT: and a3, a3, a4
 ; RV32I-NEXT: srli a4, a1, 1
 ; RV32I-NEXT: srli a5, a0, 1
-; RV32I-NEXT: and a5, a5, t0
-; RV32I-NEXT: and a4, a4, t0
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: and a5, a5, a2
+; RV32I-NEXT: and a2, a4, a2
+; RV32I-NEXT: or a1, a2, a1
 ; RV32I-NEXT: or a0, a5, a0
-; RV32I-NEXT: or a0, a0, a2
-; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: or a1, a1, a6
 ; RV32I-NEXT: ret
 ;
 ; RV32B-LABEL: gorc3b_i64:
@@ -2049,9 +2049,9 @@ define i64 @grev2b_i64(i64 %a) nounwind {
 ; RV32I-NEXT: slli a2, a0, 1
 ; RV32I-NEXT: slli a3, a1, 1
 ; RV32I-NEXT: lui a4, 699051
-; RV32I-NEXT: addi a6, a4, -1366
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: addi a4, a4, -1366
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: and a2, a2, a4
 ; RV32I-NEXT: srli a0, a0, 1
 ; RV32I-NEXT: srli a1, a1, 1
 ; RV32I-NEXT: lui a5, 349525
@@ -2060,24 +2060,24 @@ define i64 @grev2b_i64(i64 %a) nounwind {
 ; RV32I-NEXT: and a0, a0, a5
 ; RV32I-NEXT: or a0, a2, a0
 ; RV32I-NEXT: or a1, a3, a1
-; RV32I-NEXT: slli a2, a1, 2
+; RV32I-NEXT: slli a6, a1, 2
 ; RV32I-NEXT: slli a3, a0, 2
-; RV32I-NEXT: lui a4, 838861
-; RV32I-NEXT: addi a4, a4, -820
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: and a2, a2, a4
+; RV32I-NEXT: lui a2, 838861
+; RV32I-NEXT: addi a2, a2, -820
+; RV32I-NEXT: and a7, a3, a2
+; RV32I-NEXT: and a2, a6, a2
 ; RV32I-NEXT: srli a1, a1, 2
 ; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: and a1, a1, a4
+; RV32I-NEXT: lui a3, 209715
+; RV32I-NEXT: addi a3, a3, 819
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a3
 ; RV32I-NEXT: or a1, a2, a1
-; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a0, a7, a0
 ; RV32I-NEXT: slli a2, a0, 1
 ; RV32I-NEXT: slli a3, a1, 1
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: and a2, a2, a4
 ; RV32I-NEXT: srli a0, a0, 1
 ; RV32I-NEXT: srli a1, a1, 1
 ; RV32I-NEXT: and a1, a1, a5
@@ -2186,51 +2186,51 @@ define i64 @grev0_i64(i64 %a) nounwind {
 ; RV32I-NEXT: slli a2, a1, 1
 ; RV32I-NEXT: slli a3, a0, 1
 ; RV32I-NEXT: lui a4, 699051
-; RV32I-NEXT: addi a6, a4, -1366
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: addi a4, a4, -1366
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: and a2, a2, a4
 ; RV32I-NEXT: srli a1, a1, 1
 ; RV32I-NEXT: srli a0, a0, 1
 ; RV32I-NEXT: lui a5, 349525
-; RV32I-NEXT: addi a7, a5, 1365
-; RV32I-NEXT: and a0, a0, a7
-; RV32I-NEXT: and a1, a1, a7
+; RV32I-NEXT: addi a5, a5, 1365
+; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a1, a1, a5
 ; RV32I-NEXT: or a1, a2, a1
 ; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: slli a2, a0, 2
+; RV32I-NEXT: slli a6, a0, 2
 ; RV32I-NEXT: slli a3, a1, 2
-; RV32I-NEXT: lui a4, 838861
-; RV32I-NEXT: addi a4, a4, -820
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: and a2, a2, a4
+; RV32I-NEXT: lui a2, 838861
+; RV32I-NEXT: addi a2, a2, -820
+; RV32I-NEXT: and a7, a3, a2
+; RV32I-NEXT: and a6, a6, a2
 ; RV32I-NEXT: srli a0, a0, 2
 ; RV32I-NEXT: srli a1, a1, 2
-; RV32I-NEXT: lui a5, 209715
-; RV32I-NEXT: addi a5, a5, 819
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: or a0, a2, a0
-; RV32I-NEXT: or a1, a3, a1
-; RV32I-NEXT: slli a2, a1, 1
-; RV32I-NEXT: slli a3, a0, 1
-; RV32I-NEXT: and a3, a3, a6
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: lui a3, 209715
+; RV32I-NEXT: addi a3, a3, 819
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: or t0, a6, a0
+; RV32I-NEXT: or a1, a7, a1
+; RV32I-NEXT: slli a6, a1, 1
+; RV32I-NEXT: slli a0, t0, 1
+; RV32I-NEXT: and a7, a0, a4
+; RV32I-NEXT: and a4, a6, a4
 ; RV32I-NEXT: srli a1, a1, 1
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: and a0, a0, a7
-; RV32I-NEXT: and a1, a1, a7
-; RV32I-NEXT: or a1, a2, a1
-; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: slli a2, a0, 2
-; RV32I-NEXT: slli a3, a1, 2
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: and a2, a2, a4
+; RV32I-NEXT: srli a0, t0, 1
+; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a1, a1, a5
+; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: or a0, a7, a0
+; RV32I-NEXT: slli a4, a0, 2
+; RV32I-NEXT: slli a5, a1, 2
+; RV32I-NEXT: and a5, a5, a2
+; RV32I-NEXT: and a2, a4, a2
 ; RV32I-NEXT: srli a0, a0, 2
 ; RV32I-NEXT: srli a1, a1, 2
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: and a0, a0, a3
 ; RV32I-NEXT: or a0, a2, a0
-; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: or a1, a5, a1
 ; RV32I-NEXT: ret
 ;
 ; RV32B-LABEL: grev0_i64:
@@ -2580,13 +2580,13 @@ define i64 @bitreverse_i64(i64 %a) nounwind {
 ; RV32I: # %bb.0:
 ; RV32I-NEXT: srli a2, a1, 8
 ; RV32I-NEXT: lui a3, 16
-; RV32I-NEXT: addi a6, a3, -256
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: addi a7, a3, -256
+; RV32I-NEXT: and a2, a2, a7
 ; RV32I-NEXT: srli a4, a1, 24
 ; RV32I-NEXT: or a2, a2, a4
 ; RV32I-NEXT: slli a4, a1, 8
-; RV32I-NEXT: lui a7, 4080
-; RV32I-NEXT: and a4, a4, a7
+; RV32I-NEXT: lui a6, 4080
+; RV32I-NEXT: and a4, a4, a6
 ; RV32I-NEXT: slli a1, a1, 24
 ; RV32I-NEXT: or a1, a1, a4
 ; RV32I-NEXT: or a1, a1, a2
@@ -2598,25 +2598,25 @@ define i64 @bitreverse_i64(i64 %a) nounwind {
 ; RV32I-NEXT: slli a1, a1, 4
 ; RV32I-NEXT: or a1, a2, a1
 ; RV32I-NEXT: srli a2, a1, 2
-; RV32I-NEXT: lui a3, 209715
-; RV32I-NEXT: addi a3, a3, 819
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: lui a5, 209715
+; RV32I-NEXT: addi a5, a5, 819
+; RV32I-NEXT: and a2, a2, a5
+; RV32I-NEXT: and a1, a1, a5
 ; RV32I-NEXT: slli a1, a1, 2
 ; RV32I-NEXT: or a1, a2, a1
 ; RV32I-NEXT: srli a2, a1, 1
-; RV32I-NEXT: lui a5, 349525
-; RV32I-NEXT: addi a5, a5, 1365
-; RV32I-NEXT: and a2, a2, a5
-; RV32I-NEXT: and a1, a1, a5
+; RV32I-NEXT: lui a3, 349525
+; RV32I-NEXT: addi a3, a3, 1365
+; RV32I-NEXT: and a2, a2, a3
+; RV32I-NEXT: and a1, a1, a3
 ; RV32I-NEXT: slli a1, a1, 1
 ; RV32I-NEXT: or t0, a2, a1
 ; RV32I-NEXT: srli a1, a0, 8
-; RV32I-NEXT: and a1, a1, a6
+; RV32I-NEXT: and a1, a1, a7
 ; RV32I-NEXT: srli a2, a0, 24
 ; RV32I-NEXT: or a1, a1, a2
 ; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: and a2, a2, a7
+; RV32I-NEXT: and a2, a2, a6
 ; RV32I-NEXT: slli a0, a0, 24
 ; RV32I-NEXT: or a0, a0, a2
 ; RV32I-NEXT: or a0, a0, a1
@@ -2626,13 +2626,13 @@ define i64 @bitreverse_i64(i64 %a) nounwind {
 ; RV32I-NEXT: slli a0, a0, 4
 ; RV32I-NEXT: or a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 2
-; RV32I-NEXT: and a1, a1, a3
-; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a5
+; RV32I-NEXT: and a0, a0, a5
 ; RV32I-NEXT: slli a0, a0, 2
 ; RV32I-NEXT: or a0, a1, a0
 ; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: and a0, a0, a3
 ; RV32I-NEXT: slli a0, a0, 1
 ; RV32I-NEXT: or a1, a1, a0
 ; RV32I-NEXT: mv a0, t0
@@ -2789,76 +2789,76 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
 ; RV32I: # %bb.0:
 ; RV32I-NEXT: srli a3, a1, 8
 ; RV32I-NEXT: lui a2, 16
-; RV32I-NEXT: addi a6, a2, -256
-; RV32I-NEXT: and a3, a3, a6
+; RV32I-NEXT: addi a7, a2, -256
+; RV32I-NEXT: and a3, a3, a7
 ; RV32I-NEXT: srli a4, a1, 24
 ; RV32I-NEXT: or a4, a3, a4
 ; RV32I-NEXT: slli a5, a1, 8
-; RV32I-NEXT: lui a7, 4080
-; RV32I-NEXT: and a5, a5, a7
+; RV32I-NEXT: lui a6, 4080
+; RV32I-NEXT: and a5, a5, a6
 ; RV32I-NEXT: slli a1, a1, 24
 ; RV32I-NEXT: or a1, a1, a5
 ; RV32I-NEXT: or a1, a1, a4
 ; RV32I-NEXT: srli a4, a1, 4
 ; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi t0, a5, -241
-; RV32I-NEXT: and a4, a4, t0
-; RV32I-NEXT: and a1, a1, t0
+; RV32I-NEXT: addi a5, a5, -241
+; RV32I-NEXT: and a4, a4, a5
+; RV32I-NEXT: and a1, a1, a5
 ; RV32I-NEXT: slli a1, a1, 4
 ; RV32I-NEXT: or a1, a4, a1
 ; RV32I-NEXT: srli a4, a1, 2
-; RV32I-NEXT: lui a2, 209715
-; RV32I-NEXT: addi a2, a2, 819
-; RV32I-NEXT: and a4, a4, a2
-; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: lui a3, 209715
+; RV32I-NEXT: addi a3, a3, 819
+; RV32I-NEXT: and a4, a4, a3
+; RV32I-NEXT: and a1, a1, a3
 ; RV32I-NEXT: slli a1, a1, 2
 ; RV32I-NEXT: or a1, a4, a1
 ; RV32I-NEXT: srli a4, a1, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a4, a4, a3
-; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: lui a2, 349525
+; RV32I-NEXT: addi a2, a2, 1365
+; RV32I-NEXT: and a4, a4, a2
+; RV32I-NEXT: and a1, a1, a2
 ; RV32I-NEXT: slli a1, a1, 1
 ; RV32I-NEXT: or a1, a4, a1
 ; RV32I-NEXT: srli a4, a0, 8
+; RV32I-NEXT: and t0, a4, a7
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: or t0, t0, a4
+; RV32I-NEXT: slli a4, a0, 8
 ; RV32I-NEXT: and a4, a4, a6
-; RV32I-NEXT: srli a5, a0, 24
-; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a5, a0, 8
-; RV32I-NEXT: and a5, a5, a7
 ; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a5
 ; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: or a0, a0, t0
 ; RV32I-NEXT: srli a4, a0, 4
-; RV32I-NEXT: and a4, a4, t0
-; RV32I-NEXT: and a0, a0, t0
+; RV32I-NEXT: and a4, a4, a5
+; RV32I-NEXT: and a0, a0, a5
 ; RV32I-NEXT: slli a0, a0, 4
 ; RV32I-NEXT: or a0, a4, a0
 ; RV32I-NEXT: srli a4, a0, 2
-; RV32I-NEXT: and a4, a4, a2
-; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: and a4, a4, a3
+; RV32I-NEXT: and a0, a0, a3
 ; RV32I-NEXT: slli a0, a0, 2
 ; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: srli a3, a0, 1
+; RV32I-NEXT: and a3, a3, a2
+; RV32I-NEXT: and a0, a0, a2
 ; RV32I-NEXT: slli a0, a0, 1
-; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: or a0, a3, a0
 ; RV32I-NEXT: srli a2, a0, 8
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: and a2, a2, a7
 ; RV32I-NEXT: srli a3, a0, 24
 ; RV32I-NEXT: or a2, a2, a3
 ; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: and a3, a3, a7
+; RV32I-NEXT: and a3, a3, a6
 ; RV32I-NEXT: slli a0, a0, 24
 ; RV32I-NEXT: or a0, a0, a3
 ; RV32I-NEXT: or a0, a0, a2
 ; RV32I-NEXT: srli a2, a1, 8
-; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: and a2, a2, a7
 ; RV32I-NEXT: srli a3, a1, 24
 ; RV32I-NEXT: or a2, a2, a3
 ; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: and a3, a3, a7
+; RV32I-NEXT: and a3, a3, a6
 ; RV32I-NEXT: slli a1, a1, 24
 ; RV32I-NEXT: or a1, a1, a3
 ; RV32I-NEXT: or a1, a1, a2
diff --git a/llvm/test/CodeGen/RISCV/rv32zbt.ll b/llvm/test/CodeGen/RISCV/rv32zbt.ll
index 06f2a26..6b420de 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbt.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbt.ll
@@ -457,24 +457,24 @@ define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind {
 ; RV32I-NEXT: srl a1, a1, a5
 ; RV32I-NEXT: or a1, t0, a1
 ; RV32I-NEXT: .LBB13_3:
-; RV32I-NEXT: not t0, a4
-; RV32I-NEXT: andi t3, t0, 63
-; RV32I-NEXT: addi t2, t3, -32
-; RV32I-NEXT: srli t1, a3, 1
-; RV32I-NEXT: bltz t2, .LBB13_5
+; RV32I-NEXT: not t2, a4
+; RV32I-NEXT: andi t1, t2, 63
+; RV32I-NEXT: addi a5, t1, -32
+; RV32I-NEXT: srli t0, a3, 1
+; RV32I-NEXT: bltz a5, .LBB13_5
 ; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: srl a2, t1, t2
+; RV32I-NEXT: srl a2, t0, a5
 ; RV32I-NEXT: bltz a7, .LBB13_6
 ; RV32I-NEXT: j .LBB13_7
 ; RV32I-NEXT: .LBB13_5:
-; RV32I-NEXT: srl a5, t1, t0
+; RV32I-NEXT: srl a5, t0, t2
 ; RV32I-NEXT: or a1, a1, a5
 ; RV32I-NEXT: slli a3, a3, 31
 ; RV32I-NEXT: srli a2, a2, 1
 ; RV32I-NEXT: or a2, a2, a3
-; RV32I-NEXT: srl a2, a2, t0
-; RV32I-NEXT: sub a3, a6, t3
-; RV32I-NEXT: slli a5, t1, 1
+; RV32I-NEXT: srl a2, a2, t2
+; RV32I-NEXT: sub a3, a6, t1
+; RV32I-NEXT: slli a5, t0, 1
 ; RV32I-NEXT: sll a3, a5, a3
 ; RV32I-NEXT: or a2, a2, a3
 ; RV32I-NEXT: bgez a7, .LBB13_7
@@ -493,31 +493,31 @@ define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind {
 ; RV32B-NEXT: sub t0, a6, a5
 ; RV32B-NEXT: srli a1, a0, 1
 ; RV32B-NEXT: srl a1, a1, t0
-; RV32B-NEXT: or t0, a7, a1
-; RV32B-NEXT: addi a7, a5, -32
-; RV32B-NEXT: sll a5, a0, a7
-; RV32B-NEXT: slti a1, a7, 0
-; RV32B-NEXT: cmov t1, a1, t0, a5
-; RV32B-NEXT: not t0, a4
-; RV32B-NEXT: srli a5, a3, 1
-; RV32B-NEXT: srl t2, a5, t0
+; RV32B-NEXT: or a7, a7, a1
+; RV32B-NEXT: addi t1, a5, -32
+; RV32B-NEXT: sll t0, a0, t1
+; RV32B-NEXT: slti a1, t1, 0
+; RV32B-NEXT: cmov t0, a1, a7, t0
+; RV32B-NEXT: not a7, a4
+; RV32B-NEXT: srli t4, a3, 1
+; RV32B-NEXT: srl t2, t4, a7
 ; RV32B-NEXT: addi a1, zero, 63
 ; RV32B-NEXT: andn t3, a1, a4
-; RV32B-NEXT: addi t4, t3, -32
-; RV32B-NEXT: srai a1, t4, 31
+; RV32B-NEXT: addi a5, t3, -32
+; RV32B-NEXT: srai a1, a5, 31
 ; RV32B-NEXT: and a1, a1, t2
-; RV32B-NEXT: or a1, t1, a1
+; RV32B-NEXT: or a1, t0, a1
 ; RV32B-NEXT: fsri a2, a2, a3, 1
-; RV32B-NEXT: srl t0, a2, t0
+; RV32B-NEXT: srl a7, a2, a7
 ; RV32B-NEXT: sub a3, a6, t3
-; RV32B-NEXT: slli a2, a5, 1
+; RV32B-NEXT: slli a2, t4, 1
 ; RV32B-NEXT: sll a2, a2, a3
-; RV32B-NEXT: or a2, t0, a2
-; RV32B-NEXT: srl a3, a5, t4
-; RV32B-NEXT: slti a5, t4, 0
+; RV32B-NEXT: or a2, a7, a2
+; RV32B-NEXT: srl a3, t4, a5
+; RV32B-NEXT: slti a5, a5, 0
 ; RV32B-NEXT: cmov a2, a5, a2, a3
 ; RV32B-NEXT: sll a0, a0, a4
-; RV32B-NEXT: srai a3, a7, 31
+; RV32B-NEXT: srai a3, t1, 31
 ; RV32B-NEXT: and a0, a3, a0
 ; RV32B-NEXT: or a0, a0, a2
 ; RV32B-NEXT: ret
@@ -530,30 +530,30 @@ define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind {
 ; RV32ZBT-NEXT: sub t0, a6, a5
 ; RV32ZBT-NEXT: srli a1, a0, 1
 ; RV32ZBT-NEXT: srl a1, a1, t0
-; RV32ZBT-NEXT: or t0, a7, a1
-; RV32ZBT-NEXT: addi a7, a5, -32
-; RV32ZBT-NEXT: sll a5, a0, a7
-; RV32ZBT-NEXT: slti a1, a7, 0
-; RV32ZBT-NEXT: cmov t1, a1, t0, a5
-; RV32ZBT-NEXT: not t0, a4
-; RV32ZBT-NEXT: srli a5, a3, 1
-; RV32ZBT-NEXT: srl t4, a5, t0
-; RV32ZBT-NEXT: andi t2, t0, 63
+; RV32ZBT-NEXT: or a7, a7, a1
+; RV32ZBT-NEXT: addi t1, a5, -32
+; RV32ZBT-NEXT: sll t0, a0, t1
+; RV32ZBT-NEXT: slti a1, t1, 0
+; RV32ZBT-NEXT: cmov t0, a1, a7, t0
+; RV32ZBT-NEXT: not a5, a4
+; RV32ZBT-NEXT: srli a7, a3, 1
+; RV32ZBT-NEXT: srl t4, a7, a5
+; RV32ZBT-NEXT: andi t2, a5, 63
 ; RV32ZBT-NEXT: addi t3, t2, -32
 ; RV32ZBT-NEXT: srai a1, t3, 31
 ; RV32ZBT-NEXT: and a1, a1, t4
-; RV32ZBT-NEXT: or a1, t1, a1
+; RV32ZBT-NEXT: or a1, t0, a1
 ; RV32ZBT-NEXT: fsri a2, a2, a3, 1
-; RV32ZBT-NEXT: srl t0, a2, t0
+; RV32ZBT-NEXT: srl a2, a2, a5
 ; RV32ZBT-NEXT: sub a3, a6, t2
-; RV32ZBT-NEXT: slli a2, a5, 1
-; RV32ZBT-NEXT: sll a2, a2, a3
-; RV32ZBT-NEXT: or a2, t0, a2
-; RV32ZBT-NEXT: srl a3, a5, t3
+; RV32ZBT-NEXT: slli a5, a7, 1
+; RV32ZBT-NEXT: sll a3, a5, a3
+; RV32ZBT-NEXT: or a2, a2, a3
+; RV32ZBT-NEXT: srl a3, a7, t3
 ; RV32ZBT-NEXT: slti a5, t3, 0
 ; RV32ZBT-NEXT: cmov a2, a5, a2, a3
 ; RV32ZBT-NEXT: sll a0, a0, a4
-; RV32ZBT-NEXT: srai a3, a7, 31
+; RV32ZBT-NEXT: srai a3, t1, 31
 ; RV32ZBT-NEXT: and a0, a3, a0
 ; RV32ZBT-NEXT: or a0, a0, a2
 ; RV32ZBT-NEXT: ret
@@ -599,7 +599,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64)
 define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind {
 ; RV32I-LABEL: fshr_i64:
 ; RV32I: # %bb.0:
-; RV32I-NEXT: mv t1, a0
+; RV32I-NEXT: mv t0, a0
 ; RV32I-NEXT: andi a0, a4, 63
 ; RV32I-NEXT: addi a6, a0, -32
 ; RV32I-NEXT: addi a7, zero, 31
@@ -614,27 +614,27 @@ define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind {
 ; RV32I-NEXT: sll a0, a5, a0
 ; RV32I-NEXT: or a0, a2, a0
 ; RV32I-NEXT: .LBB15_3:
-; RV32I-NEXT: not t0, a4
-; RV32I-NEXT: andi a2, t0, 63
-; RV32I-NEXT: addi t2, a2, -32
-; RV32I-NEXT: slli a5, t1, 1
-; RV32I-NEXT: bltz t2, .LBB15_5
+; RV32I-NEXT: not t2, a4
+; RV32I-NEXT: andi a5, t2, 63
+; RV32I-NEXT: addi a2, a5, -32
+; RV32I-NEXT: slli t1, t0, 1
+; RV32I-NEXT: bltz a2, .LBB15_5
 ; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: sll a1, a5, t2
+; RV32I-NEXT: sll a1, t1, a2
 ; RV32I-NEXT: bltz a6, .LBB15_6
 ; RV32I-NEXT: j .LBB15_7
 ; RV32I-NEXT: .LBB15_5:
-; RV32I-NEXT: sll a5, a5, t0
-; RV32I-NEXT: or a0, a0, a5
-; RV32I-NEXT: lui a5, 524288
-; RV32I-NEXT: addi a5, a5, -1
-; RV32I-NEXT: and a5, t1, a5
-; RV32I-NEXT: sub a2, a7, a2
-; RV32I-NEXT: srl a2, a5, a2
-; RV32I-NEXT: srli a5, t1, 31
+; RV32I-NEXT: sll a2, t1, t2
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: lui a2, 524288
+; RV32I-NEXT: addi a2, a2, -1
+; RV32I-NEXT: and a2, t0, a2
+; RV32I-NEXT: sub a5, a7, a5
+; RV32I-NEXT: srl a2, a2, a5
+; RV32I-NEXT: srli a5, t0, 31
 ; RV32I-NEXT: slli a1, a1, 1
 ; RV32I-NEXT: or a1, a1, a5
-; RV32I-NEXT: sll a1, a1, t0
+; RV32I-NEXT: sll a1, a1, t2
 ; RV32I-NEXT: or a1, a1, a2
 ; RV32I-NEXT: bgez a6, .LBB15_7
 ; RV32I-NEXT: .LBB15_6:
@@ -651,34 +651,34 @@ define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind {
 ; RV32B-NEXT: sub t0, a6, a5
 ; RV32B-NEXT: slli a2, a3, 1
 ; RV32B-NEXT: sll a2, a2, t0
-; RV32B-NEXT: or t0, a7, a2
-; RV32B-NEXT: addi a7, a5, -32
-; RV32B-NEXT: srl a5, a3, a7
-; RV32B-NEXT: slti a2, a7, 0
-; RV32B-NEXT: cmov t1, a2, t0, a5
-; RV32B-NEXT: not t0, a4
-; RV32B-NEXT: slli t4, a0, 1
-; RV32B-NEXT: sll t2, t4, t0
-; RV32B-NEXT: addi a2, zero, 63
-; RV32B-NEXT: andn a2, a2, a4
-; RV32B-NEXT: addi t3, a2, -32
-; RV32B-NEXT: srai a5, t3, 31
-; RV32B-NEXT: and a5, a5, t2
-; RV32B-NEXT: or t1, a5, t1
+; RV32B-NEXT: or a7, a7, a2
+; RV32B-NEXT: addi t2, a5, -32
+; RV32B-NEXT: srl t0, a3, t2
+; RV32B-NEXT: slti a2, t2, 0
+; RV32B-NEXT: cmov a7, a2, a7, t0
+; RV32B-NEXT: not t3, a4
+; RV32B-NEXT: slli t0, a0, 1
+; RV32B-NEXT: sll t1, t0, t3
+; RV32B-NEXT: addi a5, zero, 63
+; RV32B-NEXT: andn t4, a5, a4
+; RV32B-NEXT: addi a2, t4, -32
+; RV32B-NEXT: srai a5, a2, 31
+; RV32B-NEXT: and a5, a5, t1
+; RV32B-NEXT: or a7, a5, a7
 ; RV32B-NEXT: fsri a1, a0, a1, 31
-; RV32B-NEXT: sll a1, a1, t0
-; RV32B-NEXT: sub a2, a6, a2
+; RV32B-NEXT: sll a1, a1, t3
+; RV32B-NEXT: sub a5, a6, t4
 ; RV32B-NEXT: bclri a0, a0, 31
-; RV32B-NEXT: srl a0, a0, a2
+; RV32B-NEXT: srl a0, a0, a5
 ; RV32B-NEXT: or a0, a1, a0
-; RV32B-NEXT: sll a1, t4, t3
-; RV32B-NEXT: slti a2, t3, 0
+; RV32B-NEXT: sll a1, t0, a2
+; RV32B-NEXT: slti a2, a2, 0
 ; RV32B-NEXT: cmov a0, a2, a0, a1
 ; RV32B-NEXT: srl a1, a3, a4
-; RV32B-NEXT: srai a2, a7, 31
+; RV32B-NEXT: srai a2, t2, 31
 ; RV32B-NEXT: and a1, a2, a1
 ; RV32B-NEXT: or a1, a0, a1
-; RV32B-NEXT: mv a0, t1
+; RV32B-NEXT: mv a0, a7
 ; RV32B-NEXT: ret
 ;
 ; RV32ZBT-LABEL: fshr_i64:
@@ -689,35 +689,35 @@ define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind {
 ; RV32ZBT-NEXT: sub t0, a6, a5
 ; RV32ZBT-NEXT: slli a2, a3, 1
 ; RV32ZBT-NEXT: sll a2, a2, t0
-; RV32ZBT-NEXT: or t0, a7, a2
-; RV32ZBT-NEXT: addi a7, a5, -32
-; RV32ZBT-NEXT: srl a5, a3, a7
-; RV32ZBT-NEXT: slti a2, a7, 0
-; RV32ZBT-NEXT: cmov t1, a2, t0, a5
-; RV32ZBT-NEXT: not t0, a4
-; RV32ZBT-NEXT: slli t4, a0, 1
-; RV32ZBT-NEXT: sll t2, t4, t0
-; RV32ZBT-NEXT: andi a2, t0, 63
-; RV32ZBT-NEXT: addi t3, a2, -32
-; RV32ZBT-NEXT: srai a5, t3, 31
-; RV32ZBT-NEXT: and a5, a5, t2
-; RV32ZBT-NEXT: or t1, a5, t1
-; RV32ZBT-NEXT: lui a5, 524288
-; RV32ZBT-NEXT: addi a5, a5, -1
-; RV32ZBT-NEXT: and a5, a0, a5
-; RV32ZBT-NEXT: sub a2, a6, a2
-; RV32ZBT-NEXT: srl a2, a5, a2
+; RV32ZBT-NEXT: or a7, a7, a2
+; RV32ZBT-NEXT: addi t2, a5, -32
+; RV32ZBT-NEXT: srl t0, a3, t2
+; RV32ZBT-NEXT: slti a2, t2, 0
+; RV32ZBT-NEXT: cmov a7, a2, a7, t0
+; RV32ZBT-NEXT: not t4, a4
+; RV32ZBT-NEXT: slli t0, a0, 1
+; RV32ZBT-NEXT: sll t1, t0, t4
+; RV32ZBT-NEXT: andi t3, t4, 63
+; RV32ZBT-NEXT: addi a5, t3, -32
+; RV32ZBT-NEXT: srai a2, a5, 31
+; RV32ZBT-NEXT: and a2, a2, t1
+; RV32ZBT-NEXT: or a7, a2, a7
+; RV32ZBT-NEXT: lui a2, 524288
+; RV32ZBT-NEXT: addi a2, a2, -1
+; RV32ZBT-NEXT: and t1, a0, a2
+; RV32ZBT-NEXT: sub a2, a6, t3
+; RV32ZBT-NEXT: srl a2, t1, a2
 ; RV32ZBT-NEXT: fsri a0, a0, a1, 31
-; RV32ZBT-NEXT: sll a0, a0, t0
+; RV32ZBT-NEXT: sll a0, a0, t4
 ; RV32ZBT-NEXT: or a0, a0, a2
-; RV32ZBT-NEXT: sll a1, t4, t3
-; RV32ZBT-NEXT: slti a2, t3, 0
+; RV32ZBT-NEXT: sll a1, t0, a5
+; RV32ZBT-NEXT: slti a2, a5, 0
 ; RV32ZBT-NEXT: cmov a0, a2, a0, a1
 ; RV32ZBT-NEXT: srl a1, a3, a4
-; RV32ZBT-NEXT: srai a2, a7, 31
+; RV32ZBT-NEXT: srai a2, t2, 31
 ; RV32ZBT-NEXT: and a1, a2, a1
 ; RV32ZBT-NEXT: or a1, a0, a1
-; RV32ZBT-NEXT: mv a0, t1
+; RV32ZBT-NEXT: mv a0, a7
 ; RV32ZBT-NEXT: ret
 %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c)
 ret i64 %1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index bc51bf8..c857165 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -892,8 +892,8 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV32-LABEL: bitreverse_v8i32:
 ; LMULMAX1-RV32: # %bb.0:
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: addi a6, a0, 16
-; LMULMAX1-RV32-NEXT: vle32.v v25, (a6)
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vle32.v v25, (a1)
 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a0)
 ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 8
 ; LMULMAX1-RV32-NEXT: lui a2, 16
@@ -902,8 +902,8 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT: vsrl.vi v28, v25, 24
 ; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28
 ; LMULMAX1-RV32-NEXT: vsll.vi v28, v25, 8
-; LMULMAX1-RV32-NEXT: lui a3, 4080
-; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a3
+; LMULMAX1-RV32-NEXT: lui a6, 4080
+; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a6
 ; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 24
 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28
 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
@@ -922,10 +922,10 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2
 ; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25
 ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 1
-; LMULMAX1-RV32-NEXT: lui a1, 349525
-; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a1
-; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1
+; LMULMAX1-RV32-NEXT: lui a3, 349525
+; LMULMAX1-RV32-NEXT: addi a3, a3, 1365
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a3
+; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a3
 ; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25
 ; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25
 ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 8
@@ -933,7 +933,7 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT: vsrl.vi v28, v26, 24
 ; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28
 ; LMULMAX1-RV32-NEXT: vsll.vi v28, v26, 8
-; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a3
+; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a6
 ; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 24
 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v28
 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
@@ -948,19 +948,19 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2
 ; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26
 ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 1
-; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a1
-; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a3
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a3
 ; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26
 ; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26
 ; LMULMAX1-RV32-NEXT: vse32.v v26, (a0)
-; LMULMAX1-RV32-NEXT: vse32.v v25, (a6)
+; LMULMAX1-RV32-NEXT: vse32.v v25, (a1)
 ; LMULMAX1-RV32-NEXT: ret
 ;
 ; LMULMAX1-RV64-LABEL: bitreverse_v8i32:
 ; LMULMAX1-RV64: # %bb.0:
 ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV64-NEXT: addi a6, a0, 16
-; LMULMAX1-RV64-NEXT: vle32.v v25, (a6)
+; LMULMAX1-RV64-NEXT: addi a1, a0, 16
+; LMULMAX1-RV64-NEXT: vle32.v v25, (a1)
 ; LMULMAX1-RV64-NEXT: vle32.v v26, (a0)
 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 8
 ; LMULMAX1-RV64-NEXT: lui a2, 16
@@ -969,8 +969,8 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24
 ; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28
 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8
-; LMULMAX1-RV64-NEXT: lui a3, 4080
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a3
+; LMULMAX1-RV64-NEXT: lui a6, 4080
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6
 ; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 24
 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28
 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
@@ -989,10 +989,10 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2
 ; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 1
-; LMULMAX1-RV64-NEXT: lui a1, 349525
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a1
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
+; LMULMAX1-RV64-NEXT: lui a3, 349525
+; LMULMAX1-RV64-NEXT: addiw a3, a3, 1365
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a3
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3
 ; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25
 ; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 8
@@ -1000,7 +1000,7 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24
 ; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28
 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a3
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6
 ; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 24
 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28
 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
@@ -1015,12 +1015,12 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2
 ; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 1
-; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a1
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a3
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3
 ; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26
 ; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
 ; LMULMAX1-RV64-NEXT: vse32.v v26, (a0)
-; LMULMAX1-RV64-NEXT: vse32.v v25, (a6)
+; LMULMAX1-RV64-NEXT: vse32.v v25, (a1)
 ; LMULMAX1-RV64-NEXT: ret
 %a = load <8 x i32>, <8 x i32>* %x
 %b = load <8 x i32>, <8 x i32>* %y
@@ -1195,8 +1195,8 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-LABEL: bitreverse_v4i64:
 ; LMULMAX1-RV32: # %bb.0:
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: addi a6, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v29, (a6)
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vle64.v v29, (a1)
 ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT: addi a2, zero, 56
 ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v29, a2
@@ -1207,23 +1207,23 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4
 ; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v26
 ; LMULMAX1-RV32-NEXT: vsrl.vi v26, v29, 24
-; LMULMAX1-RV32-NEXT: lui a5, 4080
-; LMULMAX1-RV32-NEXT: vand.vx v28, v26, a5
-; LMULMAX1-RV32-NEXT: addi a1, zero, 5
+; LMULMAX1-RV32-NEXT: lui a6, 4080
+; LMULMAX1-RV32-NEXT: vand.vx v28, v26, a6
+; LMULMAX1-RV32-NEXT: addi a5, zero, 5
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.s.x v0, a1
+; LMULMAX1-RV32-NEXT: vmv.s.x v0, a5
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-RV32-NEXT: vmv.v.i v26, 0
-; LMULMAX1-RV32-NEXT: lui a1, 1044480
-; LMULMAX1-RV32-NEXT: vmerge.vxm v26, v26, a1, v0
+; LMULMAX1-RV32-NEXT: lui a5, 1044480
+; LMULMAX1-RV32-NEXT: vmerge.vxm v26, v26, a5, v0
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
 ; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 8
 ; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v26
 ; LMULMAX1-RV32-NEXT: vor.vv v28, v30, v28
 ; LMULMAX1-RV32-NEXT: vor.vv v30, v28, v27
-; LMULMAX1-RV32-NEXT: addi a1, zero, 255
+; LMULMAX1-RV32-NEXT: addi a5, zero, 255
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v27, a5
 ; LMULMAX1-RV32-NEXT: vmerge.vim v27, v27, 0, v0
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
 ; LMULMAX1-RV32-NEXT: vsll.vi v28, v29, 8
@@ -1237,7 +1237,7 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT: vor.vv v31, v8, v31
 ; LMULMAX1-RV32-NEXT: vsll.vx v8, v29, a3
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a5
+; LMULMAX1-RV32-NEXT: vmv.v.x v9, a6
 ; LMULMAX1-RV32-NEXT: vmerge.vim v9, v9, 0, v0
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
 ; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
@@ -1246,30 +1246,30 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v31
 ; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v30
 ; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 4
-; LMULMAX1-RV32-NEXT: lui a1, 61681
-; LMULMAX1-RV32-NEXT: addi a1, a1, -241
+; LMULMAX1-RV32-NEXT: lui a5, 61681
+; LMULMAX1-RV32-NEXT: addi a5, a5, -241
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v31, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v31, a5
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
 ; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v31
 ; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v31
 ; LMULMAX1-RV32-NEXT: vsll.vi v29, v29, 4
 ; LMULMAX1-RV32-NEXT: vor.vv v29, v30, v29
 ; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 2
-; LMULMAX1-RV32-NEXT: lui a1, 209715
-; LMULMAX1-RV32-NEXT: addi a1, a1, 819
+; LMULMAX1-RV32-NEXT: lui a5, 209715
+; LMULMAX1-RV32-NEXT: addi a5, a5, 819
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v8, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v8, a5
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
 ; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v8
 ; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v8
 ; LMULMAX1-RV32-NEXT: vsll.vi v29, v29, 2
 ; LMULMAX1-RV32-NEXT: vor.vv v29, v30, v29
 ; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 1
-; LMULMAX1-RV32-NEXT: lui a1, 349525
-; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX1-RV32-NEXT: lui a5, 349525
+; LMULMAX1-RV32-NEXT: addi a5, a5, 1365
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v10, a5
 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
 ; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v10
 ; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v10
@@ -1280,7 +1280,7 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a4
 ; LMULMAX1-RV32-NEXT: vor.vv v30, v11, v30
 ; LMULMAX1-RV32-NEXT: vsrl.vi v11, v25, 24
-; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a5
+; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a6
 ; LMULMAX1-RV32-NEXT: vsrl.vi v12, v25, 8
 ; LMULMAX1-RV32-NEXT: vand.vv v26, v12, v26
 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v11
@@ -1312,14 +1312,14 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25
 ; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
 ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v29, (a6)
+; LMULMAX1-RV32-NEXT: vse64.v v29, (a1)
 ; LMULMAX1-RV32-NEXT: ret
 ;
 ; LMULMAX1-RV64-LABEL: bitreverse_v4i64:
 ; LMULMAX1-RV64: # %bb.0:
 ; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV64-NEXT: addi a6, a0, 16
-; LMULMAX1-RV64-NEXT: vle64.v v26, (a6)
+; LMULMAX1-RV64-NEXT: addi a7, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v26, (a7)
 ; LMULMAX1-RV64-NEXT: vle64.v v25, (a0)
 ; LMULMAX1-RV64-NEXT: addi t0, zero, 56
 ; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, t0
@@ -1330,17 +1330,17 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2
 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24
-; LMULMAX1-RV64-NEXT: lui a7, 4080
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT: lui a6, 4080
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6
 ; LMULMAX1-RV64-NEXT: vsrl.vi v29, v26, 8
 ; LMULMAX1-RV64-NEXT: addi a3, zero, 255
-; LMULMAX1-RV64-NEXT: slli t4, a3, 24
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t4
+; LMULMAX1-RV64-NEXT: slli t3, a3, 24
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t3
 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8
-; LMULMAX1-RV64-NEXT: slli a5, a3, 32
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5
+; LMULMAX1-RV64-NEXT: slli t4, a3, 32
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4
 ; LMULMAX1-RV64-NEXT: vsll.vi v29, v26, 24
 ; LMULMAX1-RV64-NEXT: slli a2, a3, 40
 ; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2
@@ -1360,22 +1360,22 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12
 ; LMULMAX1-RV64-NEXT: addi a4, a4, 241
 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi t3, a4, -241
-; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t3
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t3
+; LMULMAX1-RV64-NEXT: addi a4, a4, -241
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4
 ; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4
 ; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 2
-; LMULMAX1-RV64-NEXT: lui a4, 13107
-; LMULMAX1-RV64-NEXT: addiw a4, a4, 819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 819
-; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4
+; LMULMAX1-RV64-NEXT: lui a5, 13107
+; LMULMAX1-RV64-NEXT: addiw a5, a5, 819
+; LMULMAX1-RV64-NEXT: slli a5, a5, 12
+; LMULMAX1-RV64-NEXT: addi a5, a5, 819
+; LMULMAX1-RV64-NEXT: slli a5, a5, 12
+; LMULMAX1-RV64-NEXT: addi a5, a5, 819
+; LMULMAX1-RV64-NEXT: slli a5, a5, 12
+; LMULMAX1-RV64-NEXT: addi a5, a5, 819
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5
 ; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2
 ; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 1
@@ -1396,13 +1396,13 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2
 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6
 ; LMULMAX1-RV64-NEXT: vsrl.vi v29, v25, 8
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t4
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t3
 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4
 ; LMULMAX1-RV64-NEXT: vsll.vi v29, v25, 24
 ; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2
 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
@@ -1413,13 +1413,13 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28
 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 4
-; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t3
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t3
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4
 ; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4
 ; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 2
-; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5
 ; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2
 ; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 1
@@ -1428,7 +1428,7 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25
 ; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0)
-; LMULMAX1-RV64-NEXT: vse64.v v26, (a6)
+; LMULMAX1-RV64-NEXT: vse64.v v26, (a7)
 ; LMULMAX1-RV64-NEXT: ret
 %a = load <4 x i64>, <4 x i64>* %x
 %b = load <4 x i64>, <4 x i64>* %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
index b1535a8..262a02e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -562,13 +562,13 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT: srli a2, a1, 8
 ; LMULMAX2-RV32-NEXT: lui a3, 16
-; LMULMAX2-RV32-NEXT: addi a6, a3, -256
-; LMULMAX2-RV32-NEXT: and a2, a2, a6
+; LMULMAX2-RV32-NEXT: addi a3, a3, -256
+; LMULMAX2-RV32-NEXT: and a2, a2, a3
 ; LMULMAX2-RV32-NEXT: srli a4, a1, 24
 ; LMULMAX2-RV32-NEXT: or a2, a2, a4
 ; LMULMAX2-RV32-NEXT: slli a4, a1, 8
-; LMULMAX2-RV32-NEXT: lui a5, 4080
-; LMULMAX2-RV32-NEXT: and a4, a4, a5
+; LMULMAX2-RV32-NEXT: lui a6, 4080
+; LMULMAX2-RV32-NEXT: and a4, a4, a6
 ; LMULMAX2-RV32-NEXT: slli a1, a1, 24
 ; LMULMAX2-RV32-NEXT: or a1, a1, a4
 ; LMULMAX2-RV32-NEXT: or a1, a1, a2
@@ -577,11 +577,11 @@
define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: and a2, a2, a6 +; LMULMAX2-RV32-NEXT: and a2, a2, a3 ; LMULMAX2-RV32-NEXT: srli a4, a1, 24 ; LMULMAX2-RV32-NEXT: or a2, a2, a4 ; LMULMAX2-RV32-NEXT: slli a4, a1, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a5 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a4 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 @@ -590,23 +590,23 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vsrl.vx v25, v25, a1 ; LMULMAX2-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX2-RV32-NEXT: srli a4, a2, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a6 -; LMULMAX2-RV32-NEXT: srli a3, a2, 24 -; LMULMAX2-RV32-NEXT: or a3, a4, a3 -; LMULMAX2-RV32-NEXT: slli a4, a2, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a5 +; LMULMAX2-RV32-NEXT: and a4, a4, a3 +; LMULMAX2-RV32-NEXT: srli a5, a2, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a2, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 ; LMULMAX2-RV32-NEXT: slli a2, a2, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a5 ; LMULMAX2-RV32-NEXT: or a2, a2, a4 -; LMULMAX2-RV32-NEXT: or a2, a2, a3 ; LMULMAX2-RV32-NEXT: sw a2, 16(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v25, v26, a1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: and a2, a2, a6 +; LMULMAX2-RV32-NEXT: and a2, a2, a3 ; LMULMAX2-RV32-NEXT: srli a3, a1, 24 ; LMULMAX2-RV32-NEXT: or a2, a2, a3 ; LMULMAX2-RV32-NEXT: slli a3, a1, 8 -; LMULMAX2-RV32-NEXT: and a3, a3, a5 +; LMULMAX2-RV32-NEXT: and a3, a3, a6 ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a3 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 @@ -693,13 +693,13 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 8 ; LMULMAX1-RV32-NEXT: lui a3, 16 -; LMULMAX1-RV32-NEXT: addi a6, a3, -256 -; LMULMAX1-RV32-NEXT: and a2, a2, a6 +; LMULMAX1-RV32-NEXT: addi a3, a3, -256 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 ; LMULMAX1-RV32-NEXT: srli a4, a1, 24 ; LMULMAX1-RV32-NEXT: or a2, a2, a4 ; LMULMAX1-RV32-NEXT: slli a4, a1, 8 -; LMULMAX1-RV32-NEXT: lui a5, 4080 -; LMULMAX1-RV32-NEXT: and a4, a4, a5 +; LMULMAX1-RV32-NEXT: lui a6, 4080 +; LMULMAX1-RV32-NEXT: and a4, a4, a6 ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 @@ -708,11 +708,11 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: and a2, a2, a6 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 ; LMULMAX1-RV32-NEXT: srli a4, a1, 24 ; LMULMAX1-RV32-NEXT: or a2, a2, a4 ; LMULMAX1-RV32-NEXT: slli a4, a1, 8 -; LMULMAX1-RV32-NEXT: and a4, a4, a5 +; LMULMAX1-RV32-NEXT: and a4, a4, a6 ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 @@ -721,23 +721,23 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV32-NEXT: srli a4, a2, 8 -; LMULMAX1-RV32-NEXT: and a4, a4, a6 -; LMULMAX1-RV32-NEXT: srli a3, a2, 24 -; LMULMAX1-RV32-NEXT: or a3, a4, a3 -; LMULMAX1-RV32-NEXT: slli a4, a2, 8 -; LMULMAX1-RV32-NEXT: and a4, a4, a5 +; LMULMAX1-RV32-NEXT: and a4, a4, a3 +; LMULMAX1-RV32-NEXT: srli a5, a2, 
24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a2, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a6 ; LMULMAX1-RV32-NEXT: slli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a5 ; LMULMAX1-RV32-NEXT: or a2, a2, a4 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 ; LMULMAX1-RV32-NEXT: sw a2, 16(sp) ; LMULMAX1-RV32-NEXT: vsrl.vx v25, v26, a1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: and a2, a2, a6 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 ; LMULMAX1-RV32-NEXT: srli a3, a1, 24 ; LMULMAX1-RV32-NEXT: or a2, a2, a3 ; LMULMAX1-RV32-NEXT: slli a3, a1, 8 -; LMULMAX1-RV32-NEXT: and a3, a3, a5 +; LMULMAX1-RV32-NEXT: and a3, a3, a6 ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a3 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 @@ -1859,13 +1859,13 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v26 ; LMULMAX2-RV32-NEXT: srli a2, a3, 8 ; LMULMAX2-RV32-NEXT: lui a1, 16 -; LMULMAX2-RV32-NEXT: addi a6, a1, -256 -; LMULMAX2-RV32-NEXT: and a2, a2, a6 +; LMULMAX2-RV32-NEXT: addi a1, a1, -256 +; LMULMAX2-RV32-NEXT: and a2, a2, a1 ; LMULMAX2-RV32-NEXT: srli a4, a3, 24 ; LMULMAX2-RV32-NEXT: or a4, a2, a4 ; LMULMAX2-RV32-NEXT: slli a5, a3, 8 -; LMULMAX2-RV32-NEXT: lui a2, 4080 -; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: lui a6, 4080 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 @@ -1874,11 +1874,11 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 ; LMULMAX2-RV32-NEXT: srli a5, a3, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 ; LMULMAX2-RV32-NEXT: slli a5, a3, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 @@ -1886,11 +1886,11 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vslidedown.vi v30, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v30 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 ; LMULMAX2-RV32-NEXT: srli a5, a3, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 ; LMULMAX2-RV32-NEXT: slli a5, a3, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 @@ -1898,11 +1898,11 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vslidedown.vi v8, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v8 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 ; LMULMAX2-RV32-NEXT: srli a5, a3, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 ; LMULMAX2-RV32-NEXT: slli a5, a3, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 @@ -1911,50 +1911,50 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v26, a3 ; LMULMAX2-RV32-NEXT: vmv.x.s a4, v26 ; LMULMAX2-RV32-NEXT: srli a5, a4, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a6 -; LMULMAX2-RV32-NEXT: srli a1, a4, 24 -; LMULMAX2-RV32-NEXT: or a1, a5, a1 +; 
LMULMAX2-RV32-NEXT: and a5, a5, a1 +; LMULMAX2-RV32-NEXT: srli a2, a4, 24 +; LMULMAX2-RV32-NEXT: or a2, a5, a2 ; LMULMAX2-RV32-NEXT: slli a5, a4, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 ; LMULMAX2-RV32-NEXT: slli a4, a4, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 -; LMULMAX2-RV32-NEXT: or a1, a4, a1 -; LMULMAX2-RV32-NEXT: sw a1, 32(sp) +; LMULMAX2-RV32-NEXT: or a2, a4, a2 +; LMULMAX2-RV32-NEXT: sw a2, 32(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v28, a3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: srli a4, a1, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a6 -; LMULMAX2-RV32-NEXT: srli a5, a1, 24 +; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV32-NEXT: srli a4, a2, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a2, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 -; LMULMAX2-RV32-NEXT: slli a5, a1, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a2 -; LMULMAX2-RV32-NEXT: slli a1, a1, 24 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: or a1, a1, a4 -; LMULMAX2-RV32-NEXT: sw a1, 56(sp) +; LMULMAX2-RV32-NEXT: slli a5, a2, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a2, a2, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a5 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: sw a2, 56(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v30, a3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: srli a4, a1, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a6 -; LMULMAX2-RV32-NEXT: srli a5, a1, 24 +; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV32-NEXT: srli a4, a2, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a2, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 -; LMULMAX2-RV32-NEXT: slli a5, a1, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a2 -; LMULMAX2-RV32-NEXT: slli a1, a1, 24 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: or a1, a1, a4 -; LMULMAX2-RV32-NEXT: sw a1, 48(sp) +; LMULMAX2-RV32-NEXT: slli a5, a2, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a2, a2, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a5 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: sw a2, 48(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v8, a3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: srli a3, a1, 8 -; LMULMAX2-RV32-NEXT: and a3, a3, a6 -; LMULMAX2-RV32-NEXT: srli a4, a1, 24 -; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: slli a4, a1, 8 -; LMULMAX2-RV32-NEXT: and a2, a4, a2 -; LMULMAX2-RV32-NEXT: slli a1, a1, 24 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV32-NEXT: srli a3, a2, 8 +; LMULMAX2-RV32-NEXT: and a1, a3, a1 +; LMULMAX2-RV32-NEXT: srli a3, a2, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a3 +; LMULMAX2-RV32-NEXT: slli a3, a2, 8 +; LMULMAX2-RV32-NEXT: and a3, a3, a6 +; LMULMAX2-RV32-NEXT: slli a2, a2, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a3 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 ; LMULMAX2-RV32-NEXT: sw a1, 40(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: addi a1, sp, 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll index 4fcf361..f063bf0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -2252,8 +2252,8 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX2-RV64-NEXT: lui a1, 16 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -1 -; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; 
LMULMAX2-RV64-NEXT: addiw a6, a1, -1 +; LMULMAX2-RV64-NEXT: and a2, a2, a6 ; LMULMAX2-RV64-NEXT: srli a3, a2, 1 ; LMULMAX2-RV64-NEXT: or a2, a2, a3 ; LMULMAX2-RV64-NEXT: srli a3, a2, 2 @@ -2275,8 +2275,8 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 ; LMULMAX2-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 -; LMULMAX2-RV64-NEXT: addi a6, a2, 1365 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: addi a7, a2, 1365 +; LMULMAX2-RV64-NEXT: and a4, a4, a7 ; LMULMAX2-RV64-NEXT: sub a4, a3, a4 ; LMULMAX2-RV64-NEXT: lui a3, 13107 ; LMULMAX2-RV64-NEXT: addiw a3, a3, 819 @@ -2299,202 +2299,202 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 ; LMULMAX2-RV64-NEXT: addi a4, a4, 241 ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 -; LMULMAX2-RV64-NEXT: addi a7, a4, -241 -; LMULMAX2-RV64-NEXT: and a2, a5, a7 +; LMULMAX2-RV64-NEXT: addi a4, a4, -241 +; LMULMAX2-RV64-NEXT: and a1, a5, a4 ; LMULMAX2-RV64-NEXT: lui a5, 4112 ; LMULMAX2-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX2-RV64-NEXT: sh a2, 16(sp) +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX2-RV64-NEXT: sh a1, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: and a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 2 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 8 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 16 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: and a1, a1, a6 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 2 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 16 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 32 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX2-RV64-NEXT: sh a2, 30(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; 
LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX2-RV64-NEXT: sh a1, 30(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: and a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 2 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 8 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 16 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: and a1, a1, a6 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 2 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 16 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 32 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX2-RV64-NEXT: sh a2, 28(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX2-RV64-NEXT: sh a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: and a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 2 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 8 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 16 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: and a1, a1, a6 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; 
LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 2 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 16 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 32 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX2-RV64-NEXT: sh a2, 26(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX2-RV64-NEXT: sh a1, 26(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: and a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 2 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 8 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 16 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: and a1, a1, a6 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 2 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 16 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 32 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX2-RV64-NEXT: sh a2, 24(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX2-RV64-NEXT: sh a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: and a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; 
LMULMAX2-RV64-NEXT: srli a4, a2, 2 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 8 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 16 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: and a1, a1, a6 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 2 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 16 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 32 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX2-RV64-NEXT: sh a2, 22(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX2-RV64-NEXT: sh a1, 22(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: and a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 2 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 8 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 16 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: and a1, a1, a6 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 2 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 16 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 32 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 
1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX2-RV64-NEXT: sh a2, 20(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX2-RV64-NEXT: sh a1, 20(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX2-RV64-NEXT: and a1, a2, a1 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: and a1, a1, a6 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 ; LMULMAX2-RV64-NEXT: srli a2, a1, 2 @@ -2509,7 +2509,7 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: or a1, a1, a2 ; LMULMAX2-RV64-NEXT: not a1, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a6 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 ; LMULMAX2-RV64-NEXT: sub a1, a1, a2 ; LMULMAX2-RV64-NEXT: and a2, a1, a3 ; LMULMAX2-RV64-NEXT: srli a1, a1, 2 @@ -2517,7 +2517,7 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: add a1, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 4 ; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a7 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addiw a1, a1, -48 @@ -2784,8 +2784,8 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: lui a1, 16 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -1 -; LMULMAX1-RV64-NEXT: and a2, a2, a1 +; LMULMAX1-RV64-NEXT: addiw a6, a1, -1 +; LMULMAX1-RV64-NEXT: and a2, a2, a6 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: or a2, a2, a3 ; LMULMAX1-RV64-NEXT: srli a3, a2, 2 @@ -2807,8 +2807,8 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: slli a2, a2, 12 ; LMULMAX1-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a6, a2, 1365 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: addi a7, a2, 1365 +; LMULMAX1-RV64-NEXT: and a4, a4, a7 ; LMULMAX1-RV64-NEXT: sub a4, a3, a4 ; LMULMAX1-RV64-NEXT: lui a3, 13107 ; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 @@ -2831,202 +2831,22 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 ; LMULMAX1-RV64-NEXT: addi a4, a4, 241 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a7, a4, -241 -; LMULMAX1-RV64-NEXT: and a2, a5, a7 +; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: and a1, a5, a4 ; LMULMAX1-RV64-NEXT: lui a5, 4112 ; LMULMAX1-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX1-RV64-NEXT: slli a5, a5, 16 ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 ; LMULMAX1-RV64-NEXT: slli a5, a5, 16 ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 16(sp) +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX1-RV64-NEXT: sh a1, 16(sp) ; LMULMAX1-RV64-NEXT: 
vsetivli zero, 1, e16, m1, ta, mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 30(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 28(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 26(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-RV64-NEXT: vmv.x.s 
a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 24(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 22(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 20(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV64-NEXT: and a1, a2, a1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: and 
a1, a1, a6 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 ; LMULMAX1-RV64-NEXT: srli a2, a1, 2 @@ -3041,7 +2861,7 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: or a1, a1, a2 ; LMULMAX1-RV64-NEXT: not a1, a1 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: and a2, a2, a6 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 ; LMULMAX1-RV64-NEXT: sub a1, a1, a2 ; LMULMAX1-RV64-NEXT: and a2, a1, a3 ; LMULMAX1-RV64-NEXT: srli a1, a1, 2 @@ -3049,26 +2869,206 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: add a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a1, 4 ; LMULMAX1-RV64-NEXT: add a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a1, a1, a7 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addiw a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 18(sp) -; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi a1, sp, 16 -; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) -; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 -; LMULMAX1-RV64-NEXT: ret - %a = load <8 x i16>, <8 x i16>* %x - %b = load <8 x i16>, <8 x i16>* %y - %c = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) - store <8 x i16> %c, <8 x i16>* %x - ret void -} -declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) - -define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; LMULMAX1-RV64-NEXT: sh a1, 30(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: and a1, a1, a6 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 2 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 16 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 32 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX1-RV64-NEXT: sh a1, 28(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: and a1, a1, a6 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 2 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 16 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 32 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 
+; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX1-RV64-NEXT: sh a1, 26(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: and a1, a1, a6 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 2 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 16 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 32 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX1-RV64-NEXT: sh a1, 24(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: and a1, a1, a6 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 2 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 16 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 32 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX1-RV64-NEXT: sh a1, 22(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: and a1, a1, a6 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 2 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 16 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 32 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 
+; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX1-RV64-NEXT: sh a1, 20(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: and a1, a1, a6 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 2 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 16 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 32 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -48 +; LMULMAX1-RV64-NEXT: sh a1, 18(sp) +; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) + store <8 x i16> %c, <8 x i16>* %x + ret void +} +declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) + +define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV32-LABEL: ctlz_v4i32: ; LMULMAX2-RV32: # %bb.0: ; LMULMAX2-RV32-NEXT: addi sp, sp, -32 @@ -3669,126 +3669,126 @@ define void @ctlz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX2-RV32-NEXT: sw zero, 28(sp) ; LMULMAX2-RV32-NEXT: sw zero, 20(sp) -; LMULMAX2-RV32-NEXT: addi a5, zero, 32 +; LMULMAX2-RV32-NEXT: addi a6, zero, 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a5 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: lui a2, 349525 -; LMULMAX2-RV32-NEXT: addi a4, a2, 1365 -; LMULMAX2-RV32-NEXT: lui a2, 209715 -; LMULMAX2-RV32-NEXT: addi a3, a2, 819 -; LMULMAX2-RV32-NEXT: lui a2, 61681 -; LMULMAX2-RV32-NEXT: addi a6, a2, -241 -; LMULMAX2-RV32-NEXT: lui a2, 4112 -; LMULMAX2-RV32-NEXT: addi a7, a2, 257 -; LMULMAX2-RV32-NEXT: bnez a1, .LBB3_2 +; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a6 +; LMULMAX2-RV32-NEXT: vmv.x.s a5, v26 +; LMULMAX2-RV32-NEXT: lui a1, 349525 +; LMULMAX2-RV32-NEXT: addi a4, a1, 1365 +; LMULMAX2-RV32-NEXT: lui a1, 209715 +; LMULMAX2-RV32-NEXT: addi a3, a1, 819 +; LMULMAX2-RV32-NEXT: lui a1, 61681 +; LMULMAX2-RV32-NEXT: addi a7, a1, -241 +; LMULMAX2-RV32-NEXT: lui a1, 4112 +; LMULMAX2-RV32-NEXT: addi a2, a1, 257 +; LMULMAX2-RV32-NEXT: bnez a5, .LBB3_2 ; LMULMAX2-RV32-NEXT: # %bb.1: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 
+; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a5, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a5, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: add a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: addi a1, a1, 32 +; LMULMAX2-RV32-NEXT: addi a5, a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB3_3 ; LMULMAX2-RV32-NEXT: .LBB3_2: -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a1, a5, 1 +; LMULMAX2-RV32-NEXT: or a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a5, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a5, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 -; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: add a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB3_3: ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a5 -; LMULMAX2-RV32-NEXT: vmv.x.s a5, v26 -; LMULMAX2-RV32-NEXT: sw a1, 16(sp) -; LMULMAX2-RV32-NEXT: bnez a5, .LBB3_5 +; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a6 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: sw a5, 16(sp) +; LMULMAX2-RV32-NEXT: bnez a1, .LBB3_5 ; LMULMAX2-RV32-NEXT: # %bb.4: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; 
LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a4, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a4 +; LMULMAX2-RV32-NEXT: and a4, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: add a1, a4, a1 +; LMULMAX2-RV32-NEXT: srli a3, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a3 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB3_6 ; LMULMAX2-RV32-NEXT: .LBB3_5: -; LMULMAX2-RV32-NEXT: srli a1, a5, 1 -; LMULMAX2-RV32-NEXT: or a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a4, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a4 +; LMULMAX2-RV32-NEXT: and a4, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: add a1, a4, a1 +; LMULMAX2-RV32-NEXT: srli a3, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a3 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB3_6: ; LMULMAX2-RV32-NEXT: sw a1, 24(sp) @@ -3903,126 +3903,126 @@ define void @ctlz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV32-NEXT: sw zero, 28(sp) ; LMULMAX1-RV32-NEXT: sw zero, 20(sp) -; LMULMAX1-RV32-NEXT: addi a5, zero, 32 +; LMULMAX1-RV32-NEXT: addi a6, zero, 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; 
-; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
-; LMULMAX1-RV32-NEXT: lui a2, 349525
-; LMULMAX1-RV32-NEXT: addi a4, a2, 1365
-; LMULMAX1-RV32-NEXT: lui a2, 209715
-; LMULMAX1-RV32-NEXT: addi a3, a2, 819
-; LMULMAX1-RV32-NEXT: lui a2, 61681
-; LMULMAX1-RV32-NEXT: addi a6, a2, -241
-; LMULMAX1-RV32-NEXT: lui a2, 4112
-; LMULMAX1-RV32-NEXT: addi a7, a2, 257
-; LMULMAX1-RV32-NEXT: bnez a1, .LBB3_2
+; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6
+; LMULMAX1-RV32-NEXT: vmv.x.s a5, v26
+; LMULMAX1-RV32-NEXT: lui a1, 349525
+; LMULMAX1-RV32-NEXT: addi a4, a1, 1365
+; LMULMAX1-RV32-NEXT: lui a1, 209715
+; LMULMAX1-RV32-NEXT: addi a3, a1, 819
+; LMULMAX1-RV32-NEXT: lui a1, 61681
+; LMULMAX1-RV32-NEXT: addi a7, a1, -241
+; LMULMAX1-RV32-NEXT: lui a1, 4112
+; LMULMAX1-RV32-NEXT: addi a2, a1, 257
+; LMULMAX1-RV32-NEXT: bnez a5, .LBB3_2
 ; LMULMAX1-RV32-NEXT: # %bb.1:
 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 2
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 16
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
 ; LMULMAX1-RV32-NEXT: not a1, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: sub a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a2, a1, a3
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: and a5, a5, a4
+; LMULMAX1-RV32-NEXT: sub a1, a1, a5
+; LMULMAX1-RV32-NEXT: and a5, a1, a3
 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2
 ; LMULMAX1-RV32-NEXT: and a1, a1, a3
-; LMULMAX1-RV32-NEXT: add a1, a2, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a6
-; LMULMAX1-RV32-NEXT: mul a1, a1, a7
+; LMULMAX1-RV32-NEXT: add a1, a5, a1
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a5
+; LMULMAX1-RV32-NEXT: and a1, a1, a7
+; LMULMAX1-RV32-NEXT: mul a1, a1, a2
 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24
-; LMULMAX1-RV32-NEXT: addi a1, a1, 32
+; LMULMAX1-RV32-NEXT: addi a5, a1, 32
 ; LMULMAX1-RV32-NEXT: j .LBB3_3
 ; LMULMAX1-RV32-NEXT: .LBB3_2:
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 2
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 16
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a1, a5, 1
+; LMULMAX1-RV32-NEXT: or a1, a5, a1
+; LMULMAX1-RV32-NEXT: srli a5, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
 ; LMULMAX1-RV32-NEXT: not a1, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: sub a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a2, a1, a3
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: and a5, a5, a4
+; LMULMAX1-RV32-NEXT: sub a1, a1, a5
+; LMULMAX1-RV32-NEXT: and a5, a1, a3
 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2
 ; LMULMAX1-RV32-NEXT: and a1, a1, a3
-; LMULMAX1-RV32-NEXT: add a1, a2, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a6
-; LMULMAX1-RV32-NEXT: mul a1, a1, a7
-; LMULMAX1-RV32-NEXT: srli a1, a1, 24
+; LMULMAX1-RV32-NEXT: add a1, a5, a1
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a5
+; LMULMAX1-RV32-NEXT: and a1, a1, a7
+; LMULMAX1-RV32-NEXT: mul a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a5, a1, 24
 ; LMULMAX1-RV32-NEXT: .LBB3_3:
 ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a5
-; LMULMAX1-RV32-NEXT: vmv.x.s a5, v26
-; LMULMAX1-RV32-NEXT: sw a1, 16(sp)
-; LMULMAX1-RV32-NEXT: bnez a5, .LBB3_5
+; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6
+; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26
+; LMULMAX1-RV32-NEXT: sw a5, 16(sp)
+; LMULMAX1-RV32-NEXT: bnez a1, .LBB3_5
 ; LMULMAX1-RV32-NEXT: # %bb.4:
 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 2
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 16
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
 ; LMULMAX1-RV32-NEXT: not a1, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: sub a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a2, a1, a3
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: and a4, a5, a4
+; LMULMAX1-RV32-NEXT: sub a1, a1, a4
+; LMULMAX1-RV32-NEXT: and a4, a1, a3
 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2
 ; LMULMAX1-RV32-NEXT: and a1, a1, a3
-; LMULMAX1-RV32-NEXT: add a1, a2, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a6
-; LMULMAX1-RV32-NEXT: mul a1, a1, a7
+; LMULMAX1-RV32-NEXT: add a1, a4, a1
+; LMULMAX1-RV32-NEXT: srli a3, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a3
+; LMULMAX1-RV32-NEXT: and a1, a1, a7
+; LMULMAX1-RV32-NEXT: mul a1, a1, a2
 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT: addi a1, a1, 32
 ; LMULMAX1-RV32-NEXT: j .LBB3_6
 ; LMULMAX1-RV32-NEXT: .LBB3_5:
-; LMULMAX1-RV32-NEXT: srli a1, a5, 1
-; LMULMAX1-RV32-NEXT: or a1, a5, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 2
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 8
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
-; LMULMAX1-RV32-NEXT: srli a2, a1, 16
-; LMULMAX1-RV32-NEXT: or a1, a1, a2
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 2
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 4
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 8
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
+; LMULMAX1-RV32-NEXT: srli a5, a1, 16
+; LMULMAX1-RV32-NEXT: or a1, a1, a5
 ; LMULMAX1-RV32-NEXT: not a1, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 1
-; LMULMAX1-RV32-NEXT: and a2, a2, a4
-; LMULMAX1-RV32-NEXT: sub a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a2, a1, a3
+; LMULMAX1-RV32-NEXT: srli a5, a1, 1
+; LMULMAX1-RV32-NEXT: and a4, a5, a4
+; LMULMAX1-RV32-NEXT: sub a1, a1, a4
+; LMULMAX1-RV32-NEXT: and a4, a1, a3
 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2
 ; LMULMAX1-RV32-NEXT: and a1, a1, a3
-; LMULMAX1-RV32-NEXT: add a1, a2, a1
-; LMULMAX1-RV32-NEXT: srli a2, a1, 4
-; LMULMAX1-RV32-NEXT: add a1, a1, a2
-; LMULMAX1-RV32-NEXT: and a1, a1, a6
-; LMULMAX1-RV32-NEXT: mul a1, a1, a7
+; LMULMAX1-RV32-NEXT: add a1, a4, a1
+; LMULMAX1-RV32-NEXT: srli a3, a1, 4
+; LMULMAX1-RV32-NEXT: add a1, a1, a3
+; LMULMAX1-RV32-NEXT: and a1, a1, a7
+; LMULMAX1-RV32-NEXT: mul a1, a1, a2
 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT: .LBB3_6:
 ; LMULMAX1-RV32-NEXT: sw a1, 24(sp)
@@ -8511,8 +8511,8 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV64-NEXT: vle16.v v26, (a0)
 ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
 ; LMULMAX2-RV64-NEXT: lui a1, 16
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -1
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
+; LMULMAX2-RV64-NEXT: addiw a6, a1, -1
+; LMULMAX2-RV64-NEXT: and a2, a2, a6
 ; LMULMAX2-RV64-NEXT: srli a3, a2, 1
 ; LMULMAX2-RV64-NEXT: or a2, a2, a3
 ; LMULMAX2-RV64-NEXT: srli a3, a2, 2
@@ -8534,8 +8534,8 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV64-NEXT: slli a2, a2, 12
 ; LMULMAX2-RV64-NEXT: addi a2, a2, 1365
 ; LMULMAX2-RV64-NEXT: slli a2, a2, 12
-; LMULMAX2-RV64-NEXT: addi a6, a2, 1365
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
+; LMULMAX2-RV64-NEXT: addi a7, a2, 1365
+; LMULMAX2-RV64-NEXT: and a4, a4, a7
 ; LMULMAX2-RV64-NEXT: sub a4, a3, a4
 ; LMULMAX2-RV64-NEXT: lui a3, 13107
 ; LMULMAX2-RV64-NEXT: addiw a3, a3, 819
@@ -8558,442 +8558,22 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV64-NEXT: slli a4, a4, 12
 ; LMULMAX2-RV64-NEXT: addi a4, a4, 241
 ; LMULMAX2-RV64-NEXT: slli a4, a4, 12
-; LMULMAX2-RV64-NEXT: addi a7, a4, -241
-; LMULMAX2-RV64-NEXT: and a2, a5, a7
+; LMULMAX2-RV64-NEXT: addi a4, a4, -241
+; LMULMAX2-RV64-NEXT: and a1, a5, a4
 ; LMULMAX2-RV64-NEXT: lui a5, 4112
 ; LMULMAX2-RV64-NEXT: addiw a5, a5, 257
 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16
 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257
 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16
 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 32(sp)
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 32(sp)
 ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, m2, ta, mu
 ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 62(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 60(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 58(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 56(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 54(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 52(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 50(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 48(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 46(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 44(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 42(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 40(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 38(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28
-; LMULMAX2-RV64-NEXT: and a2, a2, a1
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 2
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 8
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 16
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: srli a4, a2, 32
-; LMULMAX2-RV64-NEXT: or a2, a2, a4
-; LMULMAX2-RV64-NEXT: not a2, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 1
-; LMULMAX2-RV64-NEXT: and a4, a4, a6
-; LMULMAX2-RV64-NEXT: sub a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a4, a2, a3
-; LMULMAX2-RV64-NEXT: srli a2, a2, 2
-; LMULMAX2-RV64-NEXT: and a2, a2, a3
-; LMULMAX2-RV64-NEXT: add a2, a4, a2
-; LMULMAX2-RV64-NEXT: srli a4, a2, 4
-; LMULMAX2-RV64-NEXT: add a2, a2, a4
-; LMULMAX2-RV64-NEXT: and a2, a2, a7
-; LMULMAX2-RV64-NEXT: mul a2, a2, a5
-; LMULMAX2-RV64-NEXT: srli a2, a2, 56
-; LMULMAX2-RV64-NEXT: addiw a2, a2, -48
-; LMULMAX2-RV64-NEXT: sh a2, 36(sp)
-; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1
-; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26
-; LMULMAX2-RV64-NEXT: and a1, a2, a1
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1
 ; LMULMAX2-RV64-NEXT: or a1, a1, a2
 ; LMULMAX2-RV64-NEXT: srli a2, a1, 2
@@ -9008,7 +8588,7 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV64-NEXT: or a1, a1, a2
 ; LMULMAX2-RV64-NEXT: not a1, a1
 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1
-; LMULMAX2-RV64-NEXT: and a2, a2, a6
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
 ; LMULMAX2-RV64-NEXT: sub a1, a1, a2
 ; LMULMAX2-RV64-NEXT: and a2, a1, a3
 ; LMULMAX2-RV64-NEXT: srli a1, a1, 2
@@ -9016,20 +8596,440 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV64-NEXT: add a1, a2, a1
 ; LMULMAX2-RV64-NEXT: srli a2, a1, 4
 ; LMULMAX2-RV64-NEXT: add a1, a1, a2
-; LMULMAX2-RV64-NEXT: and a1, a1, a7
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
-; LMULMAX2-RV64-NEXT: sh a1, 34(sp)
-; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu
-; LMULMAX2-RV64-NEXT: addi a1, sp, 32
-; LMULMAX2-RV64-NEXT: vle16.v v26, (a1)
-; LMULMAX2-RV64-NEXT: vse16.v v26, (a0)
-; LMULMAX2-RV64-NEXT: addi sp, s0, -96
-; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT: addi sp, sp, 96
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64-NEXT: sh a1, 62(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 60(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 58(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 56(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 54(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 52(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 50(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 48(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 46(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 44(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 42(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 40(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 38(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 36(sp)
+; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1
+; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26
+; LMULMAX2-RV64-NEXT: and a1, a1, a6
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 2
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 8
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 16
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: srli a2, a1, 32
+; LMULMAX2-RV64-NEXT: or a1, a1, a2
+; LMULMAX2-RV64-NEXT: not a1, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 1
+; LMULMAX2-RV64-NEXT: and a2, a2, a7
+; LMULMAX2-RV64-NEXT: sub a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a2, a1, a3
+; LMULMAX2-RV64-NEXT: srli a1, a1, 2
+; LMULMAX2-RV64-NEXT: and a1, a1, a3
+; LMULMAX2-RV64-NEXT: add a1, a2, a1
+; LMULMAX2-RV64-NEXT: srli a2, a1, 4
+; LMULMAX2-RV64-NEXT: add a1, a1, a2
+; LMULMAX2-RV64-NEXT: and a1, a1, a4
+; LMULMAX2-RV64-NEXT: mul a1, a1, a5
+; LMULMAX2-RV64-NEXT: srli a1, a1, 56
+; LMULMAX2-RV64-NEXT: addiw a1, a1, -48
+; LMULMAX2-RV64-NEXT: sh a1, 34(sp)
+; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; LMULMAX2-RV64-NEXT: addi a1, sp, 32
+; LMULMAX2-RV64-NEXT: vle16.v v26, (a1)
+; LMULMAX2-RV64-NEXT: vse16.v v26, (a0)
+; LMULMAX2-RV64-NEXT: addi sp, s0, -96
+; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT: addi sp, sp, 96
+; LMULMAX2-RV64-NEXT: ret
 ;
 ; LMULMAX1-RV32-LABEL: ctlz_v16i16:
 ; LMULMAX1-RV32: # %bb.0:
@@ -9516,8 +9516,8 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX1-RV64-NEXT: vle16.v v25, (a0)
 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT: lui a2, 16
-; LMULMAX1-RV64-NEXT: addiw a2, a2, -1
-; LMULMAX1-RV64-NEXT: and a1, a1, a2
+; LMULMAX1-RV64-NEXT: addiw a7, a2, -1
+; LMULMAX1-RV64-NEXT: and a1, a1, a7
 ; LMULMAX1-RV64-NEXT: srli a3, a1, 1
 ; LMULMAX1-RV64-NEXT: or a1, a1, a3
 ; LMULMAX1-RV64-NEXT: srli a3, a1, 2
@@ -9539,8 +9539,8 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX1-RV64-NEXT: slli a3, a3, 12
 ; LMULMAX1-RV64-NEXT: addi a3, a3, 1365
 ; LMULMAX1-RV64-NEXT: slli a3, a3, 12
-; LMULMAX1-RV64-NEXT: addi a7, a3, 1365
-; LMULMAX1-RV64-NEXT: and a4, a4, a7
+; LMULMAX1-RV64-NEXT: addi t0, a3, 1365
+; LMULMAX1-RV64-NEXT: and a4, a4, t0
 ; LMULMAX1-RV64-NEXT: sub a1, a1, a4
 ; LMULMAX1-RV64-NEXT: lui a4, 13107
 ; LMULMAX1-RV64-NEXT: addiw a4, a4, 819
@@ -9563,441 +9563,441 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX1-RV64-NEXT: slli a5, a5, 12
 ; LMULMAX1-RV64-NEXT: addi a5, a5, 241
 ; LMULMAX1-RV64-NEXT: slli a5, a5, 12
-; LMULMAX1-RV64-NEXT: addi t0, a5, -241
-; LMULMAX1-RV64-NEXT: and a3, a1, t0
+; LMULMAX1-RV64-NEXT: addi a5, a5, -241
+; LMULMAX1-RV64-NEXT: and a2, a1, a5
 ; LMULMAX1-RV64-NEXT: lui a1, 4112
 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 257
 ; LMULMAX1-RV64-NEXT: slli a1, a1, 16
 ; LMULMAX1-RV64-NEXT: addi a1, a1, 257
 ; LMULMAX1-RV64-NEXT: slli a1, a1, 16
 ; LMULMAX1-RV64-NEXT: addi a1, a1, 257
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 32(sp)
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addiw a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 32(sp)
 ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu
 ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 7
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
 ; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 46(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addiw a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 46(sp)
 ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 6
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
 ; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 44(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addiw a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 44(sp)
 ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 5
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
 ; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 42(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addiw a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 42(sp)
 ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 4
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
 ; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 40(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addiw a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 40(sp)
 ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
 ; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 38(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addiw a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 38(sp)
 ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 8
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 16
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 32
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: not a2, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
 ; LMULMAX1-RV64-NEXT: and a3, a3, t0
-; LMULMAX1-RV64-NEXT: mul a3, a3, a1
-; LMULMAX1-RV64-NEXT: srli a3, a3, 56
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -48
-; LMULMAX1-RV64-NEXT: sh a3, 36(sp)
+; LMULMAX1-RV64-NEXT: sub a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a3, a2, a4
+; LMULMAX1-RV64-NEXT: srli a2, a2, 2
+; LMULMAX1-RV64-NEXT: and a2, a2, a4
+; LMULMAX1-RV64-NEXT: add a2, a3, a2
+; LMULMAX1-RV64-NEXT: srli a3, a2, 4
+; LMULMAX1-RV64-NEXT: add a2, a2, a3
+; LMULMAX1-RV64-NEXT: and a2, a2, a5
+; LMULMAX1-RV64-NEXT: mul a2, a2, a1
+; LMULMAX1-RV64-NEXT: srli a2, a2, 56
+; LMULMAX1-RV64-NEXT: addiw a2, a2, -48
+; LMULMAX1-RV64-NEXT: sh a2, 36(sp)
 ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1
-; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26
-; LMULMAX1-RV64-NEXT: and a3, a3, a2
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 2
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 8
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 16
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: srli a5, a3, 32
-; LMULMAX1-RV64-NEXT: or a3, a3, a5
-; LMULMAX1-RV64-NEXT: not a3, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 1
-; LMULMAX1-RV64-NEXT: and a5, a5, a7
-; LMULMAX1-RV64-NEXT: sub a3, a3, a5
-; LMULMAX1-RV64-NEXT: and a5, a3, a4
-; LMULMAX1-RV64-NEXT: srli a3, a3, 2
-; LMULMAX1-RV64-NEXT: and a3, a3, a4
-; LMULMAX1-RV64-NEXT: add a3, a5, a3
-; LMULMAX1-RV64-NEXT: srli a5, a3, 4
-; LMULMAX1-RV64-NEXT: add a3, a3, a5
+; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT: and a2, a2, a7
+; LMULMAX1-RV64-NEXT: srli a3, a2, 1
+; LMULMAX1-RV64-NEXT: or a2, a2, a3
+; LMULMAX1-RV64-NEXT: srli a3, a2, 2
LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -48 -; LMULMAX1-RV64-NEXT: sh a3, 34(sp) -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v25 -; LMULMAX1-RV64-NEXT: and a3, a3, a2 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 2 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 8 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 16 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 32 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 34(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -48 -; LMULMAX1-RV64-NEXT: sh a3, 16(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 16(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: and a3, a3, a2 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 2 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: or 
a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 8 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 16 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 32 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -48 -; LMULMAX1-RV64-NEXT: sh a3, 30(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: and a3, a3, a2 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 2 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 8 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 16 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 32 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; 
LMULMAX1-RV64-NEXT: addiw a3, a3, -48 -; LMULMAX1-RV64-NEXT: sh a3, 28(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: and a3, a3, a2 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 2 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 8 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 16 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 32 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -48 -; LMULMAX1-RV64-NEXT: sh a3, 26(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: and a3, a3, a2 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 2 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 8 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 16 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 32 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; 
LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -48 -; LMULMAX1-RV64-NEXT: sh a3, 24(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: and a3, a3, a2 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 2 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 8 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 16 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 32 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -48 -; LMULMAX1-RV64-NEXT: sh a3, 22(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: 
addiw a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: and a3, a3, a2 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 2 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 8 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 16 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 32 -; LMULMAX1-RV64-NEXT: or a3, a3, a5 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -48 -; LMULMAX1-RV64-NEXT: sh a3, 20(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v25 -; LMULMAX1-RV64-NEXT: and a2, a3, a2 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: or a2, a2, a3 ; LMULMAX1-RV64-NEXT: srli a3, a2, 2 @@ -10012,7 +10012,7 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX1-RV64-NEXT: or a2, a2, a3 ; LMULMAX1-RV64-NEXT: not a2, a2 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: and a3, a3, a7 +; LMULMAX1-RV64-NEXT: and a3, a3, t0 ; LMULMAX1-RV64-NEXT: sub a2, a2, a3 ; LMULMAX1-RV64-NEXT: and a3, a2, a4 ; LMULMAX1-RV64-NEXT: srli a2, a2, 2 @@ -10020,7 +10020,7 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX1-RV64-NEXT: add a2, a3, a2 ; LMULMAX1-RV64-NEXT: srli a3, a2, 4 ; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, t0 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addiw a1, a1, -48 @@ -11136,240 +11136,240 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: sw zero, 52(sp) ; LMULMAX2-RV32-NEXT: sw zero, 44(sp) ; LMULMAX2-RV32-NEXT: sw zero, 36(sp) -; LMULMAX2-RV32-NEXT: 
addi a5, zero, 32 +; LMULMAX2-RV32-NEXT: addi a6, zero, 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a5 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: lui a2, 349525 -; LMULMAX2-RV32-NEXT: addi a4, a2, 1365 -; LMULMAX2-RV32-NEXT: lui a2, 209715 -; LMULMAX2-RV32-NEXT: addi a3, a2, 819 -; LMULMAX2-RV32-NEXT: lui a2, 61681 -; LMULMAX2-RV32-NEXT: addi a6, a2, -241 -; LMULMAX2-RV32-NEXT: lui a2, 4112 -; LMULMAX2-RV32-NEXT: addi a7, a2, 257 -; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_2 +; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a6 +; LMULMAX2-RV32-NEXT: vmv.x.s a5, v28 +; LMULMAX2-RV32-NEXT: lui a1, 349525 +; LMULMAX2-RV32-NEXT: addi a4, a1, 1365 +; LMULMAX2-RV32-NEXT: lui a1, 209715 +; LMULMAX2-RV32-NEXT: addi a3, a1, 819 +; LMULMAX2-RV32-NEXT: lui a1, 61681 +; LMULMAX2-RV32-NEXT: addi a7, a1, -241 +; LMULMAX2-RV32-NEXT: lui a1, 4112 +; LMULMAX2-RV32-NEXT: addi a2, a1, 257 +; LMULMAX2-RV32-NEXT: bnez a5, .LBB7_2 ; LMULMAX2-RV32-NEXT: # %bb.1: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 -; LMULMAX2-RV32-NEXT: srli a1, a1, 2 -; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a5, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a5, a1, a3 +; LMULMAX2-RV32-NEXT: srli a1, a1, 2 +; LMULMAX2-RV32-NEXT: and a1, a1, a3 +; LMULMAX2-RV32-NEXT: add a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: addi a1, a1, 32 +; LMULMAX2-RV32-NEXT: addi a5, a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB7_3 ; LMULMAX2-RV32-NEXT: .LBB7_2: -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a1, a5, 1 +; LMULMAX2-RV32-NEXT: or a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: 
srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a5, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a5, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 -; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: add a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_3: ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a5 -; LMULMAX2-RV32-NEXT: vmv.x.s a2, v30 -; LMULMAX2-RV32-NEXT: sw a1, 32(sp) -; LMULMAX2-RV32-NEXT: bnez a2, .LBB7_5 +; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a6 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v30 +; LMULMAX2-RV32-NEXT: sw a5, 32(sp) +; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_5 ; LMULMAX2-RV32-NEXT: # %bb.4: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a5, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a5, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: add a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: addi a1, a1, 32 +; LMULMAX2-RV32-NEXT: addi a5, a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB7_6 ; LMULMAX2-RV32-NEXT: .LBB7_5: -; LMULMAX2-RV32-NEXT: srli a1, a2, 1 -; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or 
a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a5, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a5, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 -; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: add a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_6: ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a5 -; LMULMAX2-RV32-NEXT: vmv.x.s a2, v30 -; LMULMAX2-RV32-NEXT: sw a1, 56(sp) -; LMULMAX2-RV32-NEXT: bnez a2, .LBB7_8 +; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a6 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v30 +; LMULMAX2-RV32-NEXT: sw a5, 56(sp) +; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_8 ; LMULMAX2-RV32-NEXT: # %bb.7: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a5, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a5, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: add a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: addi a1, a1, 32 +; LMULMAX2-RV32-NEXT: addi a5, a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB7_9 ; LMULMAX2-RV32-NEXT: .LBB7_8: -; LMULMAX2-RV32-NEXT: srli a1, a2, 1 -; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, 
a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a5, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a5, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 -; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: add a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a5 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_9: ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a5 -; LMULMAX2-RV32-NEXT: vmv.x.s a2, v28 -; LMULMAX2-RV32-NEXT: sw a1, 48(sp) -; LMULMAX2-RV32-NEXT: bnez a2, .LBB7_11 +; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a6 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 +; LMULMAX2-RV32-NEXT: sw a5, 48(sp) +; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_11 ; LMULMAX2-RV32-NEXT: # %bb.10: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a4, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a4 +; LMULMAX2-RV32-NEXT: and a4, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: add a1, a4, a1 +; LMULMAX2-RV32-NEXT: srli a3, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a3 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, 32 ; LMULMAX2-RV32-NEXT: j 
.LBB7_12 ; LMULMAX2-RV32-NEXT: .LBB7_11: -; LMULMAX2-RV32-NEXT: srli a1, a2, 1 -; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a5, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: and a2, a2, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a2, a1, a3 +; LMULMAX2-RV32-NEXT: srli a5, a1, 1 +; LMULMAX2-RV32-NEXT: and a4, a5, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a4 +; LMULMAX2-RV32-NEXT: and a4, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a2, a1 -; LMULMAX2-RV32-NEXT: srli a2, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a6 -; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: add a1, a4, a1 +; LMULMAX2-RV32-NEXT: srli a3, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a3 +; LMULMAX2-RV32-NEXT: and a1, a1, a7 +; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_12: ; LMULMAX2-RV32-NEXT: sw a1, 40(sp) @@ -11557,193 +11557,193 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle64.v v26, (a6) -; LMULMAX1-RV32-NEXT: sw zero, 44(sp) -; LMULMAX1-RV32-NEXT: sw zero, 36(sp) -; LMULMAX1-RV32-NEXT: addi a1, zero, 32 -; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a1 -; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV32-NEXT: lui a3, 349525 -; LMULMAX1-RV32-NEXT: addi a5, a3, 1365 -; LMULMAX1-RV32-NEXT: lui a3, 209715 -; LMULMAX1-RV32-NEXT: addi a4, a3, 819 -; LMULMAX1-RV32-NEXT: lui a3, 61681 -; LMULMAX1-RV32-NEXT: addi a7, a3, -241 -; LMULMAX1-RV32-NEXT: lui a3, 4112 -; LMULMAX1-RV32-NEXT: addi t0, a3, 257 -; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_2 -; LMULMAX1-RV32-NEXT: # %bb.1: -; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 2 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 8 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 16 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: not a2, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: and a3, a3, a5 -; LMULMAX1-RV32-NEXT: sub a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a3, a2, a4 -; LMULMAX1-RV32-NEXT: srli a2, a2, 2 -; LMULMAX1-RV32-NEXT: and a2, a2, a4 -; LMULMAX1-RV32-NEXT: add a2, a3, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: add a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a2, a2, a7 -; LMULMAX1-RV32-NEXT: mul a2, a2, t0 -; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: addi a2, a2, 32 +; LMULMAX1-RV32-NEXT: sw zero, 44(sp) +; LMULMAX1-RV32-NEXT: sw zero, 
36(sp) +; LMULMAX1-RV32-NEXT: addi a7, zero, 32 +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 +; LMULMAX1-RV32-NEXT: lui a2, 349525 +; LMULMAX1-RV32-NEXT: addi a5, a2, 1365 +; LMULMAX1-RV32-NEXT: lui a2, 209715 +; LMULMAX1-RV32-NEXT: addi a4, a2, 819 +; LMULMAX1-RV32-NEXT: lui a2, 61681 +; LMULMAX1-RV32-NEXT: addi t0, a2, -241 +; LMULMAX1-RV32-NEXT: lui a2, 4112 +; LMULMAX1-RV32-NEXT: addi a3, a2, 257 +; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_2 +; LMULMAX1-RV32-NEXT: # %bb.1: +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: not a1, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a5 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a4 +; LMULMAX1-RV32-NEXT: srli a1, a1, 2 +; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, t0 +; LMULMAX1-RV32-NEXT: mul a1, a1, a3 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: addi a1, a1, 32 ; LMULMAX1-RV32-NEXT: j .LBB7_3 ; LMULMAX1-RV32-NEXT: .LBB7_2: -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 2 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 8 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 16 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: not a2, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: and a3, a3, a5 -; LMULMAX1-RV32-NEXT: sub a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a3, a2, a4 -; LMULMAX1-RV32-NEXT: srli a2, a2, 2 -; LMULMAX1-RV32-NEXT: and a2, a2, a4 -; LMULMAX1-RV32-NEXT: add a2, a3, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: add a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a2, a2, a7 -; LMULMAX1-RV32-NEXT: mul a2, a2, t0 -; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: not a1, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a5 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a4 +; LMULMAX1-RV32-NEXT: srli a1, a1, 2 +; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, t0 +; LMULMAX1-RV32-NEXT: mul a1, a1, a3 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_3: ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a1 -; LMULMAX1-RV32-NEXT: vmv.x.s a3, v27 -; LMULMAX1-RV32-NEXT: sw a2, 32(sp) -; LMULMAX1-RV32-NEXT: bnez a3, .LBB7_5 +; 
LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV32-NEXT: sw a1, 32(sp) +; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_5 ; LMULMAX1-RV32-NEXT: # %bb.4: -; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 2 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 8 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 16 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: not a2, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: and a3, a3, a5 -; LMULMAX1-RV32-NEXT: sub a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a3, a2, a4 -; LMULMAX1-RV32-NEXT: srli a2, a2, 2 -; LMULMAX1-RV32-NEXT: and a2, a2, a4 -; LMULMAX1-RV32-NEXT: add a2, a3, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: add a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a2, a2, a7 -; LMULMAX1-RV32-NEXT: mul a2, a2, t0 -; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: addi a2, a2, 32 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: not a1, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a5 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a4 +; LMULMAX1-RV32-NEXT: srli a1, a1, 2 +; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, t0 +; LMULMAX1-RV32-NEXT: mul a1, a1, a3 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: addi a1, a1, 32 ; LMULMAX1-RV32-NEXT: j .LBB7_6 ; LMULMAX1-RV32-NEXT: .LBB7_5: -; LMULMAX1-RV32-NEXT: srli a2, a3, 1 -; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 2 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 8 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 16 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: not a2, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: and a3, a3, a5 -; LMULMAX1-RV32-NEXT: sub a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a3, a2, a4 -; LMULMAX1-RV32-NEXT: srli a2, a2, 2 -; LMULMAX1-RV32-NEXT: and a2, a2, a4 -; LMULMAX1-RV32-NEXT: add a2, a3, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: add a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a2, a2, a7 -; LMULMAX1-RV32-NEXT: mul a2, a2, t0 -; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: srli a1, a2, 1 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: not a1, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a5 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: 
and a2, a1, a4 +; LMULMAX1-RV32-NEXT: srli a1, a1, 2 +; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, t0 +; LMULMAX1-RV32-NEXT: mul a1, a1, a3 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_6: -; LMULMAX1-RV32-NEXT: sw a2, 40(sp) +; LMULMAX1-RV32-NEXT: sw a1, 40(sp) ; LMULMAX1-RV32-NEXT: sw zero, 28(sp) -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a1 -; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: sw zero, 20(sp) -; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_8 +; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_8 ; LMULMAX1-RV32-NEXT: # %bb.7: -; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 2 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 8 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 16 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: not a2, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: and a3, a3, a5 -; LMULMAX1-RV32-NEXT: sub a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a3, a2, a4 -; LMULMAX1-RV32-NEXT: srli a2, a2, 2 -; LMULMAX1-RV32-NEXT: and a2, a2, a4 -; LMULMAX1-RV32-NEXT: add a2, a3, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: add a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a2, a2, a7 -; LMULMAX1-RV32-NEXT: mul a2, a2, t0 -; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: addi a2, a2, 32 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: not a1, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a5 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a4 +; LMULMAX1-RV32-NEXT: srli a1, a1, 2 +; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, t0 +; LMULMAX1-RV32-NEXT: mul a1, a1, a3 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: addi a1, a1, 32 ; LMULMAX1-RV32-NEXT: j .LBB7_9 ; LMULMAX1-RV32-NEXT: .LBB7_8: -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 2 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 8 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: srli a3, a2, 16 -; LMULMAX1-RV32-NEXT: or a2, a2, a3 -; LMULMAX1-RV32-NEXT: not a2, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 1 -; LMULMAX1-RV32-NEXT: and a3, a3, a5 -; LMULMAX1-RV32-NEXT: sub a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a3, a2, a4 -; LMULMAX1-RV32-NEXT: srli a2, a2, 2 -; LMULMAX1-RV32-NEXT: and a2, a2, a4 -; LMULMAX1-RV32-NEXT: add a2, a3, a2 -; LMULMAX1-RV32-NEXT: srli a3, a2, 4 -; LMULMAX1-RV32-NEXT: add a2, a2, a3 -; LMULMAX1-RV32-NEXT: and a2, a2, a7 -; LMULMAX1-RV32-NEXT: mul a2, 
a2, t0 -; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: not a1, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a5 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a4 +; LMULMAX1-RV32-NEXT: srli a1, a1, 2 +; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, t0 +; LMULMAX1-RV32-NEXT: mul a1, a1, a3 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_9: ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a1 -; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV32-NEXT: sw a2, 16(sp) -; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_11 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_11 ; LMULMAX1-RV32-NEXT: # %bb.10: ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 1 @@ -11766,14 +11766,14 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: add a1, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a1, 4 ; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, a7 -; LMULMAX1-RV32-NEXT: mul a1, a1, t0 +; LMULMAX1-RV32-NEXT: and a1, a1, t0 +; LMULMAX1-RV32-NEXT: mul a1, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, 32 ; LMULMAX1-RV32-NEXT: j .LBB7_12 ; LMULMAX1-RV32-NEXT: .LBB7_11: -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a1, a2, 1 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a1, 2 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 ; LMULMAX1-RV32-NEXT: srli a2, a1, 4 @@ -11792,8 +11792,8 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: add a1, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a1, 4 ; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, a7 -; LMULMAX1-RV32-NEXT: mul a1, a1, t0 +; LMULMAX1-RV32-NEXT: and a1, a1, t0 +; LMULMAX1-RV32-NEXT: mul a1, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_12: ; LMULMAX1-RV32-NEXT: sw a1, 24(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index 1a0e864..b1eab3d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -2208,8 +2208,8 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: slli a1, a1, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 +; LMULMAX2-RV64-NEXT: slli a6, a1, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a6 ; LMULMAX2-RV64-NEXT: addi a3, a2, -1 ; LMULMAX2-RV64-NEXT: not a2, a2 ; LMULMAX2-RV64-NEXT: and a3, a2, a3 @@ -2221,8 +2221,8 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 ; LMULMAX2-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 -; LMULMAX2-RV64-NEXT: 
addi a6, a2, 1365 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: addi a7, a2, 1365 +; LMULMAX2-RV64-NEXT: and a4, a4, a7 ; LMULMAX2-RV64-NEXT: sub a4, a3, a4 ; LMULMAX2-RV64-NEXT: lui a3, 13107 ; LMULMAX2-RV64-NEXT: addiw a3, a3, 819 @@ -2245,62 +2245,62 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 ; LMULMAX2-RV64-NEXT: addi a4, a4, 241 ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 -; LMULMAX2-RV64-NEXT: addi a7, a4, -241 -; LMULMAX2-RV64-NEXT: and a2, a5, a7 +; LMULMAX2-RV64-NEXT: addi a4, a4, -241 +; LMULMAX2-RV64-NEXT: and a1, a5, a4 ; LMULMAX2-RV64-NEXT: lui a5, 4112 ; LMULMAX2-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 28(sp) +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 24(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 20(sp) -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; 
LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 20(sp) +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 ; LMULMAX2-RV64-NEXT: addi a2, a1, -1 ; LMULMAX2-RV64-NEXT: not a1, a1 ; LMULMAX2-RV64-NEXT: and a1, a1, a2 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a6 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 ; LMULMAX2-RV64-NEXT: sub a1, a1, a2 ; LMULMAX2-RV64-NEXT: and a2, a1, a3 ; LMULMAX2-RV64-NEXT: srli a1, a1, 2 @@ -2308,7 +2308,7 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV64-NEXT: add a1, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 4 ; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a7 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: sw a1, 16(sp) @@ -2422,8 +2422,8 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: addi a1, zero, 1 -; LMULMAX1-RV64-NEXT: slli a1, a1, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a1 +; LMULMAX1-RV64-NEXT: slli a6, a1, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a6 ; LMULMAX1-RV64-NEXT: addi a3, a2, -1 ; LMULMAX1-RV64-NEXT: not a2, a2 ; LMULMAX1-RV64-NEXT: and a3, a2, a3 @@ -2435,8 +2435,8 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX1-RV64-NEXT: slli a2, a2, 12 ; LMULMAX1-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a6, a2, 1365 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: addi a7, a2, 1365 +; LMULMAX1-RV64-NEXT: and a4, a4, a7 ; LMULMAX1-RV64-NEXT: sub a4, a3, a4 ; LMULMAX1-RV64-NEXT: lui a3, 13107 ; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 @@ -2459,62 +2459,62 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 ; LMULMAX1-RV64-NEXT: addi a4, a4, 241 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a7, a4, -241 -; LMULMAX1-RV64-NEXT: and a2, a5, a7 +; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: and a1, a5, a4 ; LMULMAX1-RV64-NEXT: lui a5, 4112 ; LMULMAX1-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX1-RV64-NEXT: slli a5, a5, 16 ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 ; LMULMAX1-RV64-NEXT: slli a5, a5, 16 ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 28(sp) +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: sw a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: or a2, a2, a1 -; LMULMAX1-RV64-NEXT: addi a4, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: or a1, a1, a6 +; LMULMAX1-RV64-NEXT: addi a2, a1, -1 
+; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: and a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 24(sp) +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: sw a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: or a2, a2, a1 -; LMULMAX1-RV64-NEXT: addi a4, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a6 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: or a1, a1, a6 +; LMULMAX1-RV64-NEXT: addi a2, a1, -1 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: and a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: mul a2, a2, a5 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 20(sp) -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: sw a1, 20(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: or a1, a1, a6 ; LMULMAX1-RV64-NEXT: addi a2, a1, -1 ; LMULMAX1-RV64-NEXT: not a1, a1 ; LMULMAX1-RV64-NEXT: and a1, a1, a2 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: and a2, a2, a6 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 ; LMULMAX1-RV64-NEXT: sub a1, a1, a2 ; LMULMAX1-RV64-NEXT: and a2, a1, a3 ; LMULMAX1-RV64-NEXT: srli a1, a1, 2 @@ -2522,7 +2522,7 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX1-RV64-NEXT: add a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a1, 4 ; LMULMAX1-RV64-NEXT: add a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a1, a1, a7 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: sw a1, 16(sp) @@ -7089,8 +7089,8 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 ; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: slli a1, a1, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 +; LMULMAX2-RV64-NEXT: slli a6, a1, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a6 ; LMULMAX2-RV64-NEXT: addi a3, a2, -1 ; LMULMAX2-RV64-NEXT: not a2, a2 ; LMULMAX2-RV64-NEXT: and a3, a2, a3 @@ -7102,8 +7102,8 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 ; LMULMAX2-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX2-RV64-NEXT: 
slli a2, a2, 12 -; LMULMAX2-RV64-NEXT: addi a6, a2, 1365 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: addi a7, a2, 1365 +; LMULMAX2-RV64-NEXT: and a4, a4, a7 ; LMULMAX2-RV64-NEXT: sub a4, a3, a4 ; LMULMAX2-RV64-NEXT: lui a3, 13107 ; LMULMAX2-RV64-NEXT: addiw a3, a3, 819 @@ -7126,138 +7126,138 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 ; LMULMAX2-RV64-NEXT: addi a4, a4, 241 ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 -; LMULMAX2-RV64-NEXT: addi a7, a4, -241 -; LMULMAX2-RV64-NEXT: and a2, a5, a7 +; LMULMAX2-RV64-NEXT: addi a4, a4, -241 +; LMULMAX2-RV64-NEXT: and a1, a5, a4 ; LMULMAX2-RV64-NEXT: lui a5, 4112 ; LMULMAX2-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 60(sp) +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 56(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 52(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; 
LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 48(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 44(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; 
LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 40(sp) +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 -; LMULMAX2-RV64-NEXT: or a2, a2, a1 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: mul a2, a2, a5 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 36(sp) -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 36(sp) +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 ; LMULMAX2-RV64-NEXT: addi a2, a1, -1 ; LMULMAX2-RV64-NEXT: not a1, a1 ; LMULMAX2-RV64-NEXT: and a1, a1, a2 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a6 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 ; LMULMAX2-RV64-NEXT: sub a1, a1, a2 ; LMULMAX2-RV64-NEXT: and a2, a1, a3 ; LMULMAX2-RV64-NEXT: srli a1, a1, 2 @@ -7265,7 +7265,7 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: add a1, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 4 ; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a7 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: sw a1, 32(sp) @@ -7458,8 +7458,8 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: addi a2, zero, 1 -; LMULMAX1-RV64-NEXT: slli a2, a2, 32 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: slli a7, a2, 32 +; LMULMAX1-RV64-NEXT: or a1, a1, a7 ; LMULMAX1-RV64-NEXT: addi a3, a1, -1 ; LMULMAX1-RV64-NEXT: not a1, a1 ; LMULMAX1-RV64-NEXT: and a1, a1, a3 
@@ -7471,8 +7471,8 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV64-NEXT: slli a3, a3, 12 ; LMULMAX1-RV64-NEXT: addi a3, a3, 1365 ; LMULMAX1-RV64-NEXT: slli a3, a3, 12 -; LMULMAX1-RV64-NEXT: addi a7, a3, 1365 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 +; LMULMAX1-RV64-NEXT: addi t0, a3, 1365 +; LMULMAX1-RV64-NEXT: and a4, a4, t0 ; LMULMAX1-RV64-NEXT: sub a1, a1, a4 ; LMULMAX1-RV64-NEXT: lui a4, 13107 ; LMULMAX1-RV64-NEXT: addiw a4, a4, 819 @@ -7495,139 +7495,139 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV64-NEXT: slli a5, a5, 12 ; LMULMAX1-RV64-NEXT: addi a5, a5, 241 ; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi t0, a5, -241 -; LMULMAX1-RV64-NEXT: and a3, a1, t0 +; LMULMAX1-RV64-NEXT: addi a5, a5, -241 +; LMULMAX1-RV64-NEXT: and a2, a1, a5 ; LMULMAX1-RV64-NEXT: lui a1, 4112 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 ; LMULMAX1-RV64-NEXT: slli a1, a1, 16 ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 ; LMULMAX1-RV64-NEXT: slli a1, a1, 16 ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: sw a3, 32(sp) +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: sw a2, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 -; LMULMAX1-RV64-NEXT: or a3, a3, a2 -; LMULMAX1-RV64-NEXT: addi a5, a3, -1 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: and a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV64-NEXT: or a2, a2, a7 +; LMULMAX1-RV64-NEXT: addi a3, a2, -1 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: sw a3, 44(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: sw a2, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 -; LMULMAX1-RV64-NEXT: or a3, a3, a2 -; LMULMAX1-RV64-NEXT: addi a5, a3, -1 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: and a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV64-NEXT: or a2, a2, a7 +; LMULMAX1-RV64-NEXT: addi a3, a2, -1 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; 
LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: sw a3, 40(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: sw a2, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: or a3, a3, a2 -; LMULMAX1-RV64-NEXT: addi a5, a3, -1 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: and a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: or a2, a2, a7 +; LMULMAX1-RV64-NEXT: addi a3, a2, -1 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: sw a3, 36(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: sw a2, 36(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: or a3, a3, a2 -; LMULMAX1-RV64-NEXT: addi a5, a3, -1 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: and a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: or a2, a2, a7 +; LMULMAX1-RV64-NEXT: addi a3, a2, -1 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: sw a3, 28(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: sw a2, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: or a3, a3, a2 -; LMULMAX1-RV64-NEXT: addi a5, a3, -1 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: and a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; 
LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: or a2, a2, a7 +; LMULMAX1-RV64-NEXT: addi a3, a2, -1 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: sw a3, 24(sp) +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: sw a2, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX1-RV64-NEXT: or a3, a3, a2 -; LMULMAX1-RV64-NEXT: addi a5, a3, -1 -; LMULMAX1-RV64-NEXT: not a3, a3 -; LMULMAX1-RV64-NEXT: and a3, a3, a5 -; LMULMAX1-RV64-NEXT: srli a5, a3, 1 -; LMULMAX1-RV64-NEXT: and a5, a5, a7 -; LMULMAX1-RV64-NEXT: sub a3, a3, a5 -; LMULMAX1-RV64-NEXT: and a5, a3, a4 -; LMULMAX1-RV64-NEXT: srli a3, a3, 2 -; LMULMAX1-RV64-NEXT: and a3, a3, a4 -; LMULMAX1-RV64-NEXT: add a3, a5, a3 -; LMULMAX1-RV64-NEXT: srli a5, a3, 4 -; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: or a2, a2, a7 +; LMULMAX1-RV64-NEXT: addi a3, a2, -1 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: mul a3, a3, a1 -; LMULMAX1-RV64-NEXT: srli a3, a3, 56 -; LMULMAX1-RV64-NEXT: sw a3, 20(sp) -; LMULMAX1-RV64-NEXT: vmv.x.s a3, v25 -; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sub a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a3, a2, a4 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 +; LMULMAX1-RV64-NEXT: add a2, a3, a2 +; LMULMAX1-RV64-NEXT: srli a3, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a3 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: mul a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: sw a2, 20(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV64-NEXT: or a2, a2, a7 ; LMULMAX1-RV64-NEXT: addi a3, a2, -1 ; LMULMAX1-RV64-NEXT: not a2, a2 ; LMULMAX1-RV64-NEXT: and a2, a2, a3 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: and a3, a3, a7 +; LMULMAX1-RV64-NEXT: and a3, a3, t0 ; LMULMAX1-RV64-NEXT: sub a2, a2, a3 ; LMULMAX1-RV64-NEXT: and a3, a2, a4 ; LMULMAX1-RV64-NEXT: srli a2, a2, 2 @@ -7635,7 +7635,7 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV64-NEXT: add a2, a3, a2 ; LMULMAX1-RV64-NEXT: srli a3, a2, 4 ; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, t0 +; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: sw a1, 16(sp) @@ -7990,11 +7990,11 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; 
LMULMAX1-RV32-NEXT: addi a6, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v26, (a6) +; LMULMAX1-RV32-NEXT: addi a7, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a7) ; LMULMAX1-RV32-NEXT: sw zero, 44(sp) ; LMULMAX1-RV32-NEXT: sw zero, 36(sp) -; LMULMAX1-RV32-NEXT: addi a7, zero, 32 +; LMULMAX1-RV32-NEXT: addi a6, zero, 32 ; LMULMAX1-RV32-NEXT: lui a1, 349525 ; LMULMAX1-RV32-NEXT: addi a5, a1, 1365 ; LMULMAX1-RV32-NEXT: lui a1, 209715 @@ -8007,7 +8007,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_2 ; LMULMAX1-RV32-NEXT: # %bb.1: ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7 +; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 @@ -8049,7 +8049,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_5 ; LMULMAX1-RV32-NEXT: # %bb.4: -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v26, a7 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v26, a6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 @@ -8091,7 +8091,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: sw zero, 20(sp) ; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_8 ; LMULMAX1-RV32-NEXT: # %bb.7: -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 @@ -8132,7 +8132,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_11 ; LMULMAX1-RV32-NEXT: # %bb.10: -; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a7 +; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 @@ -8176,7 +8176,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v26, (a6) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a7) ; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index 08ea1a0..cbdbec3 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -148,114 +148,114 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: lw t1, 8(a1) -; RV32I-NEXT: lw t6, 12(a1) +; RV32I-NEXT: lw t0, 8(a1) +; RV32I-NEXT: lw t4, 12(a1) ; RV32I-NEXT: addi a6, zero, 64 -; RV32I-NEXT: sub t5, a6, a2 +; RV32I-NEXT: sub t1, a6, a2 ; RV32I-NEXT: addi a3, zero, 32 -; RV32I-NEXT: sub s0, a3, a2 +; RV32I-NEXT: sub t5, a3, a2 ; RV32I-NEXT: addi t2, zero, 31 -; RV32I-NEXT: bltz s0, .LBB6_2 +; RV32I-NEXT: bltz t5, .LBB6_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sll a4, t1, s0 +; RV32I-NEXT: sll a3, t0, t5 ; RV32I-NEXT: j .LBB6_3 ; RV32I-NEXT: .LBB6_2: -; RV32I-NEXT: sll a3, t6, t5 -; RV32I-NEXT: sub a4, t2, t5 -; RV32I-NEXT: srli a5, t1, 1 +; RV32I-NEXT: sll a3, t4, t1 +; RV32I-NEXT: sub a4, t2, t1 +; RV32I-NEXT: srli a5, t0, 1 ; RV32I-NEXT: srl a4, a5, a4 -; RV32I-NEXT: or a4, a3, a4 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: .LBB6_3: -; RV32I-NEXT: lw a3, 
4(a1) -; RV32I-NEXT: addi t0, a2, -32 -; RV32I-NEXT: bgez t0, .LBB6_5 +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: addi t6, a2, -32 +; RV32I-NEXT: bgez t6, .LBB6_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: srl a5, a3, a2 -; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: srl a4, a5, a2 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: .LBB6_5: -; RV32I-NEXT: addi t4, a2, -96 +; RV32I-NEXT: addi a4, a2, -96 ; RV32I-NEXT: addi t3, a2, -64 -; RV32I-NEXT: bltz t4, .LBB6_7 +; RV32I-NEXT: bltz a4, .LBB6_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: mv a5, zero +; RV32I-NEXT: mv a7, zero ; RV32I-NEXT: bgeu a2, a6, .LBB6_8 ; RV32I-NEXT: j .LBB6_9 ; RV32I-NEXT: .LBB6_7: -; RV32I-NEXT: srl a5, t6, t3 +; RV32I-NEXT: srl a7, t4, t3 ; RV32I-NEXT: bltu a2, a6, .LBB6_9 ; RV32I-NEXT: .LBB6_8: -; RV32I-NEXT: mv a4, a5 +; RV32I-NEXT: mv a3, a7 ; RV32I-NEXT: .LBB6_9: -; RV32I-NEXT: mv a7, a3 +; RV32I-NEXT: mv a7, a5 ; RV32I-NEXT: beqz a2, .LBB6_11 ; RV32I-NEXT: # %bb.10: -; RV32I-NEXT: mv a7, a4 +; RV32I-NEXT: mv a7, a3 ; RV32I-NEXT: .LBB6_11: -; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: lw s0, 0(a1) ; RV32I-NEXT: sub t2, t2, a2 -; RV32I-NEXT: bltz t0, .LBB6_13 +; RV32I-NEXT: bltz t6, .LBB6_13 ; RV32I-NEXT: # %bb.12: -; RV32I-NEXT: srl a3, a3, t0 -; RV32I-NEXT: bltz s0, .LBB6_14 +; RV32I-NEXT: srl a5, a5, t6 +; RV32I-NEXT: bltz t5, .LBB6_14 ; RV32I-NEXT: j .LBB6_15 ; RV32I-NEXT: .LBB6_13: -; RV32I-NEXT: srl a4, a1, a2 -; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: sll a3, a3, t2 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: bgez s0, .LBB6_15 +; RV32I-NEXT: srl a3, s0, a2 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: sll a5, a5, t2 +; RV32I-NEXT: or a5, a3, a5 +; RV32I-NEXT: bgez t5, .LBB6_15 ; RV32I-NEXT: .LBB6_14: -; RV32I-NEXT: sll a4, t1, t5 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: sll a3, t0, t1 +; RV32I-NEXT: or a5, a5, a3 ; RV32I-NEXT: .LBB6_15: -; RV32I-NEXT: slli a4, t6, 1 -; RV32I-NEXT: bltz t4, .LBB6_17 +; RV32I-NEXT: slli a3, t4, 1 +; RV32I-NEXT: bltz a4, .LBB6_17 ; RV32I-NEXT: # %bb.16: -; RV32I-NEXT: srl a5, t6, t4 +; RV32I-NEXT: srl a4, t4, a4 ; RV32I-NEXT: bgeu a2, a6, .LBB6_18 ; RV32I-NEXT: j .LBB6_19 ; RV32I-NEXT: .LBB6_17: -; RV32I-NEXT: addi a5, zero, 95 -; RV32I-NEXT: sub a5, a5, a2 -; RV32I-NEXT: sll a5, a4, a5 -; RV32I-NEXT: srl s0, t1, t3 -; RV32I-NEXT: or a5, s0, a5 +; RV32I-NEXT: addi a4, zero, 95 +; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: sll a4, a3, a4 +; RV32I-NEXT: srl a1, t0, t3 +; RV32I-NEXT: or a4, a1, a4 ; RV32I-NEXT: bltu a2, a6, .LBB6_19 ; RV32I-NEXT: .LBB6_18: -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: .LBB6_19: ; RV32I-NEXT: bnez a2, .LBB6_22 ; RV32I-NEXT: # %bb.20: -; RV32I-NEXT: bltz t0, .LBB6_23 +; RV32I-NEXT: bltz t6, .LBB6_23 ; RV32I-NEXT: .LBB6_21: -; RV32I-NEXT: srl a3, t6, t0 +; RV32I-NEXT: srl a3, t4, t6 ; RV32I-NEXT: bgeu a2, a6, .LBB6_24 ; RV32I-NEXT: j .LBB6_25 ; RV32I-NEXT: .LBB6_22: -; RV32I-NEXT: mv a1, a3 -; RV32I-NEXT: bgez t0, .LBB6_21 +; RV32I-NEXT: mv s0, a5 +; RV32I-NEXT: bgez t6, .LBB6_21 ; RV32I-NEXT: .LBB6_23: -; RV32I-NEXT: srl a3, t1, a2 -; RV32I-NEXT: sll a4, a4, t2 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srl a1, t0, a2 +; RV32I-NEXT: sll a3, a3, t2 +; RV32I-NEXT: or a3, a1, a3 ; RV32I-NEXT: bltu a2, a6, .LBB6_25 ; RV32I-NEXT: .LBB6_24: ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: .LBB6_25: -; RV32I-NEXT: bltz t0, .LBB6_27 +; RV32I-NEXT: bltz t6, .LBB6_27 ; RV32I-NEXT: # %bb.26: ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: bgeu a2, a6, .LBB6_28 ; RV32I-NEXT: j .LBB6_29 ; RV32I-NEXT: .LBB6_27: -; RV32I-NEXT: srl a4, t6, a2 +; RV32I-NEXT: srl a4, t4, a2 ; 
RV32I-NEXT: bltu a2, a6, .LBB6_29 ; RV32I-NEXT: .LBB6_28: ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: .LBB6_29: ; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: sw a3, 8(a0) -; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw s0, 0(a0) ; RV32I-NEXT: sw a7, 4(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 @@ -289,115 +289,115 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: lw t6, 8(a1) -; RV32I-NEXT: lw s1, 12(a1) +; RV32I-NEXT: lw t2, 8(a1) +; RV32I-NEXT: lw t5, 12(a1) ; RV32I-NEXT: addi a6, zero, 64 -; RV32I-NEXT: sub t2, a6, a2 +; RV32I-NEXT: sub t1, a6, a2 ; RV32I-NEXT: addi a3, zero, 32 -; RV32I-NEXT: sub a3, a3, a2 +; RV32I-NEXT: sub t6, a3, a2 ; RV32I-NEXT: addi t4, zero, 31 -; RV32I-NEXT: bltz a3, .LBB7_2 +; RV32I-NEXT: bltz t6, .LBB7_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sll s0, t6, a3 +; RV32I-NEXT: sll s0, t2, t6 ; RV32I-NEXT: j .LBB7_3 ; RV32I-NEXT: .LBB7_2: -; RV32I-NEXT: sll a4, s1, t2 -; RV32I-NEXT: sub a5, t4, t2 -; RV32I-NEXT: srli s0, t6, 1 -; RV32I-NEXT: srl a5, s0, a5 -; RV32I-NEXT: or s0, a4, a5 +; RV32I-NEXT: sll a3, t5, t1 +; RV32I-NEXT: sub a4, t4, t1 +; RV32I-NEXT: srli a5, t2, 1 +; RV32I-NEXT: srl a4, a5, a4 +; RV32I-NEXT: or s0, a3, a4 ; RV32I-NEXT: .LBB7_3: ; RV32I-NEXT: lw a5, 4(a1) -; RV32I-NEXT: addi a7, a2, -32 -; RV32I-NEXT: bgez a7, .LBB7_5 +; RV32I-NEXT: addi a3, a2, -32 +; RV32I-NEXT: bgez a3, .LBB7_5 ; RV32I-NEXT: # %bb.4: ; RV32I-NEXT: srl a4, a5, a2 ; RV32I-NEXT: or s0, s0, a4 ; RV32I-NEXT: .LBB7_5: ; RV32I-NEXT: addi t3, a2, -64 -; RV32I-NEXT: addi t5, a2, -96 -; RV32I-NEXT: srai t1, s1, 31 -; RV32I-NEXT: bltz t5, .LBB7_7 +; RV32I-NEXT: addi a4, a2, -96 +; RV32I-NEXT: srai a7, t5, 31 +; RV32I-NEXT: bltz a4, .LBB7_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: mv a4, t1 +; RV32I-NEXT: mv t0, a7 ; RV32I-NEXT: bgeu a2, a6, .LBB7_8 ; RV32I-NEXT: j .LBB7_9 ; RV32I-NEXT: .LBB7_7: -; RV32I-NEXT: sra a4, s1, t3 +; RV32I-NEXT: sra t0, t5, t3 ; RV32I-NEXT: bltu a2, a6, .LBB7_9 ; RV32I-NEXT: .LBB7_8: -; RV32I-NEXT: mv s0, a4 +; RV32I-NEXT: mv s0, t0 ; RV32I-NEXT: .LBB7_9: ; RV32I-NEXT: mv t0, a5 ; RV32I-NEXT: beqz a2, .LBB7_11 ; RV32I-NEXT: # %bb.10: ; RV32I-NEXT: mv t0, s0 ; RV32I-NEXT: .LBB7_11: -; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: lw s1, 0(a1) ; RV32I-NEXT: sub t4, t4, a2 -; RV32I-NEXT: bltz a7, .LBB7_13 +; RV32I-NEXT: bltz a3, .LBB7_13 ; RV32I-NEXT: # %bb.12: -; RV32I-NEXT: srl a5, a5, a7 -; RV32I-NEXT: bltz a3, .LBB7_14 +; RV32I-NEXT: srl a5, a5, a3 +; RV32I-NEXT: bltz t6, .LBB7_14 ; RV32I-NEXT: j .LBB7_15 ; RV32I-NEXT: .LBB7_13: -; RV32I-NEXT: srl a4, a1, a2 +; RV32I-NEXT: srl s0, s1, a2 ; RV32I-NEXT: slli a5, a5, 1 ; RV32I-NEXT: sll a5, a5, t4 -; RV32I-NEXT: or a5, a4, a5 -; RV32I-NEXT: bgez a3, .LBB7_15 +; RV32I-NEXT: or a5, s0, a5 +; RV32I-NEXT: bgez t6, .LBB7_15 ; RV32I-NEXT: .LBB7_14: -; RV32I-NEXT: sll a3, t6, t2 -; RV32I-NEXT: or a5, a5, a3 +; RV32I-NEXT: sll s0, t2, t1 +; RV32I-NEXT: or a5, a5, s0 ; RV32I-NEXT: .LBB7_15: -; RV32I-NEXT: slli a3, s1, 1 -; RV32I-NEXT: bltz t5, .LBB7_17 +; RV32I-NEXT: slli s0, t5, 1 +; RV32I-NEXT: bltz a4, .LBB7_17 ; RV32I-NEXT: # %bb.16: -; RV32I-NEXT: sra s0, s1, t5 +; RV32I-NEXT: sra a4, t5, a4 ; RV32I-NEXT: bgeu a2, a6, .LBB7_18 ; RV32I-NEXT: j .LBB7_19 ; RV32I-NEXT: .LBB7_17: ; RV32I-NEXT: addi a4, zero, 95 ; RV32I-NEXT: sub a4, a4, a2 -; RV32I-NEXT: sll a4, a3, a4 -; RV32I-NEXT: srl s0, t6, t3 -; RV32I-NEXT: or s0, s0, a4 +; RV32I-NEXT: sll a4, s0, 
a4 +; RV32I-NEXT: srl a1, t2, t3 +; RV32I-NEXT: or a4, a1, a4 ; RV32I-NEXT: bltu a2, a6, .LBB7_19 ; RV32I-NEXT: .LBB7_18: -; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: .LBB7_19: ; RV32I-NEXT: bnez a2, .LBB7_22 ; RV32I-NEXT: # %bb.20: -; RV32I-NEXT: bltz a7, .LBB7_23 +; RV32I-NEXT: bltz a3, .LBB7_23 ; RV32I-NEXT: .LBB7_21: -; RV32I-NEXT: sra a3, s1, a7 +; RV32I-NEXT: sra a4, t5, a3 ; RV32I-NEXT: bgeu a2, a6, .LBB7_24 ; RV32I-NEXT: j .LBB7_25 ; RV32I-NEXT: .LBB7_22: -; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: bgez a7, .LBB7_21 +; RV32I-NEXT: mv s1, a5 +; RV32I-NEXT: bgez a3, .LBB7_21 ; RV32I-NEXT: .LBB7_23: -; RV32I-NEXT: srl a4, t6, a2 -; RV32I-NEXT: sll a3, a3, t4 -; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: srl a1, t2, a2 +; RV32I-NEXT: sll a4, s0, t4 +; RV32I-NEXT: or a4, a1, a4 ; RV32I-NEXT: bltu a2, a6, .LBB7_25 ; RV32I-NEXT: .LBB7_24: -; RV32I-NEXT: mv a3, t1 +; RV32I-NEXT: mv a4, a7 ; RV32I-NEXT: .LBB7_25: -; RV32I-NEXT: bltz a7, .LBB7_27 +; RV32I-NEXT: bltz a3, .LBB7_27 ; RV32I-NEXT: # %bb.26: -; RV32I-NEXT: mv a4, t1 +; RV32I-NEXT: mv a3, a7 ; RV32I-NEXT: bgeu a2, a6, .LBB7_28 ; RV32I-NEXT: j .LBB7_29 ; RV32I-NEXT: .LBB7_27: -; RV32I-NEXT: sra a4, s1, a2 +; RV32I-NEXT: sra a3, t5, a2 ; RV32I-NEXT: bltu a2, a6, .LBB7_29 ; RV32I-NEXT: .LBB7_28: -; RV32I-NEXT: mv a4, t1 +; RV32I-NEXT: mv a3, a7 ; RV32I-NEXT: .LBB7_29: -; RV32I-NEXT: sw a4, 12(a0) -; RV32I-NEXT: sw a3, 8(a0) -; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw s1, 0(a0) ; RV32I-NEXT: sw t0, 4(a0) ; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload @@ -431,114 +431,114 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: lw t1, 4(a1) -; RV32I-NEXT: lw t6, 0(a1) +; RV32I-NEXT: lw t0, 4(a1) +; RV32I-NEXT: lw t4, 0(a1) ; RV32I-NEXT: addi a6, zero, 64 -; RV32I-NEXT: sub t5, a6, a2 +; RV32I-NEXT: sub t1, a6, a2 ; RV32I-NEXT: addi a3, zero, 32 -; RV32I-NEXT: sub s0, a3, a2 +; RV32I-NEXT: sub t5, a3, a2 ; RV32I-NEXT: addi t2, zero, 31 -; RV32I-NEXT: bltz s0, .LBB8_2 +; RV32I-NEXT: bltz t5, .LBB8_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srl a4, t1, s0 +; RV32I-NEXT: srl a3, t0, t5 ; RV32I-NEXT: j .LBB8_3 ; RV32I-NEXT: .LBB8_2: -; RV32I-NEXT: srl a3, t6, t5 -; RV32I-NEXT: sub a4, t2, t5 -; RV32I-NEXT: slli a5, t1, 1 +; RV32I-NEXT: srl a3, t4, t1 +; RV32I-NEXT: sub a4, t2, t1 +; RV32I-NEXT: slli a5, t0, 1 ; RV32I-NEXT: sll a4, a5, a4 -; RV32I-NEXT: or a4, a3, a4 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: .LBB8_3: -; RV32I-NEXT: lw a3, 8(a1) -; RV32I-NEXT: addi t0, a2, -32 -; RV32I-NEXT: bgez t0, .LBB8_5 +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: addi t6, a2, -32 +; RV32I-NEXT: bgez t6, .LBB8_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: sll a5, a3, a2 -; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: sll a4, a5, a2 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: .LBB8_5: -; RV32I-NEXT: addi t4, a2, -96 +; RV32I-NEXT: addi a4, a2, -96 ; RV32I-NEXT: addi t3, a2, -64 -; RV32I-NEXT: bltz t4, .LBB8_7 +; RV32I-NEXT: bltz a4, .LBB8_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: mv a5, zero +; RV32I-NEXT: mv a7, zero ; RV32I-NEXT: bgeu a2, a6, .LBB8_8 ; RV32I-NEXT: j .LBB8_9 ; RV32I-NEXT: .LBB8_7: -; RV32I-NEXT: sll a5, t6, t3 +; RV32I-NEXT: sll a7, t4, t3 ; RV32I-NEXT: bltu a2, a6, .LBB8_9 ; RV32I-NEXT: .LBB8_8: -; RV32I-NEXT: mv a4, a5 +; RV32I-NEXT: mv a3, a7 ; RV32I-NEXT: .LBB8_9: -; RV32I-NEXT: mv a7, a3 +; RV32I-NEXT: mv a7, 
a5 ; RV32I-NEXT: beqz a2, .LBB8_11 ; RV32I-NEXT: # %bb.10: -; RV32I-NEXT: mv a7, a4 +; RV32I-NEXT: mv a7, a3 ; RV32I-NEXT: .LBB8_11: -; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: lw s0, 12(a1) ; RV32I-NEXT: sub t2, t2, a2 -; RV32I-NEXT: bltz t0, .LBB8_13 +; RV32I-NEXT: bltz t6, .LBB8_13 ; RV32I-NEXT: # %bb.12: -; RV32I-NEXT: sll a3, a3, t0 -; RV32I-NEXT: bltz s0, .LBB8_14 +; RV32I-NEXT: sll a5, a5, t6 +; RV32I-NEXT: bltz t5, .LBB8_14 ; RV32I-NEXT: j .LBB8_15 ; RV32I-NEXT: .LBB8_13: -; RV32I-NEXT: sll a4, a1, a2 -; RV32I-NEXT: srli a3, a3, 1 -; RV32I-NEXT: srl a3, a3, t2 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: bgez s0, .LBB8_15 +; RV32I-NEXT: sll a3, s0, a2 +; RV32I-NEXT: srli a5, a5, 1 +; RV32I-NEXT: srl a5, a5, t2 +; RV32I-NEXT: or a5, a3, a5 +; RV32I-NEXT: bgez t5, .LBB8_15 ; RV32I-NEXT: .LBB8_14: -; RV32I-NEXT: srl a4, t1, t5 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srl a3, t0, t1 +; RV32I-NEXT: or a5, a5, a3 ; RV32I-NEXT: .LBB8_15: -; RV32I-NEXT: srli a4, t6, 1 -; RV32I-NEXT: bltz t4, .LBB8_17 +; RV32I-NEXT: srli a3, t4, 1 +; RV32I-NEXT: bltz a4, .LBB8_17 ; RV32I-NEXT: # %bb.16: -; RV32I-NEXT: sll a5, t6, t4 +; RV32I-NEXT: sll a4, t4, a4 ; RV32I-NEXT: bgeu a2, a6, .LBB8_18 ; RV32I-NEXT: j .LBB8_19 ; RV32I-NEXT: .LBB8_17: -; RV32I-NEXT: addi a5, zero, 95 -; RV32I-NEXT: sub a5, a5, a2 -; RV32I-NEXT: srl a5, a4, a5 -; RV32I-NEXT: sll s0, t1, t3 -; RV32I-NEXT: or a5, s0, a5 +; RV32I-NEXT: addi a4, zero, 95 +; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: srl a4, a3, a4 +; RV32I-NEXT: sll a1, t0, t3 +; RV32I-NEXT: or a4, a1, a4 ; RV32I-NEXT: bltu a2, a6, .LBB8_19 ; RV32I-NEXT: .LBB8_18: -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: .LBB8_19: ; RV32I-NEXT: bnez a2, .LBB8_22 ; RV32I-NEXT: # %bb.20: -; RV32I-NEXT: bltz t0, .LBB8_23 +; RV32I-NEXT: bltz t6, .LBB8_23 ; RV32I-NEXT: .LBB8_21: -; RV32I-NEXT: sll a3, t6, t0 +; RV32I-NEXT: sll a3, t4, t6 ; RV32I-NEXT: bgeu a2, a6, .LBB8_24 ; RV32I-NEXT: j .LBB8_25 ; RV32I-NEXT: .LBB8_22: -; RV32I-NEXT: mv a1, a3 -; RV32I-NEXT: bgez t0, .LBB8_21 +; RV32I-NEXT: mv s0, a5 +; RV32I-NEXT: bgez t6, .LBB8_21 ; RV32I-NEXT: .LBB8_23: -; RV32I-NEXT: sll a3, t1, a2 -; RV32I-NEXT: srl a4, a4, t2 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: sll a1, t0, a2 +; RV32I-NEXT: srl a3, a3, t2 +; RV32I-NEXT: or a3, a1, a3 ; RV32I-NEXT: bltu a2, a6, .LBB8_25 ; RV32I-NEXT: .LBB8_24: ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: .LBB8_25: -; RV32I-NEXT: bltz t0, .LBB8_27 +; RV32I-NEXT: bltz t6, .LBB8_27 ; RV32I-NEXT: # %bb.26: ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: bgeu a2, a6, .LBB8_28 ; RV32I-NEXT: j .LBB8_29 ; RV32I-NEXT: .LBB8_27: -; RV32I-NEXT: sll a4, t6, a2 +; RV32I-NEXT: sll a4, t4, a2 ; RV32I-NEXT: bltu a2, a6, .LBB8_29 ; RV32I-NEXT: .LBB8_28: ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: .LBB8_29: ; RV32I-NEXT: sw a4, 0(a0) ; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw s0, 12(a0) ; RV32I-NEXT: sw a7, 8(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index bf7cbd1..42a7ee5 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -270,41 +270,41 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: fold_srem_vec_2: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a7, 12(a1) +; RV32IM-NEXT: lh a6, 12(a1) ; RV32IM-NEXT: lh a3, 8(a1) ; RV32IM-NEXT: lh a4, 0(a1) ; RV32IM-NEXT: lh a1, 4(a1) ; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: 
addi a6, a5, 389 -; RV32IM-NEXT: mulh a5, a4, a6 -; RV32IM-NEXT: add a5, a5, a4 -; RV32IM-NEXT: srli a2, a5, 31 -; RV32IM-NEXT: srli a5, a5, 6 -; RV32IM-NEXT: add a2, a5, a2 -; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a2, a4, a5 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a7, a2, 31 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: add a2, a2, a7 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, a2, a7 ; RV32IM-NEXT: sub t0, a4, a2 -; RV32IM-NEXT: mulh a4, a1, a6 +; RV32IM-NEXT: mulh a4, a1, a5 ; RV32IM-NEXT: add a4, a4, a1 ; RV32IM-NEXT: srli a2, a4, 31 ; RV32IM-NEXT: srli a4, a4, 6 ; RV32IM-NEXT: add a2, a4, a2 -; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: mul a2, a2, a7 ; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: mulh a2, a3, a6 +; RV32IM-NEXT: mulh a2, a3, a5 ; RV32IM-NEXT: add a2, a2, a3 ; RV32IM-NEXT: srli a4, a2, 31 ; RV32IM-NEXT: srli a2, a2, 6 ; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: mul a2, a2, a7 ; RV32IM-NEXT: sub a2, a3, a2 -; RV32IM-NEXT: mulh a3, a7, a6 -; RV32IM-NEXT: add a3, a3, a7 +; RV32IM-NEXT: mulh a3, a6, a5 +; RV32IM-NEXT: add a3, a3, a6 ; RV32IM-NEXT: srli a4, a3, 31 ; RV32IM-NEXT: srli a3, a3, 6 ; RV32IM-NEXT: add a3, a3, a4 -; RV32IM-NEXT: mul a3, a3, a5 -; RV32IM-NEXT: sub a3, a7, a3 +; RV32IM-NEXT: mul a3, a3, a7 +; RV32IM-NEXT: sub a3, a6, a3 ; RV32IM-NEXT: sh a3, 6(a0) ; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) @@ -357,8 +357,8 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_srem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a7, 24(a1) -; RV64IM-NEXT: lh a3, 16(a1) +; RV64IM-NEXT: lh a6, 24(a1) +; RV64IM-NEXT: lh a7, 16(a1) ; RV64IM-NEXT: lh a4, 8(a1) ; RV64IM-NEXT: lh a1, 0(a1) ; RV64IM-NEXT: lui a5, 1045903 @@ -368,36 +368,36 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: slli a5, a5, 12 ; RV64IM-NEXT: addi a5, a5, -905 ; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a6, a5, -1767 -; RV64IM-NEXT: mulh a5, a1, a6 -; RV64IM-NEXT: add a5, a5, a1 -; RV64IM-NEXT: srli a2, a5, 63 -; RV64IM-NEXT: srli a5, a5, 6 -; RV64IM-NEXT: addw a2, a5, a2 -; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mulw a2, a2, a5 +; RV64IM-NEXT: addi a5, a5, -1767 +; RV64IM-NEXT: mulh a2, a1, a5 +; RV64IM-NEXT: add a2, a2, a1 +; RV64IM-NEXT: srli a3, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: addw a2, a2, a3 +; RV64IM-NEXT: addi a3, zero, 95 +; RV64IM-NEXT: mulw a2, a2, a3 ; RV64IM-NEXT: subw t0, a1, a2 -; RV64IM-NEXT: mulh a2, a4, a6 +; RV64IM-NEXT: mulh a2, a4, a5 ; RV64IM-NEXT: add a2, a2, a4 ; RV64IM-NEXT: srli a1, a2, 63 ; RV64IM-NEXT: srli a2, a2, 6 ; RV64IM-NEXT: addw a1, a2, a1 -; RV64IM-NEXT: mulw a1, a1, a5 +; RV64IM-NEXT: mulw a1, a1, a3 ; RV64IM-NEXT: subw a1, a4, a1 -; RV64IM-NEXT: mulh a2, a3, a6 -; RV64IM-NEXT: add a2, a2, a3 +; RV64IM-NEXT: mulh a2, a7, a5 +; RV64IM-NEXT: add a2, a2, a7 ; RV64IM-NEXT: srli a4, a2, 63 ; RV64IM-NEXT: srli a2, a2, 6 ; RV64IM-NEXT: addw a2, a2, a4 -; RV64IM-NEXT: mulw a2, a2, a5 -; RV64IM-NEXT: subw a2, a3, a2 -; RV64IM-NEXT: mulh a3, a7, a6 -; RV64IM-NEXT: add a3, a3, a7 -; RV64IM-NEXT: srli a4, a3, 63 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: addw a3, a3, a4 -; RV64IM-NEXT: mulw a3, a3, a5 -; RV64IM-NEXT: subw a3, a7, a3 +; RV64IM-NEXT: mulw a2, a2, a3 +; RV64IM-NEXT: subw a2, a7, a2 +; RV64IM-NEXT: mulh a4, a6, a5 +; RV64IM-NEXT: add a4, a4, a6 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 6 +; 
RV64IM-NEXT: addw a4, a4, a5 +; RV64IM-NEXT: mulw a3, a4, a3 +; RV64IM-NEXT: subw a3, a6, a3 ; RV64IM-NEXT: sh a3, 6(a0) ; RV64IM-NEXT: sh a2, 4(a0) ; RV64IM-NEXT: sh a1, 2(a0) @@ -484,49 +484,49 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: combine_srem_sdiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a7, 0(a1) +; RV32IM-NEXT: lh a6, 0(a1) ; RV32IM-NEXT: lh a3, 4(a1) ; RV32IM-NEXT: lh a4, 12(a1) ; RV32IM-NEXT: lh a1, 8(a1) ; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a6, a5, 389 -; RV32IM-NEXT: mulh a5, a4, a6 -; RV32IM-NEXT: add a5, a5, a4 -; RV32IM-NEXT: srli a2, a5, 31 -; RV32IM-NEXT: srai a5, a5, 6 -; RV32IM-NEXT: add t3, a5, a2 -; RV32IM-NEXT: addi t0, zero, 95 -; RV32IM-NEXT: mul a5, t3, t0 -; RV32IM-NEXT: sub t1, a4, a5 -; RV32IM-NEXT: mulh a5, a1, a6 -; RV32IM-NEXT: add a5, a5, a1 -; RV32IM-NEXT: srli a4, a5, 31 -; RV32IM-NEXT: srai a5, a5, 6 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a2, a4, a5 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a7, a2, 31 +; RV32IM-NEXT: srai a2, a2, 6 +; RV32IM-NEXT: add t0, a2, a7 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, t0, a7 +; RV32IM-NEXT: sub t1, a4, a2 +; RV32IM-NEXT: mulh a4, a1, a5 +; RV32IM-NEXT: add a4, a4, a1 +; RV32IM-NEXT: srli a2, a4, 31 +; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: mul a4, a2, a7 +; RV32IM-NEXT: sub t2, a1, a4 +; RV32IM-NEXT: mulh a4, a3, a5 +; RV32IM-NEXT: add a4, a4, a3 +; RV32IM-NEXT: srli a1, a4, 31 +; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: add a1, a4, a1 +; RV32IM-NEXT: mul a4, a1, a7 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: mulh a4, a6, a5 +; RV32IM-NEXT: add a4, a4, a6 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: mul a5, a4, a7 +; RV32IM-NEXT: sub a5, a6, a5 ; RV32IM-NEXT: add a4, a5, a4 -; RV32IM-NEXT: mul a5, a4, t0 -; RV32IM-NEXT: sub t2, a1, a5 -; RV32IM-NEXT: mulh a5, a3, a6 -; RV32IM-NEXT: add a5, a5, a3 -; RV32IM-NEXT: srli a1, a5, 31 -; RV32IM-NEXT: srai a5, a5, 6 -; RV32IM-NEXT: add a1, a5, a1 -; RV32IM-NEXT: mul a5, a1, t0 -; RV32IM-NEXT: sub a3, a3, a5 -; RV32IM-NEXT: mulh a5, a7, a6 -; RV32IM-NEXT: add a5, a5, a7 -; RV32IM-NEXT: srli a2, a5, 31 -; RV32IM-NEXT: srai a5, a5, 6 -; RV32IM-NEXT: add a2, a5, a2 -; RV32IM-NEXT: mul a5, a2, t0 -; RV32IM-NEXT: sub a5, a7, a5 -; RV32IM-NEXT: add a2, a5, a2 ; RV32IM-NEXT: add a1, a3, a1 -; RV32IM-NEXT: add a3, t2, a4 -; RV32IM-NEXT: add a4, t1, t3 -; RV32IM-NEXT: sh a4, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: add a2, t2, a2 +; RV32IM-NEXT: add a3, t1, t0 +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a4, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_srem_sdiv: @@ -603,8 +603,8 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_srem_sdiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a7, 0(a1) -; RV64IM-NEXT: lh a3, 8(a1) +; RV64IM-NEXT: lh a6, 0(a1) +; RV64IM-NEXT: lh a7, 8(a1) ; RV64IM-NEXT: lh a4, 16(a1) ; RV64IM-NEXT: lh a1, 24(a1) ; RV64IM-NEXT: lui a5, 1045903 @@ -614,38 +614,38 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV64IM-NEXT: slli a5, a5, 12 ; RV64IM-NEXT: addi a5, a5, -905 ; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a6, a5, -1767 -; RV64IM-NEXT: mulh a5, a1, a6 -; RV64IM-NEXT: add a5, a5, a1 -; RV64IM-NEXT: srli a2, a5, 63 -; RV64IM-NEXT: srai a5, a5, 6 -; RV64IM-NEXT: addw t3, a5, a2 +; RV64IM-NEXT: addi a5, 
a5, -1767 +; RV64IM-NEXT: mulh a2, a1, a5 +; RV64IM-NEXT: add a2, a2, a1 +; RV64IM-NEXT: srli a3, a2, 63 +; RV64IM-NEXT: srai a2, a2, 6 +; RV64IM-NEXT: addw t3, a2, a3 ; RV64IM-NEXT: addi t0, zero, 95 -; RV64IM-NEXT: mulw a5, t3, t0 -; RV64IM-NEXT: subw t1, a1, a5 -; RV64IM-NEXT: mulh a5, a4, a6 -; RV64IM-NEXT: add a5, a5, a4 -; RV64IM-NEXT: srli a1, a5, 63 -; RV64IM-NEXT: srai a5, a5, 6 -; RV64IM-NEXT: addw a1, a5, a1 -; RV64IM-NEXT: mulw a5, a1, t0 -; RV64IM-NEXT: subw t2, a4, a5 -; RV64IM-NEXT: mulh a5, a3, a6 -; RV64IM-NEXT: add a5, a5, a3 -; RV64IM-NEXT: srli a4, a5, 63 -; RV64IM-NEXT: srai a5, a5, 6 -; RV64IM-NEXT: addw a4, a5, a4 -; RV64IM-NEXT: mulw a5, a4, t0 -; RV64IM-NEXT: subw a3, a3, a5 -; RV64IM-NEXT: mulh a5, a7, a6 -; RV64IM-NEXT: add a5, a5, a7 +; RV64IM-NEXT: mulw a3, t3, t0 +; RV64IM-NEXT: subw t1, a1, a3 +; RV64IM-NEXT: mulh a3, a4, a5 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: srli a1, a3, 63 +; RV64IM-NEXT: srai a3, a3, 6 +; RV64IM-NEXT: addw a1, a3, a1 +; RV64IM-NEXT: mulw a3, a1, t0 +; RV64IM-NEXT: subw t2, a4, a3 +; RV64IM-NEXT: mulh a4, a7, a5 +; RV64IM-NEXT: add a4, a4, a7 +; RV64IM-NEXT: srli a3, a4, 63 +; RV64IM-NEXT: srai a4, a4, 6 +; RV64IM-NEXT: addw a3, a4, a3 +; RV64IM-NEXT: mulw a4, a3, t0 +; RV64IM-NEXT: subw a4, a7, a4 +; RV64IM-NEXT: mulh a5, a6, a5 +; RV64IM-NEXT: add a5, a5, a6 ; RV64IM-NEXT: srli a2, a5, 63 ; RV64IM-NEXT: srai a5, a5, 6 ; RV64IM-NEXT: addw a2, a5, a2 ; RV64IM-NEXT: mulw a5, a2, t0 -; RV64IM-NEXT: subw a5, a7, a5 +; RV64IM-NEXT: subw a5, a6, a5 ; RV64IM-NEXT: addw a2, a5, a2 -; RV64IM-NEXT: addw a3, a3, a4 +; RV64IM-NEXT: addw a3, a4, a3 ; RV64IM-NEXT: addw a1, t2, a1 ; RV64IM-NEXT: addw a4, t1, t3 ; RV64IM-NEXT: sh a4, 6(a0) diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index 06d76ea..17b68b9 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -261,41 +261,41 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: fold_urem_vec_2: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a7, 12(a1) -; RV32IM-NEXT: lhu a3, 8(a1) +; RV32IM-NEXT: lhu a6, 12(a1) +; RV32IM-NEXT: lhu a7, 8(a1) ; RV32IM-NEXT: lhu a4, 0(a1) ; RV32IM-NEXT: lhu a1, 4(a1) ; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a6, a5, 777 -; RV32IM-NEXT: mulhu a5, a4, a6 -; RV32IM-NEXT: sub a2, a4, a5 -; RV32IM-NEXT: srli a2, a2, 1 -; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a2, a4, a5 +; RV32IM-NEXT: sub a3, a4, a2 +; RV32IM-NEXT: srli a3, a3, 1 +; RV32IM-NEXT: add a2, a3, a2 ; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: addi a3, zero, 95 +; RV32IM-NEXT: mul a2, a2, a3 ; RV32IM-NEXT: sub t0, a4, a2 -; RV32IM-NEXT: mulhu a4, a1, a6 +; RV32IM-NEXT: mulhu a4, a1, a5 ; RV32IM-NEXT: sub a2, a1, a4 ; RV32IM-NEXT: srli a2, a2, 1 ; RV32IM-NEXT: add a2, a2, a4 ; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: mul a2, a2, a3 ; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: mulhu a2, a3, a6 -; RV32IM-NEXT: sub a4, a3, a2 +; RV32IM-NEXT: mulhu a2, a7, a5 +; RV32IM-NEXT: sub a4, a7, a2 ; RV32IM-NEXT: srli a4, a4, 1 ; RV32IM-NEXT: add a2, a4, a2 ; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: sub a2, a3, a2 -; RV32IM-NEXT: mulhu a3, a7, a6 -; RV32IM-NEXT: sub a4, a7, a3 -; RV32IM-NEXT: srli a4, a4, 1 -; RV32IM-NEXT: add a3, a4, a3 -; RV32IM-NEXT: srli a3, a3, 6 -; RV32IM-NEXT: mul a3, a3, a5 -; 
RV32IM-NEXT: sub a3, a7, a3 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: sub a2, a7, a2 +; RV32IM-NEXT: mulhu a4, a6, a5 +; RV32IM-NEXT: sub a5, a6, a4 +; RV32IM-NEXT: srli a5, a5, 1 +; RV32IM-NEXT: add a4, a5, a4 +; RV32IM-NEXT: srli a4, a4, 6 +; RV32IM-NEXT: mul a3, a4, a3 +; RV32IM-NEXT: sub a3, a6, a3 ; RV32IM-NEXT: sh a3, 6(a0) ; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) @@ -348,8 +348,8 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_urem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a7, 24(a1) -; RV64IM-NEXT: lhu a3, 16(a1) +; RV64IM-NEXT: lhu a6, 24(a1) +; RV64IM-NEXT: lhu a7, 16(a1) ; RV64IM-NEXT: lhu a4, 8(a1) ; RV64IM-NEXT: lhu a1, 0(a1) ; RV64IM-NEXT: lui a5, 1423 @@ -359,36 +359,36 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: slli a5, a5, 13 ; RV64IM-NEXT: addi a5, a5, -1811 ; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a6, a5, 561 -; RV64IM-NEXT: mulhu a5, a1, a6 -; RV64IM-NEXT: sub a2, a1, a5 -; RV64IM-NEXT: srli a2, a2, 1 -; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a2, a1, a5 +; RV64IM-NEXT: sub a3, a1, a2 +; RV64IM-NEXT: srli a3, a3, 1 +; RV64IM-NEXT: add a2, a3, a2 ; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mulw a2, a2, a5 +; RV64IM-NEXT: addi a3, zero, 95 +; RV64IM-NEXT: mulw a2, a2, a3 ; RV64IM-NEXT: subw t0, a1, a2 -; RV64IM-NEXT: mulhu a2, a4, a6 +; RV64IM-NEXT: mulhu a2, a4, a5 ; RV64IM-NEXT: sub a1, a4, a2 ; RV64IM-NEXT: srli a1, a1, 1 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 6 -; RV64IM-NEXT: mulw a1, a1, a5 +; RV64IM-NEXT: mulw a1, a1, a3 ; RV64IM-NEXT: subw a1, a4, a1 -; RV64IM-NEXT: mulhu a2, a3, a6 -; RV64IM-NEXT: sub a4, a3, a2 +; RV64IM-NEXT: mulhu a2, a7, a5 +; RV64IM-NEXT: sub a4, a7, a2 ; RV64IM-NEXT: srli a4, a4, 1 ; RV64IM-NEXT: add a2, a4, a2 ; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: mulw a2, a2, a5 -; RV64IM-NEXT: subw a2, a3, a2 -; RV64IM-NEXT: mulhu a3, a7, a6 -; RV64IM-NEXT: sub a4, a7, a3 -; RV64IM-NEXT: srli a4, a4, 1 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: mulw a3, a3, a5 -; RV64IM-NEXT: subw a3, a7, a3 +; RV64IM-NEXT: mulw a2, a2, a3 +; RV64IM-NEXT: subw a2, a7, a2 +; RV64IM-NEXT: mulhu a4, a6, a5 +; RV64IM-NEXT: sub a5, a6, a4 +; RV64IM-NEXT: srli a5, a5, 1 +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: srli a4, a4, 6 +; RV64IM-NEXT: mulw a3, a4, a3 +; RV64IM-NEXT: subw a3, a6, a3 ; RV64IM-NEXT: sh a3, 6(a0) ; RV64IM-NEXT: sh a2, 4(a0) ; RV64IM-NEXT: sh a1, 2(a0) @@ -475,44 +475,44 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: combine_urem_udiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a7, 0(a1) -; RV32IM-NEXT: lhu a3, 4(a1) +; RV32IM-NEXT: lhu a6, 0(a1) +; RV32IM-NEXT: lhu a7, 4(a1) ; RV32IM-NEXT: lhu a4, 12(a1) ; RV32IM-NEXT: lhu a1, 8(a1) ; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a6, a5, 777 -; RV32IM-NEXT: mulhu a5, a4, a6 -; RV32IM-NEXT: sub a2, a4, a5 -; RV32IM-NEXT: srli a2, a2, 1 -; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a2, a4, a5 +; RV32IM-NEXT: sub a3, a4, a2 +; RV32IM-NEXT: srli a3, a3, 1 +; RV32IM-NEXT: add a2, a3, a2 ; RV32IM-NEXT: srli t3, a2, 6 ; RV32IM-NEXT: addi t0, zero, 95 -; RV32IM-NEXT: mul a5, t3, t0 -; RV32IM-NEXT: sub t1, a4, a5 -; RV32IM-NEXT: mulhu a5, a1, a6 -; RV32IM-NEXT: sub a4, a1, a5 -; RV32IM-NEXT: srli a4, a4, 1 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: srli a4, a4, 6 -; RV32IM-NEXT: mul a5, a4, 
t0 -; RV32IM-NEXT: sub t2, a1, a5 -; RV32IM-NEXT: mulhu a5, a3, a6 -; RV32IM-NEXT: sub a1, a3, a5 +; RV32IM-NEXT: mul a3, t3, t0 +; RV32IM-NEXT: sub t1, a4, a3 +; RV32IM-NEXT: mulhu a4, a1, a5 +; RV32IM-NEXT: sub a3, a1, a4 +; RV32IM-NEXT: srli a3, a3, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: srli a3, a3, 6 +; RV32IM-NEXT: mul a4, a3, t0 +; RV32IM-NEXT: sub t2, a1, a4 +; RV32IM-NEXT: mulhu a4, a7, a5 +; RV32IM-NEXT: sub a1, a7, a4 ; RV32IM-NEXT: srli a1, a1, 1 -; RV32IM-NEXT: add a1, a1, a5 +; RV32IM-NEXT: add a1, a1, a4 ; RV32IM-NEXT: srli a1, a1, 6 -; RV32IM-NEXT: mul a5, a1, t0 -; RV32IM-NEXT: sub a3, a3, a5 -; RV32IM-NEXT: mulhu a5, a7, a6 -; RV32IM-NEXT: sub a2, a7, a5 +; RV32IM-NEXT: mul a4, a1, t0 +; RV32IM-NEXT: sub a4, a7, a4 +; RV32IM-NEXT: mulhu a5, a6, a5 +; RV32IM-NEXT: sub a2, a6, a5 ; RV32IM-NEXT: srli a2, a2, 1 ; RV32IM-NEXT: add a2, a2, a5 ; RV32IM-NEXT: srli a2, a2, 6 ; RV32IM-NEXT: mul a5, a2, t0 -; RV32IM-NEXT: sub a5, a7, a5 +; RV32IM-NEXT: sub a5, a6, a5 ; RV32IM-NEXT: add a2, a5, a2 -; RV32IM-NEXT: add a1, a3, a1 -; RV32IM-NEXT: add a3, t2, a4 +; RV32IM-NEXT: add a1, a4, a1 +; RV32IM-NEXT: add a3, t2, a3 ; RV32IM-NEXT: add a4, t1, t3 ; RV32IM-NEXT: sh a4, 6(a0) ; RV32IM-NEXT: sh a3, 4(a0) @@ -594,8 +594,8 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_urem_udiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a7, 0(a1) -; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a6, 0(a1) +; RV64IM-NEXT: lhu a7, 8(a1) ; RV64IM-NEXT: lhu a4, 16(a1) ; RV64IM-NEXT: lhu a1, 24(a1) ; RV64IM-NEXT: lui a5, 1423 @@ -605,38 +605,38 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV64IM-NEXT: slli a5, a5, 13 ; RV64IM-NEXT: addi a5, a5, -1811 ; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a6, a5, 561 -; RV64IM-NEXT: mulhu a5, a1, a6 -; RV64IM-NEXT: sub a2, a1, a5 -; RV64IM-NEXT: srli a2, a2, 1 -; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a2, a1, a5 +; RV64IM-NEXT: sub a3, a1, a2 +; RV64IM-NEXT: srli a3, a3, 1 +; RV64IM-NEXT: add a2, a3, a2 ; RV64IM-NEXT: srli t3, a2, 6 ; RV64IM-NEXT: addi t0, zero, 95 -; RV64IM-NEXT: mulw a5, t3, t0 -; RV64IM-NEXT: subw t1, a1, a5 -; RV64IM-NEXT: mulhu a5, a4, a6 -; RV64IM-NEXT: sub a1, a4, a5 +; RV64IM-NEXT: mulw a3, t3, t0 +; RV64IM-NEXT: subw t1, a1, a3 +; RV64IM-NEXT: mulhu a3, a4, a5 +; RV64IM-NEXT: sub a1, a4, a3 ; RV64IM-NEXT: srli a1, a1, 1 -; RV64IM-NEXT: add a1, a1, a5 +; RV64IM-NEXT: add a1, a1, a3 ; RV64IM-NEXT: srli a1, a1, 6 -; RV64IM-NEXT: mulw a5, a1, t0 -; RV64IM-NEXT: subw t2, a4, a5 -; RV64IM-NEXT: mulhu a5, a3, a6 -; RV64IM-NEXT: sub a4, a3, a5 -; RV64IM-NEXT: srli a4, a4, 1 -; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: srli a4, a4, 6 -; RV64IM-NEXT: mulw a5, a4, t0 -; RV64IM-NEXT: subw a3, a3, a5 -; RV64IM-NEXT: mulhu a5, a7, a6 -; RV64IM-NEXT: sub a2, a7, a5 +; RV64IM-NEXT: mulw a3, a1, t0 +; RV64IM-NEXT: subw t2, a4, a3 +; RV64IM-NEXT: mulhu a4, a7, a5 +; RV64IM-NEXT: sub a3, a7, a4 +; RV64IM-NEXT: srli a3, a3, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: srli a3, a3, 6 +; RV64IM-NEXT: mulw a4, a3, t0 +; RV64IM-NEXT: subw a4, a7, a4 +; RV64IM-NEXT: mulhu a5, a6, a5 +; RV64IM-NEXT: sub a2, a6, a5 ; RV64IM-NEXT: srli a2, a2, 1 ; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: srli a2, a2, 6 ; RV64IM-NEXT: mulw a5, a2, t0 -; RV64IM-NEXT: subw a5, a7, a5 +; RV64IM-NEXT: subw a5, a6, a5 ; RV64IM-NEXT: addw a2, a5, a2 -; RV64IM-NEXT: addw a3, a3, a4 +; RV64IM-NEXT: addw a3, a4, a3 ; RV64IM-NEXT: addw a1, t2, a1 ; RV64IM-NEXT: addw a4, 
t1, t3 ; RV64IM-NEXT: sh a4, 6(a0) diff --git a/llvm/test/CodeGen/Thumb/dyn-stackalloc.ll b/llvm/test/CodeGen/Thumb/dyn-stackalloc.ll index 2e75510..c6b5c7b 100644 --- a/llvm/test/CodeGen/Thumb/dyn-stackalloc.ll +++ b/llvm/test/CodeGen/Thumb/dyn-stackalloc.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_GREEDY +; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_BASIC %struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* } %struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* } @@ -45,7 +45,8 @@ define void @t2(%struct.comment* %vc, i8* %tag, i8* %contents) { ; CHECK: sub sp, # ; CHECK: mov r[[R0:[0-9]+]], sp ; CHECK: str r{{[0-9+]}}, [r[[R0]] -; CHECK: str r{{[0-9+]}}, [r[[R0]] +; RA_GREEDY: str r{{[0-9+]}}, [r[[R0]] +; RA_BASIC: stm r[[R0]]! ; CHECK-NOT: ldr r0, [sp ; CHECK: mov r[[R1:[0-9]+]], sp ; CHECK: subs r[[R2:[0-9]+]], r[[R1]], r{{[0-9]+}} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll index be68cb3..7386594 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll @@ -10,7 +10,7 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { ; ENABLED-LABEL: check_option: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; ENABLED-NEXT: cmp r3, #1 ; ENABLED-NEXT: blt .LBB0_4 ; ENABLED-NEXT: @ %bb.1: @ %vector.ph.preheader @@ -32,11 +32,11 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias noca ; ENABLED-NEXT: letp lr, .LBB0_3 ; ENABLED-NEXT: b .LBB0_2 ; ENABLED-NEXT: .LBB0_4: @ %for.cond.cleanup -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, pc} ; ; DISABLED-LABEL: check_option: ; DISABLED: @ %bb.0: @ %entry -; DISABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; DISABLED-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; DISABLED-NEXT: cmp r3, #1 ; DISABLED-NEXT: blt .LBB0_4 ; DISABLED-NEXT: @ %bb.1: @ %vector.ph.preheader @@ -48,7 +48,7 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias noca ; DISABLED-NEXT: .LBB0_2: @ %vector.ph ; DISABLED-NEXT: @ =>This Loop Header: Depth=1 ; DISABLED-NEXT: @ Child Loop BB0_3 Depth 2 -; DISABLED-NEXT: mov r9, r8 +; DISABLED-NEXT: mov r7, r8 ; DISABLED-NEXT: mov r12, r0 ; DISABLED-NEXT: mov r4, r2 ; DISABLED-NEXT: mov r5, r1 @@ -57,9 +57,9 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias noca ; DISABLED-NEXT: .LBB0_3: @ %vector.body ; DISABLED-NEXT: @ Parent Loop BB0_2 Depth=1 ; DISABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; DISABLED-NEXT: mov lr, r9 +; DISABLED-NEXT: mov 
lr, r7 ; DISABLED-NEXT: vctp.32 r6 -; DISABLED-NEXT: sub.w r9, r9, #1 +; DISABLED-NEXT: subs r7, #1 ; DISABLED-NEXT: subs r6, #4 ; DISABLED-NEXT: vpstt ; DISABLED-NEXT: vldrwt.u32 q0, [r5], #16 @@ -70,7 +70,7 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias noca ; DISABLED-NEXT: le lr, .LBB0_3 ; DISABLED-NEXT: b .LBB0_2 ; DISABLED-NEXT: .LBB0_4: @ %for.cond.cleanup -; DISABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; DISABLED-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 1337650..af5c76f 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -17,15 +17,17 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr { ; ENABLED-LABEL: varying_outer_2d_reduction: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; ENABLED-NEXT: sub sp, #4 ; ENABLED-NEXT: cmp r3, #1 +; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill ; ENABLED-NEXT: blt .LBB0_8 ; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph -; ENABLED-NEXT: mov r11, r0 -; ENABLED-NEXT: ldr r0, [sp, #32] -; ENABLED-NEXT: add.w r9, r2, #3 -; ENABLED-NEXT: mov.w r12, #0 -; ENABLED-NEXT: mov r10, r11 +; ENABLED-NEXT: ldr r0, [sp, #36] +; ENABLED-NEXT: add.w r12, r2, #3 +; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload +; ENABLED-NEXT: mov.w r8, #0 +; ENABLED-NEXT: mov r9, r12 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 ; ENABLED-NEXT: b .LBB0_4 @@ -35,32 +37,31 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: lsrs r0, r0, #16 ; ENABLED-NEXT: sub.w r9, r9, #1 -; ENABLED-NEXT: strh.w r0, [r1, r12, lsl #1] -; ENABLED-NEXT: add.w r12, r12, #1 +; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1] +; ENABLED-NEXT: add.w r8, r8, #1 ; ENABLED-NEXT: add.w r10, r10, #2 -; ENABLED-NEXT: cmp r12, r3 +; ENABLED-NEXT: cmp r8, r3 ; ENABLED-NEXT: beq .LBB0_8 ; ENABLED-NEXT: .LBB0_4: @ %for.body ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2 -; ENABLED-NEXT: cmp r2, r12 +; ENABLED-NEXT: cmp r2, r8 ; ENABLED-NEXT: ble .LBB0_2 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: sub.w r4, r2, r12 +; ENABLED-NEXT: sub.w r4, r2, r8 ; ENABLED-NEXT: vmov.i32 q1, #0x0 ; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 -; ENABLED-NEXT: adds r0, r2, #3 -; ENABLED-NEXT: sub.w r0, r0, r12 +; ENABLED-NEXT: sub.w r0, r12, r8 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 ; ENABLED-NEXT: dls lr, r0 -; ENABLED-NEXT: mov r0, r11 +; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 @@ -82,19 +83,22 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: 
.LBB0_8: @ %for.end17 -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, pc} +; ENABLED-NEXT: add sp, #4 +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: ; NOREDUCTIONS: @ %bb.0: @ %entry -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; NOREDUCTIONS-NEXT: sub sp, #4 ; NOREDUCTIONS-NEXT: cmp r3, #1 +; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill ; NOREDUCTIONS-NEXT: blt .LBB0_8 ; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: mov r11, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] -; NOREDUCTIONS-NEXT: add.w r9, r2, #3 -; NOREDUCTIONS-NEXT: mov.w r12, #0 -; NOREDUCTIONS-NEXT: mov r10, r11 +; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] +; NOREDUCTIONS-NEXT: add.w r12, r2, #3 +; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: mov.w r8, #0 +; NOREDUCTIONS-NEXT: mov r9, r12 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 ; NOREDUCTIONS-NEXT: b .LBB0_4 @@ -104,32 +108,31 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: lsrs r0, r0, #16 ; NOREDUCTIONS-NEXT: sub.w r9, r9, #1 -; NOREDUCTIONS-NEXT: strh.w r0, [r1, r12, lsl #1] -; NOREDUCTIONS-NEXT: add.w r12, r12, #1 +; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1] +; NOREDUCTIONS-NEXT: add.w r8, r8, #1 ; NOREDUCTIONS-NEXT: add.w r10, r10, #2 -; NOREDUCTIONS-NEXT: cmp r12, r3 +; NOREDUCTIONS-NEXT: cmp r8, r3 ; NOREDUCTIONS-NEXT: beq .LBB0_8 ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 -; NOREDUCTIONS-NEXT: cmp r2, r12 +; NOREDUCTIONS-NEXT: cmp r2, r8 ; NOREDUCTIONS-NEXT: ble .LBB0_2 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 +; NOREDUCTIONS-NEXT: sub.w r4, r2, r8 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 ; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: adds r0, r2, #3 -; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 +; NOREDUCTIONS-NEXT: sub.w r0, r12, r8 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 ; NOREDUCTIONS-NEXT: dls lr, r0 -; NOREDUCTIONS-NEXT: mov r0, r11 +; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 @@ -151,7 +154,8 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, pc} +; NOREDUCTIONS-NEXT: add sp, #4 +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index 810c2544..fba800b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -160,27 +160,31 @@ define dso_local i32 @b(i32* %c, i32 %d, i32 %e, i32* %n) 
"frame-pointer"="all" ; CHECK-NEXT: add r7, sp, #12 ; CHECK-NEXT: .save {r8, r9, r10, r11} ; CHECK-NEXT: push.w {r8, r9, r10, r11} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, #12 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: adds r4, r3, #4 +; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: adds r2, r3, #4 ; CHECK-NEXT: add.w r9, r0, #4 ; CHECK-NEXT: mvn r11, #1 ; CHECK-NEXT: @ implicit-def: $r6 ; CHECK-NEXT: @ implicit-def: $r12 -; CHECK-NEXT: str r2, [sp] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r9, #-4] -; CHECK-NEXT: ldr.w r10, [r4] +; CHECK-NEXT: ldr.w r10, [r2] ; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: muls r1, r3, r1 ; CHECK-NEXT: adds.w r8, r1, #-2147483648 ; CHECK-NEXT: asr.w r5, r1, #31 ; CHECK-NEXT: adc r1, r5, #0 ; CHECK-NEXT: mul r5, r10, r0 -; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: ldr.w r2, [r11, #4] +; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: add.w r5, r5, #-2147483648 ; CHECK-NEXT: asrl r8, r1, r5 ; CHECK-NEXT: smull r4, r5, r10, r8 @@ -189,48 +193,47 @@ define dso_local i32 @b(i32* %c, i32 %d, i32 %e, i32* %n) "frame-pointer"="all" ; CHECK-NEXT: mov r4, r5 ; CHECK-NEXT: lsll r4, r1, r10 ; CHECK-NEXT: lsll r4, r1, #30 -; CHECK-NEXT: ldrd r4, r8, [r11] +; CHECK-NEXT: ldr.w r4, [r11] ; CHECK-NEXT: asrs r5, r1, #31 +; CHECK-NEXT: mov r8, r1 ; CHECK-NEXT: muls r4, r6, r4 -; CHECK-NEXT: adds r2, r4, #2 -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: lsll r4, r5, r2 -; CHECK-NEXT: add.w r1, r4, #-2147483648 +; CHECK-NEXT: adds r4, #2 +; CHECK-NEXT: lsll r8, r5, r4 ; CHECK-NEXT: ldr r4, [r9], #4 ; CHECK-NEXT: asr.w r5, r12, #31 +; CHECK-NEXT: add.w r8, r8, #-2147483648 ; CHECK-NEXT: muls r4, r3, r4 ; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: adds.w r2, r12, r4 +; CHECK-NEXT: adds.w r1, r12, r4 ; CHECK-NEXT: adc.w r5, r5, r4, asr #31 -; CHECK-NEXT: smull r6, r4, r8, r6 -; CHECK-NEXT: adds.w r2, r2, #-2147483648 -; CHECK-NEXT: adc r2, r5, #0 -; CHECK-NEXT: asrs r5, r2, #31 -; CHECK-NEXT: subs r6, r2, r6 +; CHECK-NEXT: smull r6, r4, r2, r6 +; CHECK-NEXT: adds.w r1, r1, #-2147483648 +; CHECK-NEXT: adc r1, r5, #0 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: asrs r5, r1, #31 +; CHECK-NEXT: subs r6, r1, r6 ; CHECK-NEXT: sbcs r5, r4 ; CHECK-NEXT: adds.w r6, r6, #-2147483648 ; CHECK-NEXT: adc r5, r5, #0 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: asrl r6, r5, r1 -; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: asrl r6, r5, r8 ; CHECK-NEXT: lsrl r6, r5, #2 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: str r6, [r1] -; CHECK-NEXT: ldr r1, [r11], #-4 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: mls r1, r1, r10, r2 +; CHECK-NEXT: movs r5, #2 +; CHECK-NEXT: str r6, [r5] +; CHECK-NEXT: ldr r5, [r11], #-4 +; CHECK-NEXT: mls r1, r5, r10, r1 ; CHECK-NEXT: adds.w r12, r1, #-2147483648 -; CHECK-NEXT: asr.w r2, r1, #31 -; CHECK-NEXT: adc r1, r2, #0 -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: asr.w r4, r1, #31 +; CHECK-NEXT: adc r1, r4, #0 +; CHECK-NEXT: ldrd r4, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: lsrl r12, r1, #2 ; CHECK-NEXT: rsb.w r1, r12, #0 -; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: str r1, [r4, #-4] -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: str r1, [r4] +; CHECK-NEXT: str r1, [r2, #-4] +; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: le lr, 
.LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: add sp, #12 ; CHECK-NEXT: pop.w {r8, r9, r10, r11} ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll b/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll index fe72b50..e6beb75 100644 --- a/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll +++ b/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll @@ -31,8 +31,8 @@ define %union.rec* @Manifest(%union.rec* %x, %union.rec* %env, %struct.STYLE* %s ; CHECK-NEXT: ldrd r8, lr, [r7, #20] ; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: ldm.w r10, {r4, r6, r10} -; CHECK-NEXT: ldrd r12, r9, [r7, #28] +; CHECK-NEXT: ldm.w r10, {r4, r9, r10} +; CHECK-NEXT: ldr.w r12, [r7, #28] ; CHECK-NEXT: ittt ne ; CHECK-NEXT: addne sp, #292 ; CHECK-NEXT: popne.w {r8, r10, r11} @@ -46,25 +46,29 @@ define %union.rec* @Manifest(%union.rec* %x, %union.rec* %env, %struct.STYLE* %s ; CHECK-NEXT: @ %bb.3: @ %bb420 ; CHECK-NEXT: movw r5, :lower16:(L_zz_hold$non_lazy_ptr-(LPC0_0+4)) ; CHECK-NEXT: movt r5, :upper16:(L_zz_hold$non_lazy_ptr-(LPC0_0+4)) +; CHECK-NEXT: movw r11, :lower16:(L_zz_res$non_lazy_ptr-(LPC0_1+4)) ; CHECK-NEXT: LPC0_0: ; CHECK-NEXT: add r5, pc -; CHECK-NEXT: ldr.w r11, [r5] -; CHECK-NEXT: str.w r11, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: movw r5, :lower16:(L_zz_res$non_lazy_ptr-(LPC0_1+4)) -; CHECK-NEXT: movt r5, :upper16:(L_zz_res$non_lazy_ptr-(LPC0_1+4)) +; CHECK-NEXT: movt r11, :upper16:(L_zz_res$non_lazy_ptr-(LPC0_1+4)) ; CHECK-NEXT: LPC0_1: -; CHECK-NEXT: add r5, pc +; CHECK-NEXT: add r11, pc ; CHECK-NEXT: ldr r5, [r5] ; CHECK-NEXT: str r5, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: ldr.w r5, [r11] +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: str r5, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: ldr r5, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: str.w r11, [r5] ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: str.w r5, [r11] -; CHECK-NEXT: ldr.w r11, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: str.w r5, [r11] -; CHECK-NEXT: ldr r5, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: str r5, [r6] +; CHECK-NEXT: ldr r5, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: str r0, [r5] -; CHECK-NEXT: stm.w sp, {r4, r6, r10} +; CHECK-NEXT: ldr r0, [r7, #32] +; CHECK-NEXT: stm.w sp, {r4, r9, r10} ; CHECK-NEXT: strd r8, lr, [sp, #12] -; CHECK-NEXT: strd r12, r9, [sp, #20] +; CHECK-NEXT: str.w r12, [sp, #20] +; CHECK-NEXT: str r0, [sp, #24] ; CHECK-NEXT: bl _Manifest ; CHECK-NEXT: trap ; CHECK-NEXT: LBB0_4: @ %bb20 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index b62b5c7..45ff239 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1049,10 +1049,10 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #20 -; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: cmp r3, #8 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: blo.w .LBB16_12 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: lsrs.w r12, r3, #2 @@ -1072,43 +1072,45 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r1, r7, #2 ; CHECK-NEXT: rsbs r7, r4, #0 -; 
CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: add.w r7, r3, #16 +; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: wls lr, r0, .LBB16_4 ; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_4: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #8 -; CHECK-NEXT: add.w r0, r6, r0, lsl #1 +; CHECK-NEXT: add.w r0, r5, r0, lsl #1 ; CHECK-NEXT: add.w r5, r0, #8 ; CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: .LBB16_5: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_7 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: ldrh.w lr, [r3, #14] ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 -; CHECK-NEXT: ldrh.w r10, [r3, #12] +; CHECK-NEXT: ldrh.w r8, [r3, #12] ; CHECK-NEXT: ldrh r7, [r3, #10] ; CHECK-NEXT: ldrh r4, [r3, #8] ; CHECK-NEXT: ldrh r6, [r3, #6] ; CHECK-NEXT: ldrh.w r9, [r3, #4] ; CHECK-NEXT: ldrh.w r11, [r3, #2] -; CHECK-NEXT: ldrh.w r8, [r3] +; CHECK-NEXT: ldrh.w r10, [r3] ; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: adds r0, r5, #2 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmul.f16 q0, q0, r8 +; CHECK-NEXT: vmul.f16 q0, q0, r10 ; CHECK-NEXT: adds r0, r5, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r11 ; CHECK-NEXT: vldrw.u32 q1, [r5, #4] @@ -1117,77 +1119,77 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca ; CHECK-NEXT: add.w r0, r5, #10 ; CHECK-NEXT: vfma.f16 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r5, #8] -; CHECK-NEXT: add.w r6, r5, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r0, r5, #14 ; CHECK-NEXT: vfma.f16 q0, q1, r7 ; CHECK-NEXT: vldrw.u32 q1, [r5, #12] -; CHECK-NEXT: vfma.f16 q0, q1, r10 +; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: vfma.f16 q0, q1, r8 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vfma.f16 q0, q1, lr ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: blo .LBB16_8 ; CHECK-NEXT: @ %bb.6: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r3, #16 ; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB16_7: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r0, [r5], #16 -; CHECK-NEXT: vldrw.u32 q1, [r6] -; CHECK-NEXT: adds r4, r6, #2 +; CHECK-NEXT: ldrh r0, [r6], #16 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: adds r4, r5, #2 ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r5, #-14] -; CHECK-NEXT: adds r4, r6, #6 +; CHECK-NEXT: ldrh r0, [r6, #-14] +; CHECK-NEXT: adds r4, r5, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: 
ldrh r0, [r5, #-12] -; CHECK-NEXT: vldrw.u32 q1, [r6, #4] +; CHECK-NEXT: ldrh r0, [r6, #-12] +; CHECK-NEXT: vldrw.u32 q1, [r5, #4] ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r5, #-10] -; CHECK-NEXT: add.w r4, r6, #10 +; CHECK-NEXT: ldrh r0, [r6, #-10] +; CHECK-NEXT: add.w r4, r5, #10 ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r5, #-8] -; CHECK-NEXT: vldrw.u32 q1, [r6, #8] +; CHECK-NEXT: ldrh r0, [r6, #-8] +; CHECK-NEXT: vldrw.u32 q1, [r5, #8] ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r5, #-6] -; CHECK-NEXT: ldrh r4, [r5, #-2] +; CHECK-NEXT: ldrh r0, [r6, #-6] +; CHECK-NEXT: ldrh r4, [r6, #-2] ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r5, #-4] -; CHECK-NEXT: vldrw.u32 q1, [r6, #12] +; CHECK-NEXT: ldrh r0, [r6, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r5, #12] ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: add.w r0, r6, #14 +; CHECK-NEXT: add.w r0, r5, #14 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: adds r6, #16 +; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: le lr, .LBB16_7 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: add.w r5, r3, #16 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: .LBB16_10: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r4, [r5], #2 +; CHECK-NEXT: ldrh r4, [r6], #2 ; CHECK-NEXT: vldrh.u16 q1, [r0], #2 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: le lr, .LBB16_10 ; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add.w r6, r6, r0, lsl #1 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: add.w r5, r5, r0, lsl #1 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_12: @ %if.end -; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index 6fa8637..ee91dcc 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1044,8 +1044,8 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: cmp r3, #8 ; CHECK-NEXT: blo.w .LBB16_12 ; CHECK-NEXT: @ %bb.1: @ %if.then @@ -1053,36 +1053,38 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph ; CHECK-NEXT: ldrh r6, [r0] -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: ldrd r7, r10, [r0, #4] +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: ldrd r4, r10, [r0, #4] ; CHECK-NEXT: sub.w r0, r6, #8 ; CHECK-NEXT: add.w r3, r0, r0, lsr #29 ; CHECK-NEXT: and r0, r0, #7 -; CHECK-NEXT: asrs r5, r3, #3 -; CHECK-NEXT: cmp r5, #1 +; CHECK-NEXT: asrs r7, r3, #3 +; 
CHECK-NEXT: cmp r7, #1 ; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r4, r3, #3 -; CHECK-NEXT: add.w r3, r7, r6, lsl #2 +; CHECK-NEXT: asrgt r5, r3, #3 +; CHECK-NEXT: add.w r3, r4, r6, lsl #2 ; CHECK-NEXT: sub.w r9, r3, #4 ; CHECK-NEXT: rsbs r3, r6, #0 -; CHECK-NEXT: str r4, [sp] @ 4-byte Spill -; CHECK-NEXT: str r6, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r3, r10, #32 +; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: ldrd r0, r9, [sp, #12] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload ; CHECK-NEXT: wls lr, r0, .LBB16_4 ; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_4: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: add.w r0, r7, r0, lsl #2 -; CHECK-NEXT: add.w r7, r0, #16 +; CHECK-NEXT: add.w r0, r4, r0, lsl #2 +; CHECK-NEXT: add.w r4, r0, #16 ; CHECK-NEXT: beq .LBB16_12 ; CHECK-NEXT: .LBB16_5: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -1090,79 +1092,79 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 ; CHECK-NEXT: add.w lr, r10, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: ldrd r3, r4, [r10] +; CHECK-NEXT: ldrd r3, r7, [r10] ; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr} ; CHECK-NEXT: ldrd r11, r8, [r10, #24] ; CHECK-NEXT: vstrb.8 q0, [r9], #16 -; CHECK-NEXT: vldrw.u32 q0, [r7], #32 -; CHECK-NEXT: strd r9, r1, [sp, #16] @ 8-byte Folded Spill -; CHECK-NEXT: vldrw.u32 q1, [r7, #-28] +; CHECK-NEXT: vldrw.u32 q0, [r4], #32 +; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill +; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] ; CHECK-NEXT: vmul.f32 q0, q0, r3 -; CHECK-NEXT: vldrw.u32 q6, [r7, #-24] -; CHECK-NEXT: vldrw.u32 q4, [r7, #-20] -; CHECK-NEXT: vfma.f32 q0, q1, r4 -; CHECK-NEXT: vldrw.u32 q5, [r7, #-16] +; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] +; CHECK-NEXT: vldrw.u32 q4, [r4, #-20] +; CHECK-NEXT: vfma.f32 q0, q1, r7 +; CHECK-NEXT: vldrw.u32 q5, [r4, #-16] ; CHECK-NEXT: vfma.f32 q0, q6, r0 -; CHECK-NEXT: vldrw.u32 q2, [r7, #-12] +; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] ; CHECK-NEXT: vfma.f32 q0, q4, r5 -; CHECK-NEXT: vldrw.u32 q3, [r7, #-8] +; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] ; CHECK-NEXT: vfma.f32 q0, q5, r6 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vfma.f32 q0, q2, lr -; CHECK-NEXT: vldrw.u32 q1, [r7, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: vfma.f32 q0, q1, r8 ; CHECK-NEXT: blo .LBB16_8 ; CHECK-NEXT: @ %bb.6: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r4, r10, #32 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: .LBB16_7: @ %for.body ; CHECK-NEXT: 
@ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldm.w r4, {r0, r3, r5, r6, r8, r11} -; CHECK-NEXT: vldrw.u32 q1, [r7], #32 -; CHECK-NEXT: vldrw.u32 q6, [r7, #-24] -; CHECK-NEXT: vldrw.u32 q4, [r7, #-20] +; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11} +; CHECK-NEXT: vldrw.u32 q1, [r4], #32 +; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] +; CHECK-NEXT: vldrw.u32 q4, [r4, #-20] ; CHECK-NEXT: vfma.f32 q0, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r7, #-28] -; CHECK-NEXT: vldrw.u32 q5, [r7, #-16] -; CHECK-NEXT: vldrw.u32 q2, [r7, #-12] +; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] +; CHECK-NEXT: vldrw.u32 q5, [r4, #-16] +; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] ; CHECK-NEXT: vfma.f32 q0, q1, r3 -; CHECK-NEXT: ldrd r9, r1, [r4, #24] +; CHECK-NEXT: ldrd r9, r1, [r7, #24] ; CHECK-NEXT: vfma.f32 q0, q6, r5 -; CHECK-NEXT: vldrw.u32 q3, [r7, #-8] +; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] ; CHECK-NEXT: vfma.f32 q0, q4, r6 -; CHECK-NEXT: vldrw.u32 q1, [r7, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] ; CHECK-NEXT: vfma.f32 q0, q5, r8 -; CHECK-NEXT: adds r4, #32 +; CHECK-NEXT: adds r7, #32 ; CHECK-NEXT: vfma.f32 q0, q2, r11 ; CHECK-NEXT: vfma.f32 q0, q3, r9 ; CHECK-NEXT: vfma.f32 q0, q1, r1 ; CHECK-NEXT: le lr, .LBB16_7 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: add.w r4, r10, #32 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: .LBB16_10: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r0, [r4], #4 +; CHECK-NEXT: ldr r0, [r7], #4 ; CHECK-NEXT: vldrw.u32 q1, [r3], #4 ; CHECK-NEXT: vfma.f32 q0, q1, r0 ; CHECK-NEXT: le lr, .LBB16_10 ; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add.w r7, r7, r0, lsl #2 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add.w r4, r4, r0, lsl #2 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_12: @ %if.end -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll index 9f44be1..d4ad249 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -105,26 +105,26 @@ define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: ldr r3, [r0, #4] -; CHECK-NEXT: subs r3, #2 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: cmp r3, #2 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: ldr r1, [r0, #4] +; CHECK-NEXT: subs r1, #2 +; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo .LBB1_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr.w r12, [r0, #8] ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: ldr r3, [r0] ; CHECK-NEXT: add.w r11, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r3, r12, lsl #3 -; CHECK-NEXT: lsl.w r10, r12, #3 +; CHECK-NEXT: add.w r7, r3, r12, lsl #3 +; CHECK-NEXT: lsl.w r9, r12, #3 ; CHECK-NEXT: .LBB1_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child 
Loop BB1_3 Depth 2 +; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w r9, r4, #1 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: add.w r10, r4, #1 ; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB1_3: @ %vector.body @@ -139,20 +139,19 @@ define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: vadd.f32 s2, s2, s3 -; CHECK-NEXT: add.w r0, r2, r9, lsl #2 +; CHECK-NEXT: add.w r0, r2, r10, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: add r11, r10 +; CHECK-NEXT: add r11, r9 ; CHECK-NEXT: vadd.f32 s6, s6, s7 -; CHECK-NEXT: add r6, r10 +; CHECK-NEXT: add r7, r9 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: vadd.f32 s2, s4, s6 ; CHECK-NEXT: vstr s0, [r0] ; CHECK-NEXT: add.w r0, r2, r4, lsl #2 ; CHECK-NEXT: adds r4, #2 +; CHECK-NEXT: cmp r4, r1 ; CHECK-NEXT: vstr s2, [r0] -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: blo .LBB1_2 ; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #4 @@ -232,40 +231,46 @@ define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] +; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: subs r1, #3 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo .LBB2_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr.w r9, [r0, #8] +; CHECK-NEXT: ldr r3, [r0, #8] ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: add.w r0, r9, r9, lsl #1 -; CHECK-NEXT: add.w r10, r1, r9, lsl #2 -; CHECK-NEXT: add.w r12, r1, r9, lsl #3 -; CHECK-NEXT: add.w r8, r1, r0, lsl #2 -; CHECK-NEXT: add.w r1, r9, #3 -; CHECK-NEXT: bic r1, r1, #3 -; CHECK-NEXT: lsl.w r11, r0, #2 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w r1, r5, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r3, r3, lsl #1 +; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r3, lsl #3 +; CHECK-NEXT: adds r3, #3 +; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add.w r10, r1, r0, lsl #2 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: lsl.w r11, r0, #2 +; CHECK-NEXT: add.w r1, r5, r3, lsr #2 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 -; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: adds r0, r5, #2 +; CHECK-NEXT: adds r2, r5, #1 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: mov r4, r8 +; CHECK-NEXT: mov r4, r10 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: dlstp.32 lr, r9 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB2_3: @ 
%vector.body ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -280,31 +285,31 @@ define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: vadd.f32 s10, s10, s11 -; CHECK-NEXT: adds r0, r5, #1 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: add r10, r11 +; CHECK-NEXT: add r9, r11 ; CHECK-NEXT: vadd.f32 s6, s6, s7 -; CHECK-NEXT: add.w r0, r2, r0, lsl #2 +; CHECK-NEXT: add.w r0, r1, r2, lsl #2 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: add r12, r11 ; CHECK-NEXT: vadd.f32 s2, s2, s3 -; CHECK-NEXT: add r8, r11 +; CHECK-NEXT: add r10, r11 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s8, s8, s10 ; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: vstr s8, [r0] -; CHECK-NEXT: add.w r0, r2, r5, lsl #2 -; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: adds r0, r5, #2 +; CHECK-NEXT: add.w r0, r1, r5, lsl #2 ; CHECK-NEXT: adds r5, #3 -; CHECK-NEXT: add.w r0, r2, r0, lsl #2 +; CHECK-NEXT: vstr s4, [r0] +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: blo .LBB2_2 ; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -396,15 +401,15 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 -; CHECK-NEXT: blo .LBB3_5 +; CHECK-NEXT: blo.w .LBB3_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr r2, [r0, #8] ; CHECK-NEXT: movs r6, #1 @@ -412,25 +417,31 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: add.w r0, r2, r2, lsl #1 ; CHECK-NEXT: add.w r12, r1, r2, lsl #2 ; CHECK-NEXT: add.w r8, r1, r2, lsl #3 -; CHECK-NEXT: add.w r10, r1, r2, lsl #4 -; CHECK-NEXT: add.w r9, r1, r0, lsl #2 +; CHECK-NEXT: add.w r9, r1, r2, lsl #4 +; CHECK-NEXT: add.w r11, r1, r0, lsl #2 ; CHECK-NEXT: adds r0, r2, #3 ; CHECK-NEXT: bic r0, r0, #3 ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: add.w r0, r6, r0, lsr #2 -; CHECK-NEXT: strd r0, r2, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: lsls r0, r2, #4 -; CHECK-NEXT: ldrd r2, r7, [sp, #4] @ 8-byte Folded Reload -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB3_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: adds r0, r6, #3 +; CHECK-NEXT: 
str r0, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: adds r0, r6, #2 +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: adds r0, r6, #1 +; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r5, r9 -; CHECK-NEXT: mov r4, r10 +; CHECK-NEXT: mov r5, r11 +; CHECK-NEXT: mov r4, r9 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q3, q0 @@ -451,9 +462,9 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 ; CHECK-NEXT: vadd.f32 s14, s14, s15 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: adds r0, r6, #1 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 @@ -467,24 +478,24 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: vstr s12, [r0] ; CHECK-NEXT: add.w r0, r1, r6, lsl #2 +; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: vstr s8, [r0] -; CHECK-NEXT: adds r0, r6, #2 +; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: adds r0, r6, #3 -; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: add r12, r0 ; CHECK-NEXT: add r8, r0 +; CHECK-NEXT: add r11, r0 ; CHECK-NEXT: add r9, r0 -; CHECK-NEXT: add r10, r0 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: blo .LBB3_2 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -589,41 +600,48 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #5 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB4_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r1, [r0, #8] -; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: adds r0, r1, #3 +; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: adds r0, r3, #3 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r12, r3, r1, lsl #2 -; CHECK-NEXT: subs r3, r0, #4 +; CHECK-NEXT: add.w r8, r1, r3, lsl #2 +; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsls r5, r1, #2 -; CHECK-NEXT: add.w r3, r0, r3, lsr #2 -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r3, r1, r1, lsl #2 -; CHECK-NEXT: lsls r3, r3, #2 -; CHECK-NEXT: str r3, [sp] @ 4-byte 
Spill +; CHECK-NEXT: lsls r5, r3, #2 +; CHECK-NEXT: add.w r1, r0, r1, lsr #2 +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: add.w r1, r3, r3, lsl #2 +; CHECK-NEXT: lsls r1, r1, #2 +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2 -; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: add.w r10, r0, #2 +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: add.w r11, r0, #1 -; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: dlstp.32 lr, r1 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB4_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -631,11 +649,11 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vldrw.u32 q5, [r4], #16 ; CHECK-NEXT: vldrw.u32 q6, [r3], #16 ; CHECK-NEXT: vfma.f32 q3, q6, q5 -; CHECK-NEXT: add.w r10, r9, r5 +; CHECK-NEXT: add.w r12, r9, r5 ; CHECK-NEXT: vldrw.u32 q6, [r9] ; CHECK-NEXT: vfma.f32 q4, q6, q5 -; CHECK-NEXT: add.w r6, r10, r5 -; CHECK-NEXT: vldrw.u32 q6, [r10] +; CHECK-NEXT: add.w r6, r12, r5 +; CHECK-NEXT: vldrw.u32 q6, [r12] ; CHECK-NEXT: vfma.f32 q2, q6, q5 ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vldrw.u32 q6, [r6] @@ -646,7 +664,7 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 ; CHECK-NEXT: vadd.f32 s18, s18, s19 -; CHECK-NEXT: add.w r3, r2, r11, lsl #2 +; CHECK-NEXT: add.w r1, r2, r11, lsl #2 ; CHECK-NEXT: vadd.f32 s16, s16, s17 ; CHECK-NEXT: vadd.f32 s14, s14, s15 ; CHECK-NEXT: vadd.f32 s12, s12, s13 @@ -656,31 +674,30 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s1, s16, s18 -; CHECK-NEXT: vadd.f32 s12, s12, s14 ; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s12, s12, s14 ; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: vadd.f32 s6, s8, s10 -; CHECK-NEXT: vstr s1, [r3] -; CHECK-NEXT: add.w r3, r2, r0, lsl #2 -; CHECK-NEXT: vstr s12, [r3] -; CHECK-NEXT: adds r3, r0, #2 +; CHECK-NEXT: vstr s1, [r1] +; CHECK-NEXT: add.w r1, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NEXT: add.w r3, r2, r3, lsl #2 -; CHECK-NEXT: vstr s6, [r3] -; CHECK-NEXT: adds r3, r0, #3 -; CHECK-NEXT: add.w r3, r2, r3, lsl #2 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: adds r3, r0, #4 ; CHECK-NEXT: adds r0, #5 -; CHECK-NEXT: add.w r3, r2, r3, lsl #2 -; CHECK-NEXT: vstr s4, [r3] -; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-NEXT: add r12, r3 -; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: cmp r0, r3 -; CHECK-NEXT: blo .LBB4_2 +; CHECK-NEXT: vstr s12, [r1] +; CHECK-NEXT: add.w r1, r2, r10, lsl #2 +; CHECK-NEXT: vstr s6, [r1] +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: add.w r1, r2, r1, lsl #2 +; CHECK-NEXT: vstr s0, [r1] +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte 
Reload +; CHECK-NEXT: add.w r1, r2, r1, lsl #2 +; CHECK-NEXT: vstr s4, [r1] +; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add r8, r1 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: blo.w .LBB4_2 ; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -798,54 +815,63 @@ define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #6 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB5_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr.w r9, [r0, #8] +; CHECK-NEXT: ldr r3, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: add.w r0, r9, #3 +; CHECK-NEXT: adds r0, r3, #3 +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r12, r1, r9, lsl #2 +; CHECK-NEXT: add.w r8, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsl.w r5, r9, #2 +; CHECK-NEXT: lsls r5, r3, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r1, r9, r9, lsl #1 +; CHECK-NEXT: add.w r1, r3, r3, lsl #1 ; CHECK-NEXT: lsls r1, r1, #3 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB5_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #5 +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: add.w r11, r0, #2 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #1 -; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q5, q1 ; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: dlstp.32 lr, r9 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB5_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r10, r3, r5 +; CHECK-NEXT: add.w r12, r3, r5 ; CHECK-NEXT: vldrw.u32 q6, [r1], #16 ; CHECK-NEXT: vldrw.u32 q7, [r3], #16 ; CHECK-NEXT: vfma.f32 q4, q7, q6 -; CHECK-NEXT: add.w r11, r10, r5 -; CHECK-NEXT: vldrw.u32 q7, [r10] +; CHECK-NEXT: add.w r10, r12, r5 +; CHECK-NEXT: vldrw.u32 q7, [r12] ; CHECK-NEXT: vfma.f32 q5, q7, q6 -; CHECK-NEXT: add.w r6, r11, r5 -; CHECK-NEXT: vldrw.u32 q7, [r11] +; CHECK-NEXT: add.w r6, r10, r5 +; CHECK-NEXT: vldrw.u32 q7, [r10] ; CHECK-NEXT: vfma.f32 q2, q7, q6 ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vldrw.u32 q7, [r6] @@ -877,29 +903,28 @@ define void 
@DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vstr s1, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: vstr s3, [r1] -; CHECK-NEXT: adds r1, r0, #2 +; CHECK-NEXT: add.w r1, r2, r11, lsl #2 ; CHECK-NEXT: vadd.f32 s4, s4, s6 -; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s6, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s0, [r1] -; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: adds r0, #6 +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: add r12, r1 -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: add r8, r1 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB5_2 ; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1030,95 +1055,105 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: .pad #72 +; CHECK-NEXT: sub sp, #72 +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #7 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB6_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr.w r10, [r0, #8] +; CHECK-NEXT: ldr r3, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: add.w r0, r10, #3 +; CHECK-NEXT: adds r0, r3, #3 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r8, r1, r10, lsl #2 +; CHECK-NEXT: add.w r9, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsl.w r5, r10, #2 +; CHECK-NEXT: lsls r5, r3, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: rsb r1, r10, r10, lsl #3 +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: rsb r1, r3, r3, lsl #3 ; CHECK-NEXT: lsls r1, r1, #2 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB6_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #6 +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: adds r1, r0, #5 +; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #2 -; 
CHECK-NEXT: add.w r12, r0, #1 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: add.w r8, r0, #1 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vmov q6, q2 ; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: mov r9, r10 -; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: dls lr, r7 +; CHECK-NEXT: mov r12, r7 +; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB6_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r11, r3, r5 -; CHECK-NEXT: vctp.32 r9 +; CHECK-NEXT: add.w r10, r3, r5 +; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 ; CHECK-NEXT: vfmat.f32 q5, q0, q7 -; CHECK-NEXT: add.w r6, r11, r5 +; CHECK-NEXT: add.w r11, r10, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r11] +; CHECK-NEXT: vldrwt.u32 q0, [r10] ; CHECK-NEXT: vfmat.f32 q6, q0, q7 -; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r6] +; CHECK-NEXT: vldrwt.u32 q0, [r11] ; CHECK-NEXT: vfmat.f32 q1, q0, q7 -; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: add.w r6, r11, r5 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vmov q5, q4 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r7] +; CHECK-NEXT: vldrwt.u32 q0, [r6] ; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q1, q0, q7 -; CHECK-NEXT: adds r6, r7, r5 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill ; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: sub.w r9, r9, #4 -; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r6] +; CHECK-NEXT: vldrwt.u32 q0, [r7] ; CHECK-NEXT: vfmat.f32 q3, q0, q7 -; CHECK-NEXT: adds r6, r7, r5 +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q0, [r7] -; CHECK-NEXT: vfmat.f32 q4, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r6] +; CHECK-NEXT: vfmat.f32 q4, q0, q7 +; CHECK-NEXT: vldrwt.u32 q0, [r7] ; CHECK-NEXT: vfmat.f32 q2, q0, q7 ; CHECK-NEXT: le lr, .LBB6_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1 ; CHECK-NEXT: vadd.f32 s0, s26, s27 -; CHECK-NEXT: add.w r1, r2, r12, lsl #2 +; CHECK-NEXT: add.w r1, r2, r8, lsl #2 ; CHECK-NEXT: vadd.f32 s2, s24, s25 ; CHECK-NEXT: vadd.f32 s1, s22, s23 ; CHECK-NEXT: vadd.f32 s3, s20, s21 @@ -1126,45 +1161,45 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 +; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vadd.f32 s9, s18, s19 ; CHECK-NEXT: vadd.f32 s11, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 s2, s3, s1 ; CHECK-NEXT: vadd.f32 s5, s18, s19 ; 
CHECK-NEXT: vadd.f32 s7, s16, s17 -; CHECK-NEXT: vadd.f32 s2, s3, s1 ; CHECK-NEXT: vadd.f32 s4, s4, s6 -; CHECK-NEXT: vadd.f32 s14, s14, s15 -; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s8, s8, s10 -; CHECK-NEXT: vadd.f32 s6, s7, s5 +; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: adds r0, #7 +; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 -; CHECK-NEXT: vadd.f32 s10, s11, s9 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s6, s7, s5 ; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vadd.f32 s10, s11, s9 +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s12, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: adds r1, r0, #5 +; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s10, [r1] -; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: adds r0, #7 +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: add r8, r1 -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add r9, r1 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB6_2 ; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: add sp, #72 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1308,95 +1343,105 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB7_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr.w r11, [r0, #8] +; CHECK-NEXT: ldr r3, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: add.w r0, r11, #3 +; CHECK-NEXT: adds r0, r3, #3 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r9, r1, r11, lsl #2 +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsl.w r5, r11, #2 +; CHECK-NEXT: lsls r5, r3, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: lsls r1, r3, #5 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: lsl.w r1, r11, #5 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB7_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #7 +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: adds r1, r0, #6 +; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill +; 
CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT: vmov.i32 q3, #0x0
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: adds r4, r0, #3
-; CHECK-NEXT: add.w r12, r0, #2
-; CHECK-NEXT: add.w r8, r0, #1
-; CHECK-NEXT: mov r3, r9
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: add.w r8, r0, #2
+; CHECK-NEXT: adds r1, r0, #1
+; CHECK-NEXT: mov r3, r12
 ; CHECK-NEXT: vmov q5, q3
 ; CHECK-NEXT: vmov q6, q3
 ; CHECK-NEXT: vmov q4, q3
 ; CHECK-NEXT: vmov q7, q3
 ; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: mov r10, r11
-; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r7
+; CHECK-NEXT: mov r10, r7
+; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r6
 ; CHECK-NEXT: .LBB7_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: adds r6, r3, r5
+; CHECK-NEXT: add.w r11, r3, r5
 ; CHECK-NEXT: vctp.32 r10
 ; CHECK-NEXT: vpsttt
-; CHECK-NEXT: vldrwt.u32 q0, [r1], #16
+; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT: vfmat.f32 q6, q1, q0
-; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
 ; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q1, [r6]
+; CHECK-NEXT: vldrwt.u32 q1, [r11]
 ; CHECK-NEXT: vfmat.f32 q7, q1, q0
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: add.w r6, r11, r5
 ; CHECK-NEXT: vmov q6, q5
 ; CHECK-NEXT: vmov q5, q3
 ; CHECK-NEXT: vmov q3, q4
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vldrwt.u32 q1, [r6]
 ; CHECK-NEXT: vmov q4, q2
-; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: adds r6, r7, r5
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q1, [r6]
-; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
 ; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: adds r6, r7, r5
+; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT: vmov q2, q4
 ; CHECK-NEXT: vmov q4, q3
 ; CHECK-NEXT: vmov q3, q5
 ; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: adds r6, r7, r5
+; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vldrwt.u32 q1, [r6]
 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: sub.w r10, r10, #4
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: adds r6, r7, r5
 ; CHECK-NEXT: vpstttt
-; CHECK-NEXT: vldrwt.u32 q1, [r6]
-; CHECK-NEXT: vfmat.f32 q4, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vldrwt.u32 q1, [r6]
 ; CHECK-NEXT: vfmat.f32 q5, q1, q0
-; CHECK-NEXT: adds r6, r7, r5
+; CHECK-NEXT: add r6, r5
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q1, [r6]
 ; CHECK-NEXT: vfmat.f32 q3, q1, q0
@@ -1404,18 +1449,18 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1
 ; CHECK-NEXT: vadd.f32 s0, s30, s31
-; CHECK-NEXT: add.w r1, r2, r8, lsl #2
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vadd.f32 s2, s28, s29
 ; CHECK-NEXT: vadd.f32 s4, s26, s27
 ; CHECK-NEXT: vadd.f32 s6, s24, s25
 ; CHECK-NEXT: vadd.f32 s5, s18, s19
 ; CHECK-NEXT: vadd.f32 s7, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
 ; CHECK-NEXT: vadd.f32 s10, s10, s11
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
 ; CHECK-NEXT: vadd.f32 s9, s18, s19
 ; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload
 ; CHECK-NEXT: vadd.f32 s14, s14, s15
 ; CHECK-NEXT: vadd.f32 s12, s12, s13
 ; CHECK-NEXT: vadd.f32 s13, s18, s19
@@ -1430,33 +1475,33 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT: vstr s0, [r1]
 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s3, s20, s21
+; CHECK-NEXT: adds r0, #8
 ; CHECK-NEXT: vstr s2, [r1]
-; CHECK-NEXT: add.w r1, r2, r12, lsl #2
+; CHECK-NEXT: add.w r1, r2, r8, lsl #2
 ; CHECK-NEXT: vadd.f32 s12, s7, s5
 ; CHECK-NEXT: vstr s10, [r1]
 ; CHECK-NEXT: add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT: vstr s14, [r1]
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
 ; CHECK-NEXT: vadd.f32 s4, s3, s1
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: adds r1, r0, #7
-; CHECK-NEXT: adds r0, #8
+; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: add r9, r1
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add r12, r1
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT: cmp r0, r1
 ; CHECK-NEXT: blo.w .LBB7_2
 ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #64
+; CHECK-NEXT: add sp, #88
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
index 9cde4c1..0db0d4e 100644
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -432,24 +432,24 @@ define i32 @add_U320_without_i128_add(%struct.U320* nocapture dereferenceable(40
 ; CHECK-NEXT: adcq %rdx, 8(%rdi)
 ; CHECK-NEXT: movq %rax, %rdx
 ; CHECK-NEXT: adcq %rcx, %rdx
-; CHECK-NEXT: movq 24(%rdi), %r14
-; CHECK-NEXT: leaq (%r8,%r14), %r11
+; CHECK-NEXT: movq 24(%rdi), %r11
+; CHECK-NEXT: leaq (%r8,%r11), %r14
 ; CHECK-NEXT: xorl %ebx, %ebx
 ; CHECK-NEXT: cmpq %r10, %rdx
 ; CHECK-NEXT: setb %bl
 ; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: adcq %r11, %rbx
-; CHECK-NEXT: movq 32(%rdi), %rcx
-; CHECK-NEXT: leaq (%r9,%rcx), %r10
+; CHECK-NEXT: adcq %r14, %rbx
+; CHECK-NEXT: movq 32(%rdi), %r10
+; CHECK-NEXT: leaq (%r9,%r10), %rcx
 ; CHECK-NEXT: xorl %esi, %esi
-; CHECK-NEXT: cmpq %r11, %rbx
+; CHECK-NEXT: cmpq %r14, %rbx
 ; CHECK-NEXT: setb %sil
-; CHECK-NEXT: addq %r14, %r8
-; CHECK-NEXT: adcq %r10, %rsi
+; CHECK-NEXT: addq %r11, %r8
+; CHECK-NEXT: adcq %rcx, %rsi
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: cmpq %r10, %rsi
+; CHECK-NEXT: cmpq %rcx, %rsi
 ; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq %rcx, %r9
+; CHECK-NEXT: addq %r10, %r9
 ; CHECK-NEXT: movq %rdx, 16(%rdi)
 ; CHECK-NEXT: movq %rbx, 24(%rdi)
 ; CHECK-NEXT: movq %rsi, 32(%rdi)
diff --git a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll
index 3f79a20..fc3303f 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll
@@ -23,11 +23,11 @@ define i32 @foo(i32 %arg, i32 (i8*)* %arg3) nounwind {
 ; CHECK-NEXT: testb %al, %al
 ; CHECK-NEXT: jne .LBB0_5
 ; CHECK-NEXT: # %bb.1: # %bb5
-; CHECK-NEXT: movq %rsi, %r12
+; CHECK-NEXT: movq %rsi, %r14
 ; CHECK-NEXT: movslq %edi, %rbp
 ; CHECK-NEXT: leaq (,%rbp,8), %rax
-; CHECK-NEXT: leaq global(%rax,%rax,2), %r14
-; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r15
+; CHECK-NEXT: leaq global(%rax,%rax,2), %r15
+; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r12
 ; CHECK-NEXT: xorl %r13d, %r13d
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB0_2: # %bb8
@@ -35,11 +35,11 @@ define i32 @foo(i32 %arg, i32 (i8*)* %arg3) nounwind {
 ; CHECK-NEXT: callq bar@PLT
 ; CHECK-NEXT: movq %rax, %rbx
 ; CHECK-NEXT: movq %rax, %rdi
-; CHECK-NEXT: callq *%r12
-; CHECK-NEXT: movq %r14, %rdi
-; CHECK-NEXT: callq hoge@PLT
+; CHECK-NEXT: callq *%r14
 ; CHECK-NEXT: movq %r15, %rdi
 ; CHECK-NEXT: callq hoge@PLT
+; CHECK-NEXT: movq %r12, %rdi
+; CHECK-NEXT: callq hoge@PLT
 ; CHECK-NEXT: testb %r13b, %r13b
 ; CHECK-NEXT: jne .LBB0_2
 ; CHECK-NEXT: # %bb.3: # %bb15
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index 17cab1b..75439f8 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -136,6 +136,8 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT: movl (%r15), %eax
 ; CHECK-NEXT: leal 8(,%rcx,8), %ecx
 ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: leaq 8(%r12), %rcx
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: leaq 32(%r12), %rbx
 ; CHECK-NEXT: shlq $3, %r13
 ; CHECK-NEXT: xorl %esi, %esi
@@ -187,17 +189,16 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT: jae .LBB1_7
 ; CHECK-NEXT: # %bb.6: # %vector.memcheck
 ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: leaq 8(%r12), %rax
-; CHECK-NEXT: addq %rax, %r10
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
 ; CHECK-NEXT: leaq (%r10,%r11,8), %rax
 ; CHECK-NEXT: cmpq %rcx, %rax
 ; CHECK-NEXT: ja .LBB1_14
 ; CHECK-NEXT: .LBB1_7: # %vector.body.preheader
 ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: leaq -4(%r8), %r10
-; CHECK-NEXT: movq %r10, %rax
-; CHECK-NEXT: shrq $2, %rax
-; CHECK-NEXT: btl $2, %r10d
+; CHECK-NEXT: leaq -4(%r8), %rax
+; CHECK-NEXT: movq %rax, %r10
+; CHECK-NEXT: shrq $2, %r10
+; CHECK-NEXT: btl $2, %eax
 ; CHECK-NEXT: jb .LBB1_8
 ; CHECK-NEXT: # %bb.9: # %vector.body.prol.preheader
 ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
@@ -206,12 +207,12 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT: movdqu %xmm0, (%rdi,%r9,8)
 ; CHECK-NEXT: movdqu %xmm0, 16(%rdi,%r9,8)
 ; CHECK-NEXT: movl $4, %r11d
-; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: testq %r10, %r10
 ; CHECK-NEXT: jne .LBB1_11
 ; CHECK-NEXT: jmp .LBB1_13
 ; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT: xorl %r11d, %r11d
-; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: testq %r10, %r10
 ; CHECK-NEXT: je .LBB1_13
 ; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new
 ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
diff --git a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
index 776b155f..10ee445 100644
--- a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
+++ b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
@@ -7,8 +7,8 @@
 # CHECK: jne
 # CHECK: andl $-16, %edx
 # CHECK: xorl %ebx, %ebx
-# CHECK: movl %edx, -16(%ebp)
-# CHECK: xorl %esi, %esi
+# CHECK: movl -16(%ebp), %esi
+# CHECK: xorl %eax, %eax
 name: test
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/X86/inalloca-invoke.ll b/llvm/test/CodeGen/X86/inalloca-invoke.ll
index 047a9ac..39a9ac5 100644
--- a/llvm/test/CodeGen/X86/inalloca-invoke.ll
+++ b/llvm/test/CodeGen/X86/inalloca-invoke.ll
@@ -24,6 +24,7 @@ blah:
 ; CHECK: pushl %eax
 ; CHECK: subl $20, %esp
 ; CHECK: movl %esp, %[[beg:[^ ]*]]
+; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
 call void @begin(%Iter* sret(%Iter) %temp.lvalue)
 ; CHECK: calll _begin
@@ -32,7 +33,6 @@ blah:
 to label %invoke.cont unwind label %lpad
 ; Uses end as sret param.
-; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
 ; CHECK: pushl %[[end]]
 ; CHECK: calll _plus
diff --git a/llvm/test/CodeGen/X86/licm-regpressure.ll b/llvm/test/CodeGen/X86/licm-regpressure.ll
index c189142..0ab6554 100644
--- a/llvm/test/CodeGen/X86/licm-regpressure.ll
+++ b/llvm/test/CodeGen/X86/licm-regpressure.ll
@@ -1,34 +1,10 @@
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-linux -stop-after=early-machinelicm -o - | FileCheck %s -check-prefix=MIR
-
-; This tests should fail as MachineLICM does not compute register pressure
+; This tests currently fails as MachineLICM does not compute register pressure
 ; correctly. More details: llvm.org/PR23143
-
-; It however does not show any spills because leaq is rematerialized instead
-; of spilling.
-
-; Stopping after MachineLICM however exposes all ADD64ri8 instructions
-; to be hoisted which still has to be avoided.
-
 ; XFAIL: *
 ; MachineLICM should take register pressure into account.
-; CHECK-LABEL: {{^}}test:
-; CHECK-NOT: Spill
-; CHECK-COUNT-4: leaq
-; CHECK-NOT: Spill
-; CHECK: [[LOOP:\.LBB[0-9_]+]]:
-; CHECK-NOT: Reload
-; CHECK-COUNT-2: leaq
-; CHECK-NOT: Reload
-; CHECK: jne [[LOOP]]
-
-; MIR-LABEL: name: test
-; MIR: bb.0.entry:
-; MIR-COUNT-4: ADD64ri8
-; MIR: bb.1.loop-body:
-; MIR-COUNT-2: ADD64ri8
-; MIR: JCC_1 %bb.1
+; CHECK-NOT: Spill
 %struct.A = type { i32, i32, i32, i32, i32, i32, i32 }
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index 6a629e3..f53b06e 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -91,48 +91,48 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: ## %bb.10: ## %do.end
 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: xorl %r12d, %r12d
-; CHECK-NEXT: testb %r12b, %r12b
+; CHECK-NEXT: xorl %r13d, %r13d
+; CHECK-NEXT: testb %r13b, %r13b
 ; CHECK-NEXT: jne LBB0_11
 ; CHECK-NEXT: ## %bb.12: ## %while.body200.preheader
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: xorl %r12d, %r12d
 ; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx
-; CHECK-NEXT: leaq LJTI0_1(%rip), %r13
+; CHECK-NEXT: leaq LJTI0_1(%rip), %rbx
 ; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT: xorl %r14d, %r14d
 ; CHECK-NEXT: jmp LBB0_13
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: LBB0_20: ## %sw.bb256
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl %r12d, %r14d
+; CHECK-NEXT: movl %r13d, %r14d
 ; CHECK-NEXT: LBB0_21: ## %while.cond197.backedge
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT: decl %r15d
 ; CHECK-NEXT: testl %r15d, %r15d
-; CHECK-NEXT: movl %r14d, %r12d
+; CHECK-NEXT: movl %r14d, %r13d
 ; CHECK-NEXT: jle LBB0_22
 ; CHECK-NEXT: LBB0_13: ## %while.body200
 ; CHECK-NEXT: ## =>This Loop Header: Depth=1
 ; CHECK-NEXT: ## Child Loop BB0_29 Depth 2
 ; CHECK-NEXT: ## Child Loop BB0_38 Depth 2
-; CHECK-NEXT: leal -268(%r12), %eax
+; CHECK-NEXT: leal -268(%r13), %eax
 ; CHECK-NEXT: cmpl $105, %eax
 ; CHECK-NEXT: ja LBB0_14
 ; CHECK-NEXT: ## %bb.56: ## %while.body200
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movslq (%r13,%rax,4), %rax
-; CHECK-NEXT: addq %r13, %rax
+; CHECK-NEXT: movslq (%rbx,%rax,4), %rax
+; CHECK-NEXT: addq %rbx, %rax
 ; CHECK-NEXT: jmpq *%rax
 ; CHECK-NEXT: LBB0_44: ## %while.cond1037.preheader
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %bl, %bl
-; CHECK-NEXT: movl %r12d, %r14d
+; CHECK-NEXT: testb %r12b, %r12b
+; CHECK-NEXT: movl %r13d, %r14d
 ; CHECK-NEXT: jne LBB0_21
 ; CHECK-NEXT: jmp LBB0_55
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: LBB0_14: ## %while.body200
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: leal 1(%r12), %eax
+; CHECK-NEXT: leal 1(%r13), %eax
 ; CHECK-NEXT: cmpl $21, %eax
 ; CHECK-NEXT: ja LBB0_20
 ; CHECK-NEXT: ## %bb.15: ## %while.body200
@@ -147,12 +147,12 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: jmp LBB0_21
 ; CHECK-NEXT: LBB0_26: ## %sw.bb474
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
 ; CHECK-NEXT: ## implicit-def: $rbp
 ; CHECK-NEXT: jne LBB0_34
 ; CHECK-NEXT: ## %bb.27: ## %do.body479.preheader
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
 ; CHECK-NEXT: ## implicit-def: $rbp
 ; CHECK-NEXT: jne LBB0_34
 ; CHECK-NEXT: ## %bb.28: ## %land.rhs485.preheader
@@ -163,7 +163,7 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: LBB0_32: ## %do.body479.backedge
 ; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2
 ; CHECK-NEXT: leaq 1(%rbp), %rax
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
 ; CHECK-NEXT: je LBB0_33
 ; CHECK-NEXT: LBB0_29: ## %land.rhs485
 ; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1
@@ -173,13 +173,13 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: ## %bb.30: ## %cond.true.i.i2780
 ; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2
 ; CHECK-NEXT: movq %rax, %rbp
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
 ; CHECK-NEXT: jne LBB0_32
 ; CHECK-NEXT: ## %bb.31: ## %lor.rhs500
 ; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2
 ; CHECK-NEXT: movl $256, %esi ## imm = 0x100
 ; CHECK-NEXT: callq ___maskrune
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
 ; CHECK-NEXT: jne LBB0_32
 ; CHECK-NEXT: jmp LBB0_34
 ; CHECK-NEXT: LBB0_45: ## %sw.bb1134
@@ -229,13 +229,13 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: LBB0_38: ## %for.cond534
 ; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1
 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
 ; CHECK-NEXT: jne LBB0_38
 ; CHECK-NEXT: ## %bb.39: ## %for.cond542.preheader
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: testb %r12b, %r12b
 ; CHECK-NEXT: movb $0, (%rbp)
-; CHECK-NEXT: movl %r12d, %r14d
+; CHECK-NEXT: movl %r13d, %r14d
 ; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx
 ; CHECK-NEXT: jmp LBB0_21
 ; CHECK-NEXT: .p2align 4, 0x90
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index b9f9a48..a4ed5fc 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -278,6 +278,8 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X64-NEXT: movq %r12, %rcx
 ; X64-NEXT: callq __divti3@PLT
 ; X64-NEXT: movq %rax, %r13
+; X64-NEXT: decq %rax
+; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
 ; X64-NEXT: testq %rbx, %rbx
 ; X64-NEXT: sets %al
 ; X64-NEXT: testq %r12, %r12
@@ -291,8 +293,7 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X64-NEXT: orq %rax, %rdx
 ; X64-NEXT: setne %al
 ; X64-NEXT: testb %bpl, %al
-; X64-NEXT: leaq -1(%r13), %rax
-; X64-NEXT: cmovneq %rax, %r13
+; X64-NEXT: cmovneq (%rsp), %r13 # 8-byte Folded Reload
 ; X64-NEXT: movq %r13, %rax
 ; X64-NEXT: addq $8, %rsp
 ; X64-NEXT: popq %rbx
-- 
2.7.4