MF.getInfo<RISCVMachineFunctionInfo>();
SDLoc DL(N);
- // We use the same frame index we use for moving two i32s into 64-bit FPR.
- // This is an analogous operation.
- int FI = FuncInfo->getMoveF64FrameIndex(MF);
+ // Create a temporary stack slot for each node being expanded.
+ SDValue StackSlot =
+ CurDAG->CreateStackTemporary(TypeSize::Fixed(8), Align(4));
+ int FI = cast<FrameIndexSDNode>(StackSlot.getNode())->getIndex();
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
- SDValue StackSlot =
- CurDAG->getFrameIndex(FI, TLI.getPointerTy(CurDAG->getDataLayout()));
SDValue Chain = CurDAG->getEntryNode();
Lo = CurDAG->getStore(Chain, DL, Lo, StackSlot, MPI, Align(8));
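// Sketch (assumed continuation, not part of the hunk above): the Hi half is
// stored into the upper 4 bytes of the same per-node slot before the value is
// reloaded as a single i64 element, roughly:
SDValue OffsetSlot =
    CurDAG->getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), DL);
Hi = CurDAG->getStore(Lo, DL, Hi, OffsetSlot, MPI.getWithOffset(4), Align(8));
// Because each expanded node now gets its own stack temporary, the RV32 tests
// below reserve a larger frame (32 or 48 bytes) and give every 8-byte splat
// constant a distinct offset instead of repeatedly rewriting 8(sp)/12(sp).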
define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
; RV32-LABEL: bitreverse_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw zero, 4(sp)
; RV32-NEXT: lui a0, 1044480
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw a0, 24(sp)
; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: sw a0, 16(sp)
; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
; RV32-NEXT: sw a0, 12(sp)
; RV32-NEXT: li a2, 40
; RV32-NEXT: vsll.vx v10, v10, a2
; RV32-NEXT: vor.vv v9, v9, v10
-; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: mv a3, sp
; RV32-NEXT: vlse64.v v10, (a3), zero
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: vand.vx v11, v8, a4
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vand.vx v11, v8, a3
; RV32-NEXT: vsll.vi v11, v11, 24
; RV32-NEXT: vand.vv v12, v8, v10
; RV32-NEXT: vsll.vi v12, v12, 8
; RV32-NEXT: vand.vx v12, v12, a1
; RV32-NEXT: vor.vv v11, v12, v11
; RV32-NEXT: vsrl.vi v12, v8, 24
-; RV32-NEXT: vand.vx v12, v12, a4
+; RV32-NEXT: vand.vx v12, v12, a3
; RV32-NEXT: vsrl.vi v8, v8, 8
; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vlse64.v v10, (a3), zero
+; RV32-NEXT: addi a0, sp, 24
+; RV32-NEXT: vlse64.v v10, (a0), zero
; RV32-NEXT: vor.vv v8, v8, v11
; RV32-NEXT: vor.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vlse64.v v10, (a3), zero
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vlse64.v v10, (a0), zero
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 2
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vlse64.v v10, (a3), zero
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 1
; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vadd.vv v8, v8, v8
; RV32-NEXT: vor.vv v8, v9, v8
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: bitreverse_nxv1i64:
define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
; RV32-LABEL: bitreverse_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw zero, 4(sp)
; RV32-NEXT: lui a0, 1044480
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw a0, 24(sp)
; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: sw a0, 16(sp)
; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
; RV32-NEXT: sw a0, 12(sp)
; RV32-NEXT: li a2, 40
; RV32-NEXT: vsll.vx v12, v12, a2
; RV32-NEXT: vor.vv v10, v10, v12
-; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: mv a3, sp
; RV32-NEXT: vlse64.v v12, (a3), zero
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: vand.vx v14, v8, a4
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vand.vx v14, v8, a3
; RV32-NEXT: vsll.vi v14, v14, 24
; RV32-NEXT: vand.vv v16, v8, v12
; RV32-NEXT: vsll.vi v16, v16, 8
; RV32-NEXT: vand.vx v16, v16, a1
; RV32-NEXT: vor.vv v14, v16, v14
; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a4
+; RV32-NEXT: vand.vx v16, v16, a3
; RV32-NEXT: vsrl.vi v8, v8, 8
; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v12, (a3), zero
+; RV32-NEXT: addi a0, sp, 24
+; RV32-NEXT: vlse64.v v12, (a0), zero
; RV32-NEXT: vor.vv v8, v8, v14
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vlse64.v v12, (a3), zero
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vlse64.v v12, (a0), zero
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 2
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vlse64.v v12, (a3), zero
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v12, (a0), zero
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 1
; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vadd.vv v8, v8, v8
; RV32-NEXT: vor.vv v8, v10, v8
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: bitreverse_nxv2i64:
define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
; RV32-LABEL: bitreverse_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw zero, 4(sp)
; RV32-NEXT: lui a0, 1044480
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw a0, 24(sp)
; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: sw a0, 16(sp)
; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
; RV32-NEXT: sw a0, 12(sp)
; RV32-NEXT: li a2, 40
; RV32-NEXT: vsll.vx v16, v16, a2
; RV32-NEXT: vor.vv v12, v12, v16
-; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: mv a3, sp
; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: vand.vx v20, v8, a4
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vand.vx v20, v8, a3
; RV32-NEXT: vsll.vi v20, v20, 24
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsll.vi v24, v24, 8
; RV32-NEXT: vand.vx v24, v24, a1
; RV32-NEXT: vor.vv v20, v24, v20
; RV32-NEXT: vsrl.vi v24, v8, 24
-; RV32-NEXT: vand.vx v24, v24, a4
+; RV32-NEXT: vand.vx v24, v24, a3
; RV32-NEXT: vsrl.vi v8, v8, 8
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: addi a0, sp, 24
+; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vor.vv v8, v8, v20
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 2
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 1
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vadd.vv v8, v8, v8
; RV32-NEXT: vor.vv v8, v12, v8
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: bitreverse_nxv4i64:
define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-LABEL: bitreverse_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT: sw zero, 20(sp)
; RV32-NEXT: lui a0, 1044480
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 16(sp)
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 44(sp)
+; RV32-NEXT: sw a0, 40(sp)
; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 36(sp)
+; RV32-NEXT: sw a0, 32(sp)
; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw a0, 24(sp)
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a0
; RV32-NEXT: li a2, 40
; RV32-NEXT: vsll.vx v24, v24, a2
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: addi a3, sp, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: addi a3, sp, 16
; RV32-NEXT: vlse64.v v24, (a3), zero
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: vand.vx v0, v8, a4
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vand.vx v0, v8, a3
; RV32-NEXT: vsll.vi v0, v0, 24
; RV32-NEXT: vand.vv v16, v8, v24
; RV32-NEXT: vsll.vi v16, v16, 8
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: addi a5, sp, 16
-; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: addi a4, sp, 48
+; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vx v0, v8, a2
; RV32-NEXT: vand.vx v0, v0, a1
; RV32-NEXT: vsrl.vx v16, v8, a0
; RV32-NEXT: vsrl.vi v16, v8, 8
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: vand.vx v8, v8, a4
+; RV32-NEXT: vand.vx v8, v8, a3
; RV32-NEXT: vor.vv v8, v16, v8
-; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: addi a0, sp, 40
+; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: addi a0, sp, 32
+; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 2
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: addi a0, sp, 24
+; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: bitreverse_nxv8i64:
define <vscale x 1 x i64> @vp_bitreverse_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw zero, 4(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsll.vx v9, v8, a2, v0.t
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a3, a1, -256
-; RV32-NEXT: vand.vx v10, v8, a3, v0.t
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: vsll.vx v10, v10, a4, v0.t
+; RV32-NEXT: vsll.vx v9, v8, a1, v0.t
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v10, v8, a2, v0.t
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: vsll.vx v10, v10, a3, v0.t
; RV32-NEXT: vor.vv v9, v9, v10, v0.t
-; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: vand.vx v10, v8, a5, v0.t
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v10, v8, a4, v0.t
; RV32-NEXT: vsll.vi v10, v10, 24, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: mv a5, sp
; RV32-NEXT: vsetvli a6, zero, e64, m1, ta, ma
-; RV32-NEXT: vlse64.v v11, (a1), zero
+; RV32-NEXT: vlse64.v v11, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v12, v8, v11, v0.t
; RV32-NEXT: vsll.vi v12, v12, 8, v0.t
; RV32-NEXT: vor.vv v10, v10, v12, v0.t
; RV32-NEXT: vor.vv v9, v9, v10, v0.t
-; RV32-NEXT: vsrl.vx v10, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v12, v8, a4, v0.t
-; RV32-NEXT: vand.vx v12, v12, a3, v0.t
+; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t
+; RV32-NEXT: vsrl.vx v12, v8, a3, v0.t
+; RV32-NEXT: vand.vx v12, v12, a2, v0.t
; RV32-NEXT: vor.vv v10, v12, v10, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT: vand.vx v12, v12, a5, v0.t
+; RV32-NEXT: vand.vx v12, v12, a4, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT: vand.vv v8, v8, v11, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
; RV32-NEXT: vor.vv v8, v8, v10, v0.t
; RV32-NEXT: vor.vv v8, v9, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 4, v0.t
; RV32-NEXT: vor.vv v8, v9, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v9, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
; RV32-NEXT: vsll.vi v8, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v9, v8, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv1i64:
define <vscale x 1 x i64> @vp_bitreverse_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv1i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw zero, 4(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a4, 4080
; RV32-NEXT: vand.vx v10, v8, a4
; RV32-NEXT: vsll.vi v10, v10, 24
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: mv a5, sp
; RV32-NEXT: vsetvli a6, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v11, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT: vlse64.v v10, (a5), zero
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT: vlse64.v v10, (a5), zero
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT: vlse64.v v10, (a5), zero
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vadd.vv v8, v8, v8
; RV32-NEXT: vor.vv v8, v9, v8
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv1i64_unmasked:
define <vscale x 2 x i64> @vp_bitreverse_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw zero, 4(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsll.vx v10, v8, a2, v0.t
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a3, a1, -256
-; RV32-NEXT: vand.vx v12, v8, a3, v0.t
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: vsll.vx v12, v12, a4, v0.t
+; RV32-NEXT: vsll.vx v10, v8, a1, v0.t
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v12, v8, a2, v0.t
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: vsll.vx v12, v12, a3, v0.t
; RV32-NEXT: vor.vv v10, v10, v12, v0.t
-; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: vand.vx v12, v8, a5, v0.t
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v12, v8, a4, v0.t
; RV32-NEXT: vsll.vi v12, v12, 24, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: mv a5, sp
; RV32-NEXT: vsetvli a6, zero, e64, m2, ta, ma
-; RV32-NEXT: vlse64.v v14, (a1), zero
+; RV32-NEXT: vlse64.v v14, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v16, v8, v14, v0.t
; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
; RV32-NEXT: vor.vv v12, v12, v16, v0.t
; RV32-NEXT: vor.vv v10, v10, v12, v0.t
-; RV32-NEXT: vsrl.vx v12, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v16, v8, a4, v0.t
-; RV32-NEXT: vand.vx v16, v16, a3, v0.t
+; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t
+; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT: vand.vx v16, v16, a2, v0.t
; RV32-NEXT: vor.vv v12, v16, v12, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 24, v0.t
-; RV32-NEXT: vand.vx v16, v16, a5, v0.t
+; RV32-NEXT: vand.vx v16, v16, a4, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT: vand.vv v8, v8, v14, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
; RV32-NEXT: vor.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 4, v0.t
; RV32-NEXT: vor.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
; RV32-NEXT: vsll.vi v8, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v10, v8, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv2i64:
define <vscale x 2 x i64> @vp_bitreverse_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw zero, 4(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a4, 4080
; RV32-NEXT: vand.vx v12, v8, a4
; RV32-NEXT: vsll.vi v12, v12, 24
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: mv a5, sp
; RV32-NEXT: vsetvli a6, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v14, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT: vlse64.v v12, (a5), zero
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 2
-; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT: vlse64.v v12, (a5), zero
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT: vlse64.v v12, (a5), zero
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vadd.vv v8, v8, v8
; RV32-NEXT: vor.vv v8, v10, v8
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv2i64_unmasked:
define <vscale x 4 x i64> @vp_bitreverse_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw zero, 4(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsll.vx v12, v8, a2, v0.t
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a3, a1, -256
-; RV32-NEXT: vand.vx v16, v8, a3, v0.t
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: vsll.vx v16, v16, a4, v0.t
+; RV32-NEXT: vsll.vx v12, v8, a1, v0.t
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v16, v8, a2, v0.t
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: vsll.vx v16, v16, a3, v0.t
; RV32-NEXT: vor.vv v16, v12, v16, v0.t
-; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: vand.vx v12, v8, a5, v0.t
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v12, v8, a4, v0.t
; RV32-NEXT: vsll.vi v20, v12, 24, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: mv a5, sp
; RV32-NEXT: vsetvli a6, zero, e64, m4, ta, ma
-; RV32-NEXT: vlse64.v v12, (a1), zero
+; RV32-NEXT: vlse64.v v12, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v24, v8, v12, v0.t
; RV32-NEXT: vsll.vi v24, v24, 8, v0.t
; RV32-NEXT: vor.vv v20, v20, v24, v0.t
; RV32-NEXT: vor.vv v16, v16, v20, v0.t
-; RV32-NEXT: vsrl.vx v20, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT: vand.vx v24, v24, a3, v0.t
+; RV32-NEXT: vsrl.vx v20, v8, a1, v0.t
+; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT: vand.vx v24, v24, a2, v0.t
; RV32-NEXT: vor.vv v20, v24, v20, v0.t
; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a5, v0.t
+; RV32-NEXT: vand.vx v24, v24, a4, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
; RV32-NEXT: vor.vv v8, v8, v24, v0.t
; RV32-NEXT: vor.vv v8, v8, v20, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 4, v0.t
; RV32-NEXT: vor.vv v8, v12, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v12, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vsll.vi v8, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v12, v8, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv4i64:
define <vscale x 4 x i64> @vp_bitreverse_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw zero, 4(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a4, 4080
; RV32-NEXT: vand.vx v16, v8, a4
; RV32-NEXT: vsll.vi v16, v16, 24
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: mv a5, sp
; RV32-NEXT: vsetvli a6, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v20, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 2
-; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vadd.vv v8, v8, v8
; RV32-NEXT: vor.vv v8, v12, v8
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv4i64_unmasked:
define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv7i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT: sw zero, 20(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a3, a1, -256
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
+; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v24, v8, a2, v0.t
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: vsll.vx v24, v24, a3, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: vand.vx v16, v8, a5, v0.t
+; RV32-NEXT: addi a4, sp, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v16, v8, a4, v0.t
; RV32-NEXT: vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 48
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: addi a5, sp, 16
; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: csrr a6, vlenb
-; RV32-NEXT: slli a6, a6, 3
-; RV32-NEXT: add a6, sp, a6
-; RV32-NEXT: addi a6, a6, 16
-; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 3
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 48
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT: csrr a6, vlenb
-; RV32-NEXT: slli a6, a6, 4
-; RV32-NEXT: add a6, sp, a6
-; RV32-NEXT: addi a6, a6, 16
-; RV32-NEXT: vl8r.v v24, (a6) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 48
+; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: addi a6, sp, 16
-; RV32-NEXT: vl8r.v v24, (a6) # Unknown-size Folded Reload
+; RV32-NEXT: addi a5, sp, 48
+; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a6, vlenb
-; RV32-NEXT: slli a6, a6, 4
-; RV32-NEXT: add a6, sp, a6
-; RV32-NEXT: addi a6, a6, 16
-; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v16, v8, a4, v0.t
-; RV32-NEXT: vand.vx v16, v16, a3, v0.t
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 48
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT: vand.vx v16, v16, a2, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: addi a1, sp, 48
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a5, v0.t
+; RV32-NEXT: vand.vx v24, v24, a4, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vor.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: addi a1, sp, 48
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: addi a1, sp, 40
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 4, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT: addi a1, sp, 32
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv7i64:
define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv7i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT: sw zero, 20(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a1
; RV32-NEXT: li a3, 40
; RV32-NEXT: vsll.vx v24, v24, a3
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: lui a4, 4080
; RV32-NEXT: vand.vx v16, v8, a4
; RV32-NEXT: vsll.vi v0, v16, 24
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: addi a5, sp, 16
; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsll.vi v24, v24, 8
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a6, sp, 16
-; RV32-NEXT: vl8r.v v0, (a6) # Unknown-size Folded Reload
+; RV32-NEXT: addi a5, sp, 48
+; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vx v0, v8, a3
; RV32-NEXT: vand.vx v0, v0, a2
; RV32-NEXT: vsrl.vx v24, v8, a1
; RV32-NEXT: vand.vx v8, v8, a4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: addi a1, sp, 48
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a5), zero
+; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a5), zero
+; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a5), zero
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv7i64_unmasked:
define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT: sw zero, 20(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a3, a1, -256
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
+; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v24, v8, a2, v0.t
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: vsll.vx v24, v24, a3, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: vand.vx v16, v8, a5, v0.t
+; RV32-NEXT: addi a4, sp, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v16, v8, a4, v0.t
; RV32-NEXT: vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 48
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: addi a5, sp, 16
; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: csrr a6, vlenb
-; RV32-NEXT: slli a6, a6, 3
-; RV32-NEXT: add a6, sp, a6
-; RV32-NEXT: addi a6, a6, 16
-; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 3
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 48
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT: csrr a6, vlenb
-; RV32-NEXT: slli a6, a6, 4
-; RV32-NEXT: add a6, sp, a6
-; RV32-NEXT: addi a6, a6, 16
-; RV32-NEXT: vl8r.v v24, (a6) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 48
+; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: addi a6, sp, 16
-; RV32-NEXT: vl8r.v v24, (a6) # Unknown-size Folded Reload
+; RV32-NEXT: addi a5, sp, 48
+; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a6, vlenb
-; RV32-NEXT: slli a6, a6, 4
-; RV32-NEXT: add a6, sp, a6
-; RV32-NEXT: addi a6, a6, 16
-; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v16, v8, a4, v0.t
-; RV32-NEXT: vand.vx v16, v16, a3, v0.t
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 48
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT: vand.vx v16, v16, a2, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: addi a1, sp, 48
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a5, v0.t
+; RV32-NEXT: vand.vx v24, v24, a4, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vor.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: addi a1, sp, 48
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: addi a1, sp, 40
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 4, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT: addi a1, sp, 32
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv8i64:
define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_nxv8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT: sw zero, 20(sp)
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a1
; RV32-NEXT: li a3, 40
; RV32-NEXT: vsll.vx v24, v24, a3
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: lui a4, 4080
; RV32-NEXT: vand.vx v16, v8, a4
; RV32-NEXT: vsll.vi v0, v16, 24
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: addi a5, sp, 16
; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsll.vi v24, v24, 8
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a6, sp, 16
-; RV32-NEXT: vl8r.v v0, (a6) # Unknown-size Folded Reload
+; RV32-NEXT: addi a5, sp, 48
+; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vx v0, v8, a3
; RV32-NEXT: vand.vx v0, v0, a2
; RV32-NEXT: vsrl.vx v24, v8, a1
; RV32-NEXT: vand.vx v8, v8, a4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: addi a1, sp, 48
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a5), zero
+; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a5), zero
+; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a5), zero
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bitreverse_nxv8i64_unmasked:
define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-LABEL: ctlz_nxv1i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; RV32I-NEXT: vsrl.vi v9, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v9
; RV32I-NEXT: li a0, 32
; RV32I-NEXT: vsrl.vx v9, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v9
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v9, (a0), zero
; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 24
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vsrl.vi v11, v8, 1
; RV32I-NEXT: vand.vv v9, v11, v9
; RV32I-NEXT: vsrl.vi v8, v8, 2
; RV32I-NEXT: vand.vv v8, v8, v10
; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vsrl.vi v11, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v11
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_nxv1i64:
define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-LABEL: ctlz_nxv2i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; RV32I-NEXT: vsrl.vi v10, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v10
; RV32I-NEXT: li a0, 32
; RV32I-NEXT: vsrl.vx v10, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v10
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 24
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vsrl.vi v14, v8, 1
; RV32I-NEXT: vand.vv v10, v14, v10
; RV32I-NEXT: vsrl.vi v8, v8, 2
; RV32I-NEXT: vand.vv v8, v8, v12
; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vsrl.vi v14, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v14
; RV32I-NEXT: vmul.vv v8, v8, v12
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_nxv2i64:
define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-LABEL: ctlz_nxv4i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; RV32I-NEXT: vsrl.vi v12, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v12
; RV32I-NEXT: li a0, 32
; RV32I-NEXT: vsrl.vx v12, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v12
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 24
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vsrl.vi v20, v8, 1
; RV32I-NEXT: vand.vv v12, v20, v12
; RV32I-NEXT: vsrl.vi v8, v8, 2
; RV32I-NEXT: vand.vv v8, v8, v16
; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vsrl.vi v20, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v20
; RV32I-NEXT: vmul.vv v8, v8, v16
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_nxv4i64:
define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-LABEL: ctlz_nxv8i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; RV32I-NEXT: vsrl.vi v16, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v16
; RV32I-NEXT: li a0, 32
; RV32I-NEXT: vsrl.vx v16, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v16
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 24
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vsrl.vi v0, v8, 1
; RV32I-NEXT: vand.vv v16, v0, v16
; RV32I-NEXT: vsrl.vi v8, v8, 2
; RV32I-NEXT: vand.vv v8, v8, v24
; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vsrl.vi v0, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v0
; RV32I-NEXT: vmul.vv v8, v8, v24
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_nxv8i64:
define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-LABEL: ctlz_zero_undef_nxv1i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; RV32I-NEXT: vsrl.vi v9, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v9
; RV32I-NEXT: li a0, 32
; RV32I-NEXT: vsrl.vx v9, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v9
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v9, (a0), zero
; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 24
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vsrl.vi v11, v8, 1
; RV32I-NEXT: vand.vv v9, v11, v9
; RV32I-NEXT: vsrl.vi v8, v8, 2
; RV32I-NEXT: vand.vv v8, v8, v10
; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vsrl.vi v11, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v11
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_zero_undef_nxv1i64:
define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-LABEL: ctlz_zero_undef_nxv2i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; RV32I-NEXT: vsrl.vi v10, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v10
; RV32I-NEXT: li a0, 32
; RV32I-NEXT: vsrl.vx v10, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v10
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 24
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vsrl.vi v14, v8, 1
; RV32I-NEXT: vand.vv v10, v14, v10
; RV32I-NEXT: vsrl.vi v8, v8, 2
; RV32I-NEXT: vand.vv v8, v8, v12
; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vsrl.vi v14, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v14
; RV32I-NEXT: vmul.vv v8, v8, v12
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_zero_undef_nxv2i64:
define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-LABEL: ctlz_zero_undef_nxv4i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; RV32I-NEXT: vsrl.vi v12, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v12
; RV32I-NEXT: li a0, 32
; RV32I-NEXT: vsrl.vx v12, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v12
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 24
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vsrl.vi v20, v8, 1
; RV32I-NEXT: vand.vv v12, v20, v12
; RV32I-NEXT: vsrl.vi v8, v8, 2
; RV32I-NEXT: vand.vv v8, v8, v16
; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vsrl.vi v20, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v20
; RV32I-NEXT: vmul.vv v8, v8, v16
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_zero_undef_nxv4i64:
define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-LABEL: ctlz_zero_undef_nxv8i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; RV32I-NEXT: vsrl.vi v16, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v16
; RV32I-NEXT: li a0, 32
; RV32I-NEXT: vsrl.vx v16, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v16
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 24
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vsrl.vi v0, v8, 1
; RV32I-NEXT: vand.vv v16, v0, v16
; RV32I-NEXT: vsrl.vi v8, v8, 2
; RV32I-NEXT: vand.vv v8, v8, v24
; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vsrl.vi v0, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v0
; RV32I-NEXT: vmul.vv v8, v8, v24
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_zero_undef_nxv8i64:
define <vscale x 1 x i64> @vp_ctlz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v9, v0.t
; RV32-NEXT: vor.vv v8, v8, v9, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv1i64:
define <vscale x 1 x i64> @vp_ctlz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv1i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsrl.vi v9, v8, 1
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vsub.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv1i64_unmasked:
define <vscale x 2 x i64> @vp_ctlz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v10, v0.t
; RV32-NEXT: vor.vv v8, v8, v10, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv2i64:
define <vscale x 2 x i64> @vp_ctlz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsrl.vi v10, v8, 1
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vsub.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv2i64_unmasked:
define <vscale x 4 x i64> @vp_ctlz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv4i64:
define <vscale x 4 x i64> @vp_ctlz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsrl.vi v12, v8, 1
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vsub.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv4i64_unmasked:
define <vscale x 7 x i64> @vp_ctlz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv7i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv7i64:
define <vscale x 7 x i64> @vp_ctlz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv7i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv7i64_unmasked:
define <vscale x 8 x i64> @vp_ctlz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv8i64:
define <vscale x 8 x i64> @vp_ctlz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv8i64_unmasked:
define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 40 * vlenb
; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: srli a2, a1, 3
; RV32-NEXT: vslidedown.vx v0, v0, a2
; RV32-NEXT: lui a2, 349525
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 44(sp)
+; RV32-NEXT: sw a2, 40(sp)
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: sw a2, 32(sp)
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 28(sp)
+; RV32-NEXT: sw a2, 24(sp)
; RV32-NEXT: lui a2, 4112
; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 20(sp)
+; RV32-NEXT: sw a2, 16(sp)
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 5
; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: addi a4, a4, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
-; RV32-NEXT: addi a4, sp, 8
+; RV32-NEXT: addi a4, sp, 40
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: addi a4, sp, 24
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
+; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v16, v24, v16, v0.t
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsrl.vx v16, v16, a3, v0.t
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB46_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv16i64:
define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_nxv16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: vor.vv v16, v16, v24
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vsrl.vi v24, v16, 1
-; RV32-NEXT: addi a4, sp, 8
+; RV32-NEXT: addi a4, sp, 40
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v0, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v24, v16
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 24
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v24, v24, v16
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsrl.vx v16, v24, a3
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB47_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v8, v8, a3
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_nxv16i64_unmasked:
define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v9, v0.t
; RV32-NEXT: vor.vv v8, v8, v9, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv1i64:
define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv1i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsrl.vi v9, v8, 1
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vsub.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv1i64_unmasked:
define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v10, v0.t
; RV32-NEXT: vor.vv v8, v8, v10, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv2i64:
define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsrl.vi v10, v8, 1
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vsub.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv2i64_unmasked:
define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv4i64:
define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsrl.vi v12, v8, 1
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vsub.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv4i64_unmasked:
define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv7i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv7i64:
define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv7i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv7i64_unmasked:
define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv8i64:
define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv8i64_unmasked:
define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 40 * vlenb
; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: srli a2, a1, 3
; RV32-NEXT: vslidedown.vx v0, v0, a2
; RV32-NEXT: lui a2, 349525
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 44(sp)
+; RV32-NEXT: sw a2, 40(sp)
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: sw a2, 32(sp)
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 28(sp)
+; RV32-NEXT: sw a2, 24(sp)
; RV32-NEXT: lui a2, 4112
; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 20(sp)
+; RV32-NEXT: sw a2, 16(sp)
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 5
; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: addi a4, a4, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
-; RV32-NEXT: addi a4, sp, 8
+; RV32-NEXT: addi a4, sp, 40
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: addi a4, sp, 24
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
+; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v16, v24, v16, v0.t
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsrl.vx v16, v16, a3, v0.t
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB94_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv16i64:
define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_nxv16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: vor.vv v16, v16, v24
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vsrl.vi v24, v16, 1
-; RV32-NEXT: addi a4, sp, 8
+; RV32-NEXT: addi a4, sp, 40
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v0, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v24, v16
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 24
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v24, v24, v16
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsrl.vx v16, v24, a3
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB95_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v8, v8, a3
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_nxv16i64_unmasked:
define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
; RV32-LABEL: ctpop_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw a0, 24(sp)
; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: sw a0, 16(sp)
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: sw a0, 12(sp)
; RV32-NEXT: sw a0, 8(sp)
; RV32-NEXT: lui a0, 4112
; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 4(sp)
+; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: addi a0, sp, 24
; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vlse64.v v10, (a0), zero
; RV32-NEXT: vsrl.vi v11, v8, 1
; RV32-NEXT: vand.vv v9, v11, v9
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vadd.vv v8, v9, v8
+; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v10, (a0), zero
; RV32-NEXT: vsrl.vi v11, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v11
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: ctpop_nxv1i64:
define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
; RV32-LABEL: ctpop_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw a0, 24(sp)
; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: sw a0, 16(sp)
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: sw a0, 12(sp)
; RV32-NEXT: sw a0, 8(sp)
; RV32-NEXT: lui a0, 4112
; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 4(sp)
+; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: addi a0, sp, 24
; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vlse64.v v12, (a0), zero
; RV32-NEXT: vsrl.vi v14, v8, 1
; RV32-NEXT: vand.vv v10, v14, v10
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vadd.vv v8, v10, v8
+; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v12, (a0), zero
; RV32-NEXT: vsrl.vi v14, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v14
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: ctpop_nxv2i64:
define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
; RV32-LABEL: ctpop_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw a0, 24(sp)
; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: sw a0, 16(sp)
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: sw a0, 12(sp)
; RV32-NEXT: sw a0, 8(sp)
; RV32-NEXT: lui a0, 4112
; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 4(sp)
+; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: addi a0, sp, 24
; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vsrl.vi v20, v8, 1
; RV32-NEXT: vand.vv v12, v20, v12
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vadd.vv v8, v12, v8
+; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vsrl.vi v20, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v20
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: ctpop_nxv4i64:
define <vscale x 8 x i64> @ctpop_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-LABEL: ctpop_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw a0, 24(sp)
; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: sw a0, 16(sp)
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: sw a0, 12(sp)
; RV32-NEXT: sw a0, 8(sp)
; RV32-NEXT: lui a0, 4112
; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw a0, 4(sp)
+; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: addi a0, sp, 24
; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vlse64.v v24, (a0), zero
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v16, v0, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v24, (a0), zero
; RV32-NEXT: vsrl.vi v0, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: ctpop_nxv8i64:
define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv1i64:
define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv1i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vsub.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv1i64_unmasked:
define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv2i64:
define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vsub.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv2i64_unmasked:
define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv4i64:
define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vsub.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv4i64_unmasked:
define <vscale x 7 x i64> @vp_ctpop_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv7i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv7i64:
define <vscale x 7 x i64> @vp_ctpop_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv7i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv7i64_unmasked:
define <vscale x 8 x i64> @vp_ctpop_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv8i64:
define <vscale x 8 x i64> @vp_ctpop_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv8i64_unmasked:
define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 40
+; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: addi a1, a1, 48
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: srli a2, a1, 3
; RV32-NEXT: vslidedown.vx v0, v0, a2
; RV32-NEXT: lui a2, 349525
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 44(sp)
+; RV32-NEXT: sw a2, 40(sp)
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: sw a2, 32(sp)
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 28(sp)
+; RV32-NEXT: sw a2, 24(sp)
; RV32-NEXT: lui a2, 4112
; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 20(sp)
+; RV32-NEXT: sw a2, 16(sp)
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
-; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, sp, 40
; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, sp, 32
; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: addi a3, sp, 24
; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
+; RV32-NEXT: addi a3, sp, 16
; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: addi a3, sp, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v16, v24, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB46_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: .LBB46_2:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv16i64:
define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v24, v16, 1
-; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: addi a3, sp, 40
; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a3), zero
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: addi a3, sp, 32
; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a3), zero
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v24, v16
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v24, v16, v24
+; RV32-NEXT: addi a3, sp, 24
; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: addi a3, sp, 16
; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a3), zero
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v16, v16, a2
-; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: addi a3, sp, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB47_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v8, v8, a2
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv16i64_unmasked:
define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-LABEL: cttz_nxv1i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: li a0, 1
; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV32I-NEXT: vsub.vx v9, v8, a0
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: addi a0, sp, 24
; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vsrl.vi v11, v8, 1
-; RV32I-NEXT: vand.vv v10, v11, v10
-; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vand.vv v9, v11, v9
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v10
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vsrl.vi v11, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v11
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_nxv1i64:
define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-LABEL: cttz_nxv2i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: li a0, 1
; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV32I-NEXT: vsub.vx v10, v8, a0
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: addi a0, sp, 24
; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vsrl.vi v14, v8, 1
-; RV32I-NEXT: vand.vv v12, v14, v12
-; RV32I-NEXT: vsub.vv v8, v8, v12
-; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vand.vv v10, v14, v10
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v12
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vsrl.vi v14, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v14
; RV32I-NEXT: vmul.vv v8, v8, v12
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_nxv2i64:
define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-LABEL: cttz_nxv4i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: li a0, 1
; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV32I-NEXT: vsub.vx v12, v8, a0
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: addi a0, sp, 24
; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vsrl.vi v20, v8, 1
-; RV32I-NEXT: vand.vv v16, v20, v16
-; RV32I-NEXT: vsub.vv v8, v8, v16
-; RV32I-NEXT: vand.vv v16, v8, v12
+; RV32I-NEXT: vand.vv v12, v20, v12
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v16
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vsrl.vi v20, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v20
; RV32I-NEXT: vmul.vv v8, v8, v16
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_nxv4i64:
define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-LABEL: cttz_nxv8i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: li a0, 1
; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32I-NEXT: vsub.vx v16, v8, a0
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: addi a0, sp, 24
; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vsrl.vi v0, v8, 1
-; RV32I-NEXT: vand.vv v24, v0, v24
-; RV32I-NEXT: vsub.vv v8, v8, v24
-; RV32I-NEXT: vand.vv v24, v8, v16
+; RV32I-NEXT: vand.vv v16, v0, v16
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: vand.vv v16, v8, v24
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: vadd.vv v8, v24, v8
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vsrl.vi v0, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v0
; RV32I-NEXT: vmul.vv v8, v8, v24
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_nxv8i64:
define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-LABEL: cttz_zero_undef_nxv1i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: li a0, 1
; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV32I-NEXT: vsub.vx v9, v8, a0
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: addi a0, sp, 24
; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vsrl.vi v11, v8, 1
-; RV32I-NEXT: vand.vv v10, v11, v10
-; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vand.vv v9, v11, v9
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v10
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v10, (a0), zero
; RV32I-NEXT: vsrl.vi v11, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v11
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_zero_undef_nxv1i64:
define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-LABEL: cttz_zero_undef_nxv2i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: li a0, 1
; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV32I-NEXT: vsub.vx v10, v8, a0
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: addi a0, sp, 24
; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vsrl.vi v14, v8, 1
-; RV32I-NEXT: vand.vv v12, v14, v12
-; RV32I-NEXT: vsub.vv v8, v8, v12
-; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vand.vv v10, v14, v10
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v12
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v12, (a0), zero
; RV32I-NEXT: vsrl.vi v14, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v14
; RV32I-NEXT: vmul.vv v8, v8, v12
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_zero_undef_nxv2i64:
define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-LABEL: cttz_zero_undef_nxv4i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: li a0, 1
; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV32I-NEXT: vsub.vx v12, v8, a0
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: addi a0, sp, 24
; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vsrl.vi v20, v8, 1
-; RV32I-NEXT: vand.vv v16, v20, v16
-; RV32I-NEXT: vsub.vv v8, v8, v16
-; RV32I-NEXT: vand.vv v16, v8, v12
+; RV32I-NEXT: vand.vv v12, v20, v12
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v16
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v16, (a0), zero
; RV32I-NEXT: vsrl.vi v20, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v20
; RV32I-NEXT: vmul.vv v8, v8, v16
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_zero_undef_nxv4i64:
define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-LABEL: cttz_zero_undef_nxv8i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: .cfi_def_cfa_offset 32
; RV32I-NEXT: lui a0, 349525
; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: lui a0, 209715
; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: sw a0, 8(sp)
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: sw a0, 4(sp)
+; RV32I-NEXT: sw a0, 0(sp)
; RV32I-NEXT: li a0, 1
; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32I-NEXT: vsub.vx v16, v8, a0
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: addi a0, sp, 8
-; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: addi a0, sp, 24
; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vsrl.vi v0, v8, 1
-; RV32I-NEXT: vand.vv v24, v0, v24
-; RV32I-NEXT: vsub.vv v8, v8, v24
-; RV32I-NEXT: vand.vv v24, v8, v16
+; RV32I-NEXT: vand.vv v16, v0, v16
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: vand.vv v16, v8, v24
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: vadd.vv v8, v24, v8
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: addi a0, sp, 8
; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: mv a0, sp
; RV32I-NEXT: vlse64.v v24, (a0), zero
; RV32I-NEXT: vsrl.vi v0, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v0
; RV32I-NEXT: vmul.vv v8, v8, v24
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_zero_undef_nxv8i64:
define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsub.vx v9, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv1i64:
define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv1i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsub.vx v9, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v9
; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vsub.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv1i64_unmasked:
define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsub.vx v10, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv2i64:
define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsub.vx v10, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vsub.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv2i64_unmasked:
define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsub.vx v12, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv4i64:
define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsub.vx v12, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vsub.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv4i64_unmasked:
define <vscale x 7 x i64> @vp_cttz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv7i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv7i64:
define <vscale x 7 x i64> @vp_cttz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv7i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv7i64_unmasked:
define <vscale x 8 x i64> @vp_cttz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv8i64:
define <vscale x 8 x i64> @vp_cttz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv8i64_unmasked:
define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 40 * vlenb
; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: srli a2, a1, 3
; RV32-NEXT: vslidedown.vx v0, v0, a2
; RV32-NEXT: lui a2, 349525
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 44(sp)
+; RV32-NEXT: sw a2, 40(sp)
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: sw a2, 32(sp)
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 28(sp)
+; RV32-NEXT: sw a2, 24(sp)
; RV32-NEXT: lui a2, 4112
; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 20(sp)
+; RV32-NEXT: sw a2, 16(sp)
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: vnot.v v16, v16, v0.t
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: addi a4, a4, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
-; RV32-NEXT: addi a4, sp, 8
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: addi a4, sp, 40
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v24, v24, v16, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: addi a4, sp, 24
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
+; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v16, v24, v16, v0.t
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsrl.vx v16, v16, a3, v0.t
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB46_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv16i64:
define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsrl.vi v24, v16, 1
-; RV32-NEXT: addi a4, sp, 8
+; RV32-NEXT: addi a4, sp, 40
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v0, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v24, v16
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 24
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v24, v24, v16
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsrl.vx v16, v24, a3
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB47_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v8, v8, a3
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv16i64_unmasked:
define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsub.vx v9, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv1i64:
define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv1i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsub.vx v9, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v9
; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vsub.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vadd.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv1i64_unmasked:
define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsub.vx v10, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv2i64:
define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsub.vx v10, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vsub.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv2i64_unmasked:
define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsub.vx v12, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv4i64:
define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsub.vx v12, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vsub.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v12
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv4i64_unmasked:
define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv7i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv7i64:
define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv7i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv7i64_unmasked:
define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv8i64:
define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv8i64_unmasked:
define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 40 * vlenb
; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: srli a2, a1, 3
; RV32-NEXT: vslidedown.vx v0, v0, a2
; RV32-NEXT: lui a2, 349525
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 44(sp)
+; RV32-NEXT: sw a2, 40(sp)
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: sw a2, 32(sp)
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 28(sp)
+; RV32-NEXT: sw a2, 24(sp)
; RV32-NEXT: lui a2, 4112
; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a2, 20(sp)
+; RV32-NEXT: sw a2, 16(sp)
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: vnot.v v16, v16, v0.t
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: addi a4, a4, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
-; RV32-NEXT: addi a4, sp, 8
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: addi a4, sp, 40
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 5
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v24, v24, v16, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: addi a4, sp, 24
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v16, v0.t
+; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v16, v24, v16, v0.t
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsrl.vx v16, v16, a3, v0.t
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB94_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv16i64:
define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_nxv16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsrl.vi v24, v16, 1
-; RV32-NEXT: addi a4, sp, 8
+; RV32-NEXT: addi a4, sp, 40
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v0, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v24, v16
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 24
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a4), zero
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vmul.vv v24, v24, v16
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsrl.vx v16, v24, a3
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB95_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v8, v8, a3
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_nxv16i64_unmasked:
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+
+declare <vscale x 2 x i64> @llvm.bitreverse.nxv2i64(<vscale x 2 x i64>)
+
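+; The checks below show the i64 splat constants of the bitreverse expansion being
+; built through the stack on RV32: each expanded splat is stored to its own pair of
+; word slots (sp+0/4, 8/12, 16/20, 24/28) and reloaded with vlse64 from that slot,
+; rather than all reusing a single shared 8-byte temporary.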
+define i32 @splat_vector_split_i64() {
+; CHECK-LABEL: splat_vector_split_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: li a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a0
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 3
+; CHECK-NEXT: sw zero, 4(sp)
+; CHECK-NEXT: lui a0, 1044480
+; CHECK-NEXT: sw a0, 0(sp)
+; CHECK-NEXT: lui a0, 61681
+; CHECK-NEXT: addi a0, a0, -241
+; CHECK-NEXT: sw a0, 28(sp)
+; CHECK-NEXT: sw a0, 24(sp)
+; CHECK-NEXT: lui a0, 209715
+; CHECK-NEXT: addi a0, a0, 819
+; CHECK-NEXT: sw a0, 20(sp)
+; CHECK-NEXT: sw a0, 16(sp)
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addi a0, a0, 1365
+; CHECK-NEXT: sw a0, 12(sp)
+; CHECK-NEXT: sw a0, 8(sp)
+; CHECK-NEXT: li a0, 56
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vsrl.vx v10, v8, a0
+; CHECK-NEXT: li a1, 40
+; CHECK-NEXT: vsrl.vx v12, v8, a1
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: addi a2, a2, -256
+; CHECK-NEXT: vand.vx v12, v12, a2
+; CHECK-NEXT: vor.vv v10, v12, v10
+; CHECK-NEXT: vsrl.vi v12, v8, 24
+; CHECK-NEXT: mv a3, sp
+; CHECK-NEXT: vlse64.v v14, (a3), zero
+; CHECK-NEXT: lui a3, 4080
+; CHECK-NEXT: vand.vx v12, v12, a3
+; CHECK-NEXT: vsrl.vi v16, v8, 8
+; CHECK-NEXT: vand.vv v16, v16, v14
+; CHECK-NEXT: vor.vv v12, v16, v12
+; CHECK-NEXT: vor.vv v10, v12, v10
+; CHECK-NEXT: vand.vv v12, v8, v14
+; CHECK-NEXT: vsll.vi v12, v12, 8
+; CHECK-NEXT: vand.vx v14, v8, a3
+; CHECK-NEXT: vsll.vi v14, v14, 24
+; CHECK-NEXT: vor.vv v12, v14, v12
+; CHECK-NEXT: vsll.vx v14, v8, a0
+; CHECK-NEXT: vand.vx v8, v8, a2
+; CHECK-NEXT: vsll.vx v8, v8, a1
+; CHECK-NEXT: vor.vv v8, v14, v8
+; CHECK-NEXT: addi a0, sp, 24
+; CHECK-NEXT: vlse64.v v14, (a0), zero
+; CHECK-NEXT: vor.vv v8, v8, v12
+; CHECK-NEXT: vor.vv v8, v8, v10
+; CHECK-NEXT: vsrl.vi v10, v8, 4
+; CHECK-NEXT: vand.vv v10, v10, v14
+; CHECK-NEXT: vand.vv v8, v8, v14
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vlse64.v v12, (a0), zero
+; CHECK-NEXT: vsll.vi v8, v8, 4
+; CHECK-NEXT: vor.vv v8, v10, v8
+; CHECK-NEXT: vsrl.vi v10, v8, 2
+; CHECK-NEXT: vand.vv v10, v10, v12
+; CHECK-NEXT: vand.vv v8, v8, v12
+; CHECK-NEXT: addi a0, sp, 8
+; CHECK-NEXT: vlse64.v v12, (a0), zero
+; CHECK-NEXT: vsll.vi v8, v8, 2
+; CHECK-NEXT: vor.vv v8, v10, v8
+; CHECK-NEXT: vsrl.vi v10, v8, 1
+; CHECK-NEXT: vand.vv v10, v10, v12
+; CHECK-NEXT: vand.vv v8, v8, v12
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: vor.vv v8, v10, v8
+; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 3
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: addi sp, sp, 32
+; CHECK-NEXT: ret
+ %1 = insertelement <vscale x 2 x i64> zeroinitializer, i64 3, i64 3
+ %2 = tail call <vscale x 2 x i64> @llvm.bitreverse.nxv2i64(<vscale x 2 x i64> %1)
+ %3 = extractelement <vscale x 2 x i64> %2, i32 3
+ %4 = trunc i64 %3 to i32
+ ret i32 %4
+}
; RV32-NEXT: lui a1, 11557
; RV32-NEXT: addi a1, a1, -683
; RV32-NEXT: mul a1, a0, a1
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: srli a0, a0, 3
; RV32-NEXT: li a1, 62
; RV32-NEXT: mul a1, a0, a1
; RV32-NEXT: addi a2, a2, -1368
; RV32-NEXT: mulhu a0, a0, a2
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 4(sp)
; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vid.v v24
; RV32-NEXT: vmul.vv v8, v24, v8
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: sw a3, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: sw a3, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: sw a3, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: sw a3, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: sw a3, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: sw a3, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vxor.vv v8, v8, v16
; RV32-NEXT: addi sp, sp, 16