From d8f9eaad89fa895bf665564310a803f7fa168d81 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 30 Nov 2021 18:40:57 -0800
Subject: [PATCH] [RISCV] Teach RISCVTargetLowering::shouldSinkOperands to
 handle udiv/sdiv/urem/srem.

The V extension supports .vx instructions for integer division and
remainder, so we should sink splats for that operand.
---
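Notes below the fold (commentary only; git am ignores text between the
"---" marker and the diff). A sketch of the motivation, abridged from the
new sink_splat_udiv test added below:

  entry:
    %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
    %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
    br label %vector.body
  vector.body:
    ...
    %2 = udiv <4 x i32> %wide.load, %broadcast.splat

The divisor splat is loop-invariant, so it sits in the preheader; since
SelectionDAG works one basic block at a time, without this change it sees
only a generic vector divide inside the loop and emits the .vv form on a
splat materialized in a vector register. Returning true for operand 1 of
udiv/sdiv/urem/srem lets CodeGenPrepare sink the insertelement/
shufflevector pair next to the use, so ISel sees the splat and the divide
together and folds the scalar operand into vdivu.vx v8, v8, a1 (and
likewise vdiv.vx/vremu.vx/vrem.vx), as the CHECK lines below show.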
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp        |   4 +
 llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll | 516 +++++++++++++++++++++
 2 files changed, 520 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8bc7470..f6c2d1b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1177,6 +1177,10 @@ bool RISCVTargetLowering::shouldSinkOperands(
     case Instruction::Shl:
     case Instruction::LShr:
     case Instruction::AShr:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
       return Operand == 1;
     case Instruction::Call:
       if (auto *II = dyn_cast<IntrinsicInst>(I)) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index 550ea93..b44088a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -2445,3 +2445,519 @@ for.cond.cleanup:                                 ; preds = %vector.body
   ret void
 }
 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+
+define void @sink_splat_udiv(i32* nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_udiv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:  .LBB38_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vdivu.vx v8, v8, a1
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, -4
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    bnez a2, .LBB38_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %a, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = udiv <4 x i32> %wide.load, %broadcast.splat
+  %3 = bitcast i32* %0 to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  %index.next = add nuw i64 %index, 4
+  %4 = icmp eq i64 %index.next, 1024
+  br i1 %4, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @sink_splat_sdiv(i32* nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_sdiv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:  .LBB39_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vdiv.vx v8, v8, a1
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, -4
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    bnez a2, .LBB39_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %a, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = sdiv <4 x i32> %wide.load, %broadcast.splat
+  %3 = bitcast i32* %0 to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  %index.next = add nuw i64 %index, 4
+  %4 = icmp eq i64 %index.next, 1024
+  br i1 %4, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @sink_splat_urem(i32* nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_urem:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:  .LBB40_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vremu.vx v8, v8, a1
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, -4
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    bnez a2, .LBB40_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %a, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = urem <4 x i32> %wide.load, %broadcast.splat
+  %3 = bitcast i32* %0 to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  %index.next = add nuw i64 %index, 4
+  %4 = icmp eq i64 %index.next, 1024
+  br i1 %4, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @sink_splat_srem(i32* nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_srem:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:  .LBB41_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vrem.vx v8, v8, a1
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, -4
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    bnez a2, .LBB41_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %a, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = srem <4 x i32> %wide.load, %broadcast.splat
+  %3 = bitcast i32* %0 to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  %index.next = add nuw i64 %index, 4
+  %4 = icmp eq i64 %index.next, 1024
+  br i1 %4, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @sink_splat_udiv_scalable(i32* nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_udiv_scalable:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a7, a2, 1
+; CHECK-NEXT:    li a3, 1024
+; CHECK-NEXT:    bgeu a3, a7, .LBB42_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li t0, 0
+; CHECK-NEXT:    j .LBB42_5
+; CHECK-NEXT:  .LBB42_2: # %vector.ph
+; CHECK-NEXT:    li a5, 0
+; CHECK-NEXT:    remu a6, a3, a7
+; CHECK-NEXT:    sub t0, a3, a6
+; CHECK-NEXT:    slli a4, a2, 1
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB42_3: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vl2re32.v v8, (a2)
+; CHECK-NEXT:    vsetvli a3, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vdivu.vx v8, v8, a1
+; CHECK-NEXT:    vs2r.v v8, (a2)
+; CHECK-NEXT:    add a5, a5, a7
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    bne a5, t0, .LBB42_3
+; CHECK-NEXT:  # %bb.4: # %middle.block
+; CHECK-NEXT:    beqz a6, .LBB42_7
+; CHECK-NEXT:  .LBB42_5: # %for.body.preheader
+; CHECK-NEXT:    addi a2, t0, -1024
+; CHECK-NEXT:    slli a3, t0, 2
+; CHECK-NEXT:    add a0, a0, a3
+; CHECK-NEXT:  .LBB42_6: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lw a3, 0(a0)
+; CHECK-NEXT:    mv a4, a2
+; CHECK-NEXT:    divuw a2, a3, a1
+; CHECK-NEXT:    sw a2, 0(a0)
+; CHECK-NEXT:    addi a2, a4, 1
+; CHECK-NEXT:    addi a0, a0, 4
+; CHECK-NEXT:    bgeu a2, a4, .LBB42_6
+; CHECK-NEXT:  .LBB42_7: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 2
+  %min.iters.check = icmp ugt i64 %1, 1024
+  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %2 = call i64 @llvm.vscale.i64()
+  %3 = shl i64 %2, 2
+  %n.mod.vf = urem i64 1024, %3
+  %n.vec = sub nsw i64 1024, %n.mod.vf
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %4 = call i64 @llvm.vscale.i64()
+  %5 = shl i64 %4, 2
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = getelementptr inbounds i32, i32* %a, i64 %index
+  %7 = bitcast i32* %6 to <vscale x 4 x i32>*
+  %wide.load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %7, align 4
+  %8 = udiv <vscale x 4 x i32> %wide.load, %broadcast.splat
+  %9 = bitcast i32* %6 to <vscale x 4 x i32>*
+  store <vscale x 4 x i32> %8, <vscale x 4 x i32>* %9, align 4
+  %index.next = add nuw i64 %index, %5
+  %10 = icmp eq i64 %index.next, %n.vec
+  br i1 %10, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.mod.vf, 0
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %entry ], [ %n.vec, %middle.block ]
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %11 = load i32, i32* %arrayidx, align 4
+  %div = udiv i32 %11, %x
+  store i32 %div, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @sink_splat_sdiv_scalable(i32* nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_sdiv_scalable:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a7, a2, 1
+; CHECK-NEXT:    li a3, 1024
+; CHECK-NEXT:    bgeu a3, a7, .LBB43_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li t0, 0
+; CHECK-NEXT:    j .LBB43_5
+; CHECK-NEXT:  .LBB43_2: # %vector.ph
+; CHECK-NEXT:    li a5, 0
+; CHECK-NEXT:    remu a6, a3, a7
+; CHECK-NEXT:    sub t0, a3, a6
+; CHECK-NEXT:    slli a4, a2, 1
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB43_3: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vl2re32.v v8, (a2)
+; CHECK-NEXT:    vsetvli a3, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vdiv.vx v8, v8, a1
+; CHECK-NEXT:    vs2r.v v8, (a2)
+; CHECK-NEXT:    add a5, a5, a7
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    bne a5, t0, .LBB43_3
+; CHECK-NEXT:  # %bb.4: # %middle.block
+; CHECK-NEXT:    beqz a6, .LBB43_7
+; CHECK-NEXT:  .LBB43_5: # %for.body.preheader
+; CHECK-NEXT:    addi a2, t0, -1024
+; CHECK-NEXT:    slli a3, t0, 2
+; CHECK-NEXT:    add a0, a0, a3
+; CHECK-NEXT:  .LBB43_6: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lw a3, 0(a0)
+; CHECK-NEXT:    mv a4, a2
+; CHECK-NEXT:    divw a2, a3, a1
+; CHECK-NEXT:    sw a2, 0(a0)
+; CHECK-NEXT:    addi a2, a4, 1
+; CHECK-NEXT:    addi a0, a0, 4
+; CHECK-NEXT:    bgeu a2, a4, .LBB43_6
+; CHECK-NEXT:  .LBB43_7: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 2
+  %min.iters.check = icmp ugt i64 %1, 1024
+  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %2 = call i64 @llvm.vscale.i64()
+  %3 = shl i64 %2, 2
+  %n.mod.vf = urem i64 1024, %3
+  %n.vec = sub nsw i64 1024, %n.mod.vf
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %4 = call i64 @llvm.vscale.i64()
+  %5 = shl i64 %4, 2
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = getelementptr inbounds i32, i32* %a, i64 %index
+  %7 = bitcast i32* %6 to <vscale x 4 x i32>*
+  %wide.load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %7, align 4
+  %8 = sdiv <vscale x 4 x i32> %wide.load, %broadcast.splat
+  %9 = bitcast i32* %6 to <vscale x 4 x i32>*
+  store <vscale x 4 x i32> %8, <vscale x 4 x i32>* %9, align 4
+  %index.next = add nuw i64 %index, %5
+  %10 = icmp eq i64 %index.next, %n.vec
+  br i1 %10, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.mod.vf, 0
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %entry ], [ %n.vec, %middle.block ]
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %11 = load i32, i32* %arrayidx, align 4
+  %div = sdiv i32 %11, %x
+  store i32 %div, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @sink_splat_urem_scalable(i32* nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_urem_scalable:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a7, a2, 1
+; CHECK-NEXT:    li a3, 1024
+; CHECK-NEXT:    bgeu a3, a7, .LBB44_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li t0, 0
+; CHECK-NEXT:    j .LBB44_5
+; CHECK-NEXT:  .LBB44_2: # %vector.ph
+; CHECK-NEXT:    li a5, 0
+; CHECK-NEXT:    remu a6, a3, a7
+; CHECK-NEXT:    sub t0, a3, a6
+; CHECK-NEXT:    slli a4, a2, 1
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB44_3: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vl2re32.v v8, (a2)
+; CHECK-NEXT:    vsetvli a3, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vremu.vx v8, v8, a1
+; CHECK-NEXT:    vs2r.v v8, (a2)
+; CHECK-NEXT:    add a5, a5, a7
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    bne a5, t0, .LBB44_3
+; CHECK-NEXT:  # %bb.4: # %middle.block
+; CHECK-NEXT:    beqz a6, .LBB44_7
+; CHECK-NEXT:  .LBB44_5: # %for.body.preheader
+; CHECK-NEXT:    addi a2, t0, -1024
+; CHECK-NEXT:    slli a3, t0, 2
+; CHECK-NEXT:    add a0, a0, a3
+; CHECK-NEXT:  .LBB44_6: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lw a3, 0(a0)
+; CHECK-NEXT:    mv a4, a2
+; CHECK-NEXT:    remuw a2, a3, a1
+; CHECK-NEXT:    sw a2, 0(a0)
+; CHECK-NEXT:    addi a2, a4, 1
+; CHECK-NEXT:    addi a0, a0, 4
+; CHECK-NEXT:    bgeu a2, a4, .LBB44_6
+; CHECK-NEXT:  .LBB44_7: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 2
+  %min.iters.check = icmp ugt i64 %1, 1024
+  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %2 = call i64 @llvm.vscale.i64()
+  %3 = shl i64 %2, 2
+  %n.mod.vf = urem i64 1024, %3
+  %n.vec = sub nsw i64 1024, %n.mod.vf
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %4 = call i64 @llvm.vscale.i64()
+  %5 = shl i64 %4, 2
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = getelementptr inbounds i32, i32* %a, i64 %index
+  %7 = bitcast i32* %6 to <vscale x 4 x i32>*
+  %wide.load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %7, align 4
+  %8 = urem <vscale x 4 x i32> %wide.load, %broadcast.splat
+  %9 = bitcast i32* %6 to <vscale x 4 x i32>*
+  store <vscale x 4 x i32> %8, <vscale x 4 x i32>* %9, align 4
+  %index.next = add nuw i64 %index, %5
+  %10 = icmp eq i64 %index.next, %n.vec
+  br i1 %10, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.mod.vf, 0
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %entry ], [ %n.vec, %middle.block ]
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %11 = load i32, i32* %arrayidx, align 4
+  %rem = urem i32 %11, %x
+  store i32 %rem, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @sink_splat_srem_scalable(i32* nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_srem_scalable:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a7, a2, 1
+; CHECK-NEXT:    li a3, 1024
+; CHECK-NEXT:    bgeu a3, a7, .LBB45_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li t0, 0
+; CHECK-NEXT:    j .LBB45_5
+; CHECK-NEXT:  .LBB45_2: # %vector.ph
+; CHECK-NEXT:    li a5, 0
+; CHECK-NEXT:    remu a6, a3, a7
+; CHECK-NEXT:    sub t0, a3, a6
+; CHECK-NEXT:    slli a4, a2, 1
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB45_3: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vl2re32.v v8, (a2)
+; CHECK-NEXT:    vsetvli a3, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vrem.vx v8, v8, a1
+; CHECK-NEXT:    vs2r.v v8, (a2)
+; CHECK-NEXT:    add a5, a5, a7
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    bne a5, t0, .LBB45_3
+; CHECK-NEXT:  # %bb.4: # %middle.block
+; CHECK-NEXT:    beqz a6, .LBB45_7
+; CHECK-NEXT:  .LBB45_5: # %for.body.preheader
+; CHECK-NEXT:    addi a2, t0, -1024
+; CHECK-NEXT:    slli a3, t0, 2
+; CHECK-NEXT:    add a0, a0, a3
+; CHECK-NEXT:  .LBB45_6: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lw a3, 0(a0)
+; CHECK-NEXT:    mv a4, a2
+; CHECK-NEXT:    remw a2, a3, a1
+; CHECK-NEXT:    sw a2, 0(a0)
+; CHECK-NEXT:    addi a2, a4, 1
+; CHECK-NEXT:    addi a0, a0, 4
+; CHECK-NEXT:    bgeu a2, a4, .LBB45_6
+; CHECK-NEXT:  .LBB45_7: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 2
+  %min.iters.check = icmp ugt i64 %1, 1024
+  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %2 = call i64 @llvm.vscale.i64()
+  %3 = shl i64 %2, 2
+  %n.mod.vf = urem i64 1024, %3
+  %n.vec = sub nsw i64 1024, %n.mod.vf
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %4 = call i64 @llvm.vscale.i64()
+  %5 = shl i64 %4, 2
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = getelementptr inbounds i32, i32* %a, i64 %index
+  %7 = bitcast i32* %6 to <vscale x 4 x i32>*
+  %wide.load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %7, align 4
+  %8 = srem <vscale x 4 x i32> %wide.load, %broadcast.splat
+  %9 = bitcast i32* %6 to <vscale x 4 x i32>*
+  store <vscale x 4 x i32> %8, <vscale x 4 x i32>* %9, align 4
+  %index.next = add nuw i64 %index, %5
+  %10 = icmp eq i64 %index.next, %n.vec
+  br i1 %10, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.mod.vf, 0
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %entry ], [ %n.vec, %middle.block ]
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %11 = load i32, i32* %arrayidx, align 4
+  %rem = srem i32 %11, %x
+  store i32 %rem, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
+}
-- 
2.7.4
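(Post-signature note, not part of the patch: the CHECK blocks above follow
the llvm/utils/update_llc_test_checks.py output format, so after changing
the IR they can be regenerated rather than hand-edited. Assuming a local
build directory named "build", something along these lines:

  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
      llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll

The exact RUN lines live at the top of sink-splat-operands.ll and are not
shown in this hunk.)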