From d1628266946fdddb44bdad2b3ccf3cd5fc769f42 Mon Sep 17 00:00:00 2001 From: sgokhale Date: Mon, 27 Feb 2023 13:20:52 +0530 Subject: [PATCH] [LV] Update logic for calculating register usage due to invariants Previously, while calculating register usage due to invariants, it was assumed that an invariant would always be part of widening instructions. This resulted in calculating vector register types for vectors which can't be legalized (check the newly added test for more details). An invariant might not always need a vector register. For example, an invariant might be used only for the iteration check. This patch checks if the invariant is part of any widening instruction and considers register usage accordingly. Fixes issue 60493 Differential Revision: https://reviews.llvm.org/D143422 --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 15 +++++--- .../Transforms/LoopVectorize/AArch64/reg-usage.ll | 41 ++++++++++++++++++++++ .../Transforms/LoopVectorize/PowerPC/reg-usage.ll | 2 +- .../Transforms/LoopVectorize/RISCV/reg-usage.ll | 8 ++--- .../LoopVectorize/RISCV/riscv-vector-reverse.ll | 4 +-- .../LoopVectorize/X86/reg-usage-debug.ll | 4 +-- 6 files changed, 60 insertions(+), 14 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e20c4e8..34800b7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6004,7 +6004,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // Saves the list of values that are used in the loop but are defined outside // the loop (not including non-instruction values such as arguments and // constants). 
- SmallPtrSet LoopInvariants; + SmallPtrSet LoopInvariants; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -6130,11 +6130,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { for (auto *Inst : LoopInvariants) { // FIXME: The target might use more than one register for the type // even in the scalar case. - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + bool IsScalar = all_of(Inst->users(), [&](User *U) { + auto *I = cast(U); + return TheLoop != LI->getLoopFor(I->getParent()) || + isScalarAfterVectorization(I, VFs[i]); + }); + + ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); - Invariant[ClassID] += Usage; + TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); + Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); } LLVM_DEBUG({ diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll new file mode 100644 index 0000000..7da940e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -0,0 +1,41 @@ +; REQUIRES: asserts + +; RUN: opt -mtriple arm64-linux -passes=loop-vectorize -mattr=+sve -debug-only=loop-vectorize -disable-output <%s 2>&1 | FileCheck %s + +; Invariant register usage calculation should take into account if the +; invariant would be used in widened instructions. Only in such cases, a vector +; register would be required for holding the invariant. For all other cases +; such as below (where usage of %0 in loop doesn't require vector register), a +; general purpose register suffices. 
+; Check that below test doesn't crash while calculating register usage for +; invariant %0 + +@string = internal unnamed_addr constant [5 x i8] c"abcd\00", align 1 +define void @get_invariant_reg_usage(ptr %z) { +; CHECK: LV: Checking a loop in 'get_invariant_reg_usage' +; CHECK: LV(REG): VF = vscale x 1 +; CHECK-NEXT: LV(REG): Found max usage: 1 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers + +L.entry: + %0 = load i128, ptr %z, align 16 + %1 = icmp slt i128 %0, 1 + %a = getelementptr i8, ptr %z, i64 1 + br i1 %1, label %return, label %loopbody + +loopbody: ;preds = %L.entry, %loopbody + %b = phi ptr [ %2, %loopbody ], [ @string, %L.entry ] + %len_input = phi i128 [ %len, %loopbody ], [ %0, %L.entry ] + %len = add nsw i128 %len_input, -1 + %2 = getelementptr i8, ptr %b, i64 1 + %3 = load i8, ptr %b, align 1 + store i8 %3, ptr %a, align 4 + %.not = icmp eq i128 %len, 0 + br i1 %.not, label %return, label %loopbody + +return: ;preds = %loopexit, %L.entry + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll index b63f2cf..f9d512e 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -175,7 +175,7 @@ define void @double_(ptr nocapture %A, i32 %n) nounwind uwtable ssp { ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers ;CHECK-PWR8: LV(REG): Found invariant usage: 1 item -;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 1 registers +;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers ;CHECK-PWR9: LV(REG): VF = 1 ;CHECK-PWR9: LV(REG): Found max usage: 2 item diff --git 
a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll index fddc21e..5e3de92 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll @@ -31,22 +31,22 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture rea ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-LMUL1-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers +; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-LMUL2: LV(REG): Found max usage: 2 item ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; CHECK-LMUL2-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers +; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-LMUL4: LV(REG): Found max usage: 2 item ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers ; CHECK-LMUL4-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers +; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-LMUL8: LV(REG): Found max usage: 2 item ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers ; CHECK-LMUL8-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers +; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers entry: %conv = zext i32 %size to i64 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index bbda0c6..28f6a78 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -102,7 +102,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop cost is 23 @@ -234,7 +234,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop cost is 23 diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll index 7041a67..164188d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll @@ -26,7 +26,7 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): 
RegisterClass: Generic::VectorRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers define i32 @test_g(ptr nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 { entry: @@ -68,7 +68,7 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers define i32 @test(ptr nocapture readonly %a, i32 %n) local_unnamed_addr { entry: -- 2.7.4