From d1628266946fdddb44bdad2b3ccf3cd5fc769f42 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale@nvidia.com>
Date: Mon, 27 Feb 2023 13:20:52 +0530
Subject: [PATCH] [LV] Update logic for calculating register usage due to
 invariants

Previously, while calculating register usage due to invariants, it was assumed that an invariant would always be part of widening
instructions. This resulted in calculating vector register types for vectors which can't be legalized (check the newly added test for more details).

An invariant might not always need a vector register. For example, an invariant might only be used for the iteration check.

This patch checks if the invariant is part of any widening instruction and considers register usage accordingly. Fixes issue 60493

Differential Revision: https://reviews.llvm.org/D143422
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    | 15 +++++---
 .../Transforms/LoopVectorize/AArch64/reg-usage.ll  | 41 ++++++++++++++++++++++
 .../Transforms/LoopVectorize/PowerPC/reg-usage.ll  |  2 +-
 .../Transforms/LoopVectorize/RISCV/reg-usage.ll    |  8 ++---
 .../LoopVectorize/RISCV/riscv-vector-reverse.ll    |  4 +--
 .../LoopVectorize/X86/reg-usage-debug.ll           |  4 +--
 6 files changed, 60 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e20c4e8..34800b7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6004,7 +6004,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
   // Saves the list of values that are used in the loop but are defined outside
   // the loop (not including non-instruction values such as arguments and
   // constants).
-  SmallPtrSet<Value *, 8> LoopInvariants;
+  SmallPtrSet<Instruction *, 8> LoopInvariants;
 
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
     for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -6130,11 +6130,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
     for (auto *Inst : LoopInvariants) {
       // FIXME: The target might use more than one register for the type
       // even in the scalar case.
-      unsigned Usage =
-          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+      bool IsScalar = all_of(Inst->users(), [&](User *U) {
+        auto *I = cast<Instruction>(U);
+        return TheLoop != LI->getLoopFor(I->getParent()) ||
+               isScalarAfterVectorization(I, VFs[i]);
+      });
+
+      ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
       unsigned ClassID =
-          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
-      Invariant[ClassID] += Usage;
+          TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
+      Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
     }
 
     LLVM_DEBUG({
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
new file mode 100644
index 0000000..7da940e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -0,0 +1,41 @@
+; REQUIRES: asserts
+
+; RUN: opt -mtriple arm64-linux -passes=loop-vectorize -mattr=+sve -debug-only=loop-vectorize -disable-output <%s 2>&1 | FileCheck %s
+
+; Invariant register usage calculation should take into account whether the
+; invariant is used in widened instructions. Only in such cases is a vector
+; register required to hold the invariant. In all other cases, such as the one
+; below (where the use of %0 in the loop doesn't require a vector register), a
+; general purpose register suffices.
+; Check that the test below doesn't crash while calculating register usage for
+; the invariant %0.
+
+@string = internal unnamed_addr constant [5 x i8] c"abcd\00", align 1
+define void @get_invariant_reg_usage(ptr %z) {
+; CHECK: LV: Checking a loop in 'get_invariant_reg_usage'
+; CHECK: LV(REG): VF = vscale x 1
+; CHECK-NEXT: LV(REG): Found max usage: 1 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers
+
+L.entry:
+  %0 = load i128, ptr %z, align 16
+  %1 = icmp slt i128 %0, 1
+  %a = getelementptr i8, ptr %z, i64 1
+  br i1 %1, label %return, label %loopbody
+
+loopbody:                  ;preds = %L.entry, %loopbody
+  %b = phi ptr [ %2, %loopbody ], [ @string, %L.entry ]
+  %len_input = phi i128 [ %len, %loopbody ], [ %0, %L.entry ]
+  %len = add nsw i128 %len_input, -1
+  %2 = getelementptr i8, ptr %b, i64 1
+  %3 = load i8, ptr %b, align 1
+  store i8 %3, ptr %a, align 4
+  %.not = icmp eq i128 %len, 0
+  br i1 %.not, label %return, label %loopbody
+
+return:                    ;preds = %loopbody, %L.entry
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
index b63f2cf..f9d512e 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
@@ -175,7 +175,7 @@ define void @double_(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
 ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
 ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers
 ;CHECK-PWR8: LV(REG): Found invariant usage: 1 item
-;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 1 registers
+;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
 
 ;CHECK-PWR9: LV(REG): VF = 1
 ;CHECK-PWR9: LV(REG): Found max usage: 2 item
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
index fddc21e..5e3de92 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
@@ -31,22 +31,22 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture rea
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-LMUL1-NEXT:  LV(REG): Found invariant usage: 1 item
-; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
+; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-LMUL2:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; CHECK-LMUL2-NEXT:  LV(REG): Found invariant usage: 1 item
-; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
+; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-LMUL4:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 ; CHECK-LMUL4-NEXT:  LV(REG): Found invariant usage: 1 item
-; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
+; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-LMUL8:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 16 registers
 ; CHECK-LMUL8-NEXT:  LV(REG): Found invariant usage: 1 item
-; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 16 registers
+; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 
 entry:
   %conv = zext i32 %size to i64
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index bbda0c6..28f6a78 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -102,7 +102,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop cost is 23
@@ -234,7 +234,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop cost is 23
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
index 7041a67..164188d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
@@ -26,7 +26,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
 ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
 
 define i32 @test_g(ptr nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 {
 entry:
@@ -68,7 +68,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
 ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
 
 define i32 @test(ptr nocapture readonly %a, i32 %n) local_unnamed_addr {
 entry:
-- 
2.7.4