From 8045bf9d0dc5be3a8b8d075fdfe23828f4b7d70e Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas
Date: Wed, 23 Mar 2022 14:51:16 +0000
Subject: [PATCH] [FuncSpec] Support function specialization across multiple
 arguments.

The current implementation of Function Specialization does not allow
specializing more than one argument per function call, which is a limitation
I am lifting with this patch.

My main challenge was to choose the most suitable ADT for storing the
specializations. We need an associative container for binding all the actual
arguments of a specialization to the function call. We also need a consistent
iteration order across executions. Lastly, we want to be able to sort the
entries by Gain and reject the least profitable ones. MapVector almost fits
the bill, but not quite: erasing elements is expensive, and running
stable_sort on it desynchronizes the indices of its underlying vector. I am
therefore using the underlying vector directly after calculating the Gain.
(A standalone sketch of this container workflow is appended after the patch.)

Differential Revision: https://reviews.llvm.org/D119880
---
 llvm/include/llvm/Transforms/Utils/SCCPSolver.h    |  15 +-
 llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 174 ++++++++++---------
 llvm/lib/Transforms/Utils/SCCPSolver.cpp           |  23 ++-
 .../function-specialization4.ll                    |   4 +-
 .../specialize-multiple-arguments.ll               | 185 +++++++++++++++++++++
 5 files changed, 306 insertions(+), 95 deletions(-)
 create mode 100644 llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll

diff --git a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
index fb94b1d..17bd072 100644
--- a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
+++ b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
@@ -151,13 +151,14 @@ public:
   /// Return a reference to the set of argument tracked functions.
   SmallPtrSetImpl<Function *> &getArgumentTrackedFunctions();
 
-  /// Mark the constant argument of a new function specialization. \p F points
-  /// to the cloned function and \p Arg represents the constant argument as a
-  /// pair of {formal,actual} values (the formal argument is associated with the
-  /// original function definition). All other arguments of the specialization
-  /// inherit the lattice state of their corresponding values in the original
-  /// function.
-  void markArgInFuncSpecialization(Function *F, const ArgInfo &Arg);
+  /// Mark the constant arguments of a new function specialization. \p F points
+  /// to the cloned function and \p Args contains a list of constant arguments
+  /// represented as pairs of {formal,actual} values (the formal argument is
+  /// associated with the original function definition). All other arguments of
+  /// the specialization inherit the lattice state of their corresponding values
+  /// in the original function.
+  void markArgInFuncSpecialization(Function *F,
+                                   const SmallVectorImpl<ArgInfo> &Args);
 
   /// Mark all of the blocks in function \p F non-executable.
Clients can used /// this method to erase a function from the module (e.g., if it has been diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 8faca67..c9775e0 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -99,8 +99,13 @@ static cl::opt SpecializeOnAddresses( "func-specialization-on-address", cl::init(false), cl::Hidden, cl::desc("Enable function specialization on the address of global values")); -// TODO: This needs checking to see the impact on compile-times, which is why -// this is off by default for now. +// Disabled by default as it can significantly increase compilation times. +// Running nikic's compile time tracker on x86 with instruction count as the +// metric shows 3-4% regression for SPASS while being neutral for all other +// benchmarks of the llvm test suite. +// +// https://llvm-compile-time-tracker.com +// https://github.com/nikic/llvm-compile-time-tracker static cl::opt EnableSpecializationForLiteralConstant( "function-specialization-for-literal-constant", cl::init(false), cl::Hidden, cl::desc("Enable specialization of functions that take a literal constant " @@ -110,17 +115,17 @@ namespace { // Bookkeeping struct to pass data from the analysis and profitability phase // to the actual transform helper functions. struct SpecializationInfo { - ArgInfo Arg; // Stores the {formal,actual} argument pair. - InstructionCost Gain; // Profitability: Gain = Bonus - Cost. - - SpecializationInfo(Argument *A, Constant *C, InstructionCost G) - : Arg(A, C), Gain(G){}; + SmallVector Args; // Stores the {formal,actual} argument pairs. + InstructionCost Gain; // Profitability: Gain = Bonus - Cost. }; } // Anonymous namespace using FuncList = SmallVectorImpl; -using ConstList = SmallVector; -using SpecializationList = SmallVector; +using CallArgBinding = std::pair; +using CallSpecBinding = std::pair; +// We are using MapVector because it guarantees deterministic iteration +// order across executions. +using SpecializationMap = SmallMapVector; // Helper to check if \p LV is either a constant or a constant // range with a single element. This should cover exactly the same cases as the @@ -307,17 +312,15 @@ public: LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for " << F->getName() << " is " << Cost << "\n"); - SpecializationList Specializations; - calculateGains(F, Cost, Specializations); - if (Specializations.empty()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n"); + SmallVector Specializations; + if (!calculateGains(F, Cost, Specializations)) { + LLVM_DEBUG(dbgs() << "FnSpecialization: No possible constants found\n"); continue; } - for (SpecializationInfo &S : Specializations) { - specializeFunction(F, S, WorkList); - Changed = true; - } + Changed = true; + for (auto &Entry : Specializations) + specializeFunction(F, Entry.second, WorkList); } updateSpecializedFuncs(Candidates, WorkList); @@ -392,21 +395,22 @@ private: return Clone; } - /// This function decides whether it's worthwhile to specialize function \p F - /// based on the known constant values its arguments can take on, i.e. it - /// calculates a gain and returns a list of actual arguments that are deemed - /// profitable to specialize. Specialization is performed on the first - /// interesting argument. Specializations based on additional arguments will - /// be evaluated on following iterations of the main IPSCCP solve loop. 
- void calculateGains(Function *F, InstructionCost Cost, - SpecializationList &WorkList) { + /// This function decides whether it's worthwhile to specialize function + /// \p F based on the known constant values its arguments can take on. It + /// only discovers potential specialization opportunities without actually + /// applying them. + /// + /// \returns true if any specializations have been found. + bool calculateGains(Function *F, InstructionCost Cost, + SmallVectorImpl &WorkList) { + SpecializationMap Specializations; // Determine if we should specialize the function based on the values the // argument can take on. If specialization is not profitable, we continue // on to the next argument. for (Argument &FormalArg : F->args()) { // Determine if this argument is interesting. If we know the argument can // take on any constant values, they are collected in Constants. - ConstList ActualArgs; + SmallVector ActualArgs; if (!isArgumentInteresting(&FormalArg, ActualArgs)) { LLVM_DEBUG(dbgs() << "FnSpecialization: Argument " << FormalArg.getNameOrAsOperand() @@ -414,50 +418,56 @@ private: continue; } - for (auto *ActualArg : ActualArgs) { - InstructionCost Gain = - ForceFunctionSpecialization - ? 1 - : getSpecializationBonus(&FormalArg, ActualArg) - Cost; + for (const auto &Entry : ActualArgs) { + CallBase *Call = Entry.first; + Constant *ActualArg = Entry.second; - if (Gain <= 0) - continue; - WorkList.push_back({&FormalArg, ActualArg, Gain}); - } + auto I = Specializations.insert({Call, SpecializationInfo()}); + SpecializationInfo &S = I.first->second; - if (WorkList.empty()) - continue; - - // Sort the candidates in descending order. - llvm::stable_sort(WorkList, [](const SpecializationInfo &L, - const SpecializationInfo &R) { - return L.Gain > R.Gain; - }); - - // Truncate the worklist to 'MaxClonesThreshold' candidates if - // necessary. - if (WorkList.size() > MaxClonesThreshold) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceed " - << "the maximum number of clones threshold.\n" - << "FnSpecialization: Truncating worklist to " - << MaxClonesThreshold << " candidates.\n"); - WorkList.erase(WorkList.begin() + MaxClonesThreshold, WorkList.end()); + if (I.second) + S.Gain = ForceFunctionSpecialization ? 1 : 0 - Cost; + if (!ForceFunctionSpecialization) + S.Gain += getSpecializationBonus(&FormalArg, ActualArg); + S.Args.push_back({&FormalArg, ActualArg}); } + } + + // Remove unprofitable specializations. + Specializations.remove_if( + [](const auto &Entry) { return Entry.second.Gain <= 0; }); + + // Clear the MapVector and return the underlying vector. + WorkList = Specializations.takeVector(); + + // Sort the candidates in descending order. + llvm::stable_sort(WorkList, [](const auto &L, const auto &R) { + return L.second.Gain > R.second.Gain; + }); + + // Truncate the worklist to 'MaxClonesThreshold' candidates if necessary. 
+ if (WorkList.size() > MaxClonesThreshold) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceed " + << "the maximum number of clones threshold.\n" + << "FnSpecialization: Truncating worklist to " + << MaxClonesThreshold << " candidates.\n"); + WorkList.erase(WorkList.begin() + MaxClonesThreshold, WorkList.end()); + } - LLVM_DEBUG(dbgs() << "FnSpecialization: Specializations for function " - << F->getName() << "\n"; - for (SpecializationInfo &S - : WorkList) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Specializations for function " + << F->getName() << "\n"; + for (const auto &Entry + : WorkList) { + dbgs() << "FnSpecialization: Gain = " << Entry.second.Gain + << "\n"; + for (const ArgInfo &Arg : Entry.second.Args) dbgs() << "FnSpecialization: FormalArg = " - << S.Arg.Formal->getNameOrAsOperand() + << Arg.Formal->getNameOrAsOperand() << ", ActualArg = " - << S.Arg.Actual->getNameOrAsOperand() - << ", Gain = " << S.Gain << "\n"; - }); + << Arg.Actual->getNameOrAsOperand() << "\n"; + }); - // FIXME: Only one argument per function. - break; - } + return !WorkList.empty(); } bool isCandidateFunction(Function *F) { @@ -490,12 +500,12 @@ private: Function *Clone = cloneCandidateFunction(F, Mappings); // Rewrite calls to the function so that they call the clone instead. - rewriteCallSites(Clone, S.Arg, Mappings); + rewriteCallSites(Clone, S.Args, Mappings); // Initialize the lattice state of the arguments of the function clone, // marking the argument on which we specialized the function constant // with the given value. - Solver.markArgInFuncSpecialization(Clone, S.Arg); + Solver.markArgInFuncSpecialization(Clone, S.Args); // Mark all the specialized functions WorkList.push_back(Clone); @@ -641,7 +651,8 @@ private: /// /// \returns true if the function should be specialized on the given /// argument. - bool isArgumentInteresting(Argument *A, ConstList &Constants) { + bool isArgumentInteresting(Argument *A, + SmallVectorImpl &Constants) { // For now, don't attempt to specialize functions based on the values of // composite types. if (!A->getType()->isSingleValueType() || A->user_empty()) @@ -681,7 +692,8 @@ private: /// Collect in \p Constants all the constant values that argument \p A can /// take on. - void getPossibleConstants(Argument *A, ConstList &Constants) { + void getPossibleConstants(Argument *A, + SmallVectorImpl &Constants) { Function *F = A->getParent(); // Iterate over all the call sites of the argument's parent function. @@ -723,23 +735,24 @@ private: if (isa(V) && (Solver.getLatticeValueFor(V).isConstant() || EnableSpecializationForLiteralConstant)) - Constants.push_back(cast(V)); + Constants.push_back({&CS, cast(V)}); } } /// Rewrite calls to function \p F to call function \p Clone instead. /// /// This function modifies calls to function \p F as long as the actual - /// argument matches the one in \p Arg. Note that for recursive calls we - /// need to compare against the cloned formal argument. + /// arguments match those in \p Args. Note that for recursive calls we + /// need to compare against the cloned formal arguments. /// /// Callsites that have been marked with the MinSize function attribute won't /// be specialized and rewritten. 
- void rewriteCallSites(Function *Clone, const ArgInfo &Arg, + void rewriteCallSites(Function *Clone, const SmallVectorImpl &Args, ValueToValueMapTy &Mappings) { - Function *F = Arg.Formal->getParent(); - unsigned ArgNo = Arg.Formal->getArgNo(); - SmallVector CallSitesToRewrite; + assert(!Args.empty() && "Specialization without arguments"); + Function *F = Args[0].Formal->getParent(); + + SmallVector CallSitesToRewrite; for (auto *U : F->users()) { if (!isa(U) && !isa(U)) continue; @@ -758,9 +771,16 @@ private: << "\n"); if (/* recursive call */ (CS->getFunction() == Clone && - CS->getArgOperand(ArgNo) == Mappings[Arg.Formal]) || + all_of(Args, + [CS, &Mappings](const ArgInfo &Arg) { + unsigned ArgNo = Arg.Formal->getArgNo(); + return CS->getArgOperand(ArgNo) == Mappings[Arg.Formal]; + })) || /* normal call */ - CS->getArgOperand(ArgNo) == Arg.Actual) { + all_of(Args, [CS](const ArgInfo &Arg) { + unsigned ArgNo = Arg.Formal->getArgNo(); + return CS->getArgOperand(ArgNo) == Arg.Actual; + })) { CS->setCalledFunction(Clone); Solver.markOverdefined(CS); } @@ -891,7 +911,7 @@ bool llvm::runFunctionSpecialization( // Initially resolve the constants in all the argument tracked functions. RunSCCPSolver(FuncDecls); - SmallVector WorkList; + SmallVector WorkList; unsigned I = 0; while (FuncSpecializationMaxIters != I++ && FS.specializeFunctions(FuncDecls, WorkList)) { diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 88dd5e6..607928c 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -450,7 +450,8 @@ public: return TrackingIncomingArguments; } - void markArgInFuncSpecialization(Function *F, const ArgInfo &Arg); + void markArgInFuncSpecialization(Function *F, + const SmallVectorImpl &Args); void markFunctionUnreachable(Function *F) { for (auto &BB : *F) @@ -524,21 +525,24 @@ Constant *SCCPInstVisitor::getConstant(const ValueLatticeElement &LV) const { return nullptr; } -void SCCPInstVisitor::markArgInFuncSpecialization(Function *F, - const ArgInfo &Arg) { - assert(F->arg_size() == Arg.Formal->getParent()->arg_size() && +void SCCPInstVisitor::markArgInFuncSpecialization( + Function *F, const SmallVectorImpl &Args) { + assert(!Args.empty() && "Specialization without arguments"); + assert(F->arg_size() == Args[0].Formal->getParent()->arg_size() && "Functions should have the same number of arguments"); + auto Iter = Args.begin(); Argument *NewArg = F->arg_begin(); - Argument *OldArg = Arg.Formal->getParent()->arg_begin(); + Argument *OldArg = Args[0].Formal->getParent()->arg_begin(); for (auto End = F->arg_end(); NewArg != End; ++NewArg, ++OldArg) { LLVM_DEBUG(dbgs() << "SCCP: Marking argument " << NewArg->getNameOrAsOperand() << "\n"); - if (OldArg == Arg.Formal) { + if (OldArg == Iter->Formal) { // Mark the argument constants in the new function. - markConstant(NewArg, Arg.Actual); + markConstant(NewArg, Iter->Actual); + ++Iter; } else if (ValueState.count(OldArg)) { // For the remaining arguments in the new function, copy the lattice state // over from the old function. 
@@ -1717,8 +1721,9 @@ SmallPtrSetImpl &SCCPSolver::getArgumentTrackedFunctions() { return Visitor->getArgumentTrackedFunctions(); } -void SCCPSolver::markArgInFuncSpecialization(Function *F, const ArgInfo &Arg) { - Visitor->markArgInFuncSpecialization(F, Arg); +void SCCPSolver::markArgInFuncSpecialization( + Function *F, const SmallVectorImpl &Args) { + Visitor->markArgInFuncSpecialization(F, Args); } void SCCPSolver::markFunctionUnreachable(Function *F) { diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll index 35ad27e..787e6e6 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll @@ -46,7 +46,7 @@ entry: ; CHECK-NEXT: entry: ; CHECK-NEXT: %0 = load i32, i32* @A, align 4 ; CHECK-NEXT: %add = add nsw i32 %x, %0 -; CHECK-NEXT: %1 = load i32, i32* %c, align 4 +; CHECK-NEXT: %1 = load i32, i32* @C, align 4 ; CHECK-NEXT: %add1 = add nsw i32 %add, %1 ; CHECK-NEXT: ret i32 %add1 ; CHECK-NEXT: } @@ -55,7 +55,7 @@ entry: ; CHECK-NEXT: entry: ; CHECK-NEXT: %0 = load i32, i32* @B, align 4 ; CHECK-NEXT: %add = add nsw i32 %x, %0 -; CHECK-NEXT: %1 = load i32, i32* %c, align 4 +; CHECK-NEXT: %1 = load i32, i32* @D, align 4 ; CHECK-NEXT: %add1 = add nsw i32 %add, %1 ; CHECK-NEXT: ret i32 %add1 ; CHECK-NEXT: } diff --git a/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll b/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll new file mode 100644 index 0000000..b4f28fd --- /dev/null +++ b/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -function-specialization -func-specialization-max-clones=0 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=NONE +; RUN: opt -function-specialization -func-specialization-max-clones=1 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=ONE +; RUN: opt -function-specialization -func-specialization-max-clones=2 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=TWO +; RUN: opt -function-specialization -func-specialization-max-clones=3 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=THREE + +; Make sure that we iterate correctly after sorting the specializations: +; FnSpecialization: Specializations for function compute +; FnSpecialization: Gain = 608 +; FnSpecialization: FormalArg = binop1, ActualArg = power +; FnSpecialization: FormalArg = binop2, ActualArg = mul +; FnSpecialization: Gain = 982 +; FnSpecialization: FormalArg = binop1, ActualArg = plus +; FnSpecialization: FormalArg = binop2, ActualArg = minus +; FnSpecialization: Gain = 795 +; FnSpecialization: FormalArg = binop1, ActualArg = minus +; FnSpecialization: FormalArg = binop2, ActualArg = power + +define i64 @main(i64 %x, i64 %y, i1 %flag) { +; NONE-LABEL: @main( +; NONE-NEXT: entry: +; NONE-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]] +; NONE: plus: +; NONE-NEXT: [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul) +; NONE-NEXT: br label [[MERGE:%.*]] +; NONE: minus: +; NONE-NEXT: [[TMP1:%.*]] = call i64 @compute(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @plus, i64 (i64, i64)* @minus) +; NONE-NEXT: br label [[MERGE]] +; NONE: merge: +; 
NONE-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ] +; NONE-NEXT: [[TMP3:%.*]] = call i64 @compute(i64 [[TMP2]], i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power) +; NONE-NEXT: ret i64 [[TMP3]] +; +; ONE-LABEL: @main( +; ONE-NEXT: entry: +; ONE-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]] +; ONE: plus: +; ONE-NEXT: [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul) +; ONE-NEXT: br label [[MERGE:%.*]] +; ONE: minus: +; ONE-NEXT: [[TMP1:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @plus, i64 (i64, i64)* @minus) +; ONE-NEXT: br label [[MERGE]] +; ONE: merge: +; ONE-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ] +; ONE-NEXT: [[TMP3:%.*]] = call i64 @compute(i64 [[TMP2]], i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power) +; ONE-NEXT: ret i64 [[TMP3]] +; +; TWO-LABEL: @main( +; TWO-NEXT: entry: +; TWO-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]] +; TWO: plus: +; TWO-NEXT: [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul) +; TWO-NEXT: br label [[MERGE:%.*]] +; TWO: minus: +; TWO-NEXT: [[TMP1:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @plus, i64 (i64, i64)* @minus) +; TWO-NEXT: br label [[MERGE]] +; TWO: merge: +; TWO-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ] +; TWO-NEXT: [[TMP3:%.*]] = call i64 @compute.2(i64 [[TMP2]], i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power) +; TWO-NEXT: ret i64 [[TMP3]] +; +; THREE-LABEL: @main( +; THREE-NEXT: entry: +; THREE-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]] +; THREE: plus: +; THREE-NEXT: [[TMP0:%.*]] = call i64 @compute.3(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul) +; THREE-NEXT: br label [[MERGE:%.*]] +; THREE: minus: +; THREE-NEXT: [[TMP1:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @plus, i64 (i64, i64)* @minus) +; THREE-NEXT: br label [[MERGE]] +; THREE: merge: +; THREE-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ] +; THREE-NEXT: [[TMP3:%.*]] = call i64 @compute.2(i64 [[TMP2]], i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power) +; THREE-NEXT: ret i64 [[TMP3]] +; +entry: + br i1 %flag, label %plus, label %minus + +plus: + %tmp0 = call i64 @compute(i64 %x, i64 %y, i64 (i64, i64)* @power, i64 (i64, i64)* @mul) + br label %merge + +minus: + %tmp1 = call i64 @compute(i64 %x, i64 %y, i64 (i64, i64)* @plus, i64 (i64, i64)* @minus) + br label %merge + +merge: + %tmp2 = phi i64 [ %tmp0, %plus ], [ %tmp1, %minus] + %tmp3 = call i64 @compute(i64 %tmp2, i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power) + ret i64 %tmp3 +} + +; THREE-NOT: define internal i64 @compute +; +; THREE-LABEL: define internal i64 @compute.1(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) { +; THREE-NEXT: entry: +; THREE-NEXT: [[TMP0:%.+]] = call i64 @plus(i64 %x, i64 %y) +; THREE-NEXT: [[TMP1:%.+]] = call i64 @minus(i64 %x, i64 %y) +; THREE-NEXT: [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]] +; THREE-NEXT: [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x +; THREE-NEXT: [[TMP4:%.+]] = sub i64 [[TMP3]], %y +; THREE-NEXT: [[TMP5:%.+]] = mul i64 [[TMP4]], 2 +; THREE-NEXT: ret i64 [[TMP5]] +; THREE-NEXT: } +; +; THREE-LABEL: define internal i64 @compute.2(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) { +; THREE-NEXT: entry: +; THREE-NEXT: 
[[TMP0:%.+]] = call i64 @minus(i64 %x, i64 %y) +; THREE-NEXT: [[TMP1:%.+]] = call i64 @power(i64 %x, i64 %y) +; THREE-NEXT: [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]] +; THREE-NEXT: [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x +; THREE-NEXT: [[TMP4:%.+]] = sub i64 [[TMP3]], %y +; THREE-NEXT: [[TMP5:%.+]] = mul i64 [[TMP4]], 2 +; THREE-NEXT: ret i64 [[TMP5]] +; THREE-NEXT: } +; +; THREE-LABEL: define internal i64 @compute.3(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) { +; THREE-NEXT: entry: +; THREE-NEXT: [[TMP0:%.+]] = call i64 @power(i64 %x, i64 %y) +; THREE-NEXT: [[TMP1:%.+]] = call i64 @mul(i64 %x, i64 %y) +; THREE-NEXT: [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]] +; THREE-NEXT: [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x +; THREE-NEXT: [[TMP4:%.+]] = sub i64 [[TMP3]], %y +; THREE-NEXT: [[TMP5:%.+]] = mul i64 [[TMP4]], 2 +; THREE-NEXT: ret i64 [[TMP5]] +; THREE-NEXT: } +; +define internal i64 @compute(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) { +entry: + %tmp0 = call i64 %binop1(i64 %x, i64 %y) + %tmp1 = call i64 %binop2(i64 %x, i64 %y) + %add = add i64 %tmp0, %tmp1 + %div = sdiv i64 %add, %x + %sub = sub i64 %div, %y + %mul = mul i64 %sub, 2 + ret i64 %mul +} + +define internal i64 @plus(i64 %x, i64 %y) { +entry: + %tmp0 = add i64 %x, %y + ret i64 %tmp0 +} + +define internal i64 @minus(i64 %x, i64 %y) { +entry: + %tmp0 = sub i64 %x, %y + ret i64 %tmp0 +} + +define internal i64 @mul(i64 %x, i64 %n) { +entry: + %cmp6 = icmp sgt i64 %n, 1 + br i1 %cmp6, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + %x.addr.0.lcssa = phi i64 [ %x, %entry ], [ %add, %for.body ] + ret i64 %x.addr.0.lcssa + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %entry ] + %x.addr.07 = phi i64 [ %add, %for.body ], [ %x, %entry ] + %add = shl nsw i64 %x.addr.07, 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define internal i64 @power(i64 %x, i64 %n) { +entry: + %cmp6 = icmp sgt i64 %n, 1 + br i1 %cmp6, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + %x.addr.0.lcssa = phi i64 [ %x, %entry ], [ %mul, %for.body ] + ret i64 %x.addr.0.lcssa + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %entry ] + %x.addr.07 = phi i64 [ %mul, %for.body ], [ %x, %entry ] + %mul = mul nsw i64 %x.addr.07, %x.addr.07 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} -- 2.7.4
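
Appendix (illustration only, not part of the patch): below is a minimal,
self-contained C++ sketch of the container workflow the commit message
describes, assuming only LLVM's ADT headers (plus libLLVMSupport for
SmallVector's out-of-line code). The names and values are made-up stand-ins:
SpecSketch plays the role of SpecializationInfo, plain integer keys stand in
for the CallBase pointers, int replaces InstructionCost, and the limit of 2
stands in for MaxClonesThreshold.

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

namespace {
// Stand-in for SpecializationInfo: a list of argument bindings plus a gain.
struct SpecSketch {
  llvm::SmallVector<int, 4> Args; // would be {formal,actual} ArgInfo pairs
  int Gain = 0;                   // would be an InstructionCost
};
} // anonymous namespace

int main() {
  // 1) Associative container keyed by "call site", with deterministic
  //    iteration order across executions (the reason MapVector is used).
  llvm::SmallMapVector<int, SpecSketch, 8> Specializations;
  Specializations[1] = {{10}, 5};     // profitable, one argument
  Specializations[2] = {{20}, -3};    // unprofitable
  Specializations[3] = {{30, 31}, 9}; // profitable, two arguments

  // 2) Drop unprofitable entries while still in the map; MapVector::remove_if
  //    preserves the relative order of the survivors.
  Specializations.remove_if(
      [](const auto &Entry) { return Entry.second.Gain <= 0; });

  // 3) Move out the underlying vector. Sorting the MapVector itself would
  //    desynchronize its key-to-index map, hence the switch to the vector.
  auto WorkList = Specializations.takeVector();
  llvm::stable_sort(WorkList, [](const auto &L, const auto &R) {
    return L.second.Gain > R.second.Gain;
  });

  // 4) Keep only the most profitable candidates (2 here stands in for
  //    MaxClonesThreshold).
  if (WorkList.size() > 2)
    WorkList.erase(WorkList.begin() + 2, WorkList.end());

  for (const auto &Entry : WorkList)
    std::printf("call %d: gain %d, %zu argument(s)\n", Entry.first,
                Entry.second.Gain, Entry.second.Args.size());
  return 0;
}

Sorting happens only after takeVector() because sorting the map in place is
exactly the MapVector limitation the commit message calls out; the real pass
follows the same sequence in calculateGains() before handing the worklist to
specializeFunction().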