From 2a5893351954c72423b9a608fd336557a46fe83d Mon Sep 17 00:00:00 2001
From: Jun Bum Lim <junbuml@codeaurora.org>
Date: Fri, 3 Nov 2017 19:01:57 +0000
Subject: [PATCH] Add CallSiteSplitting pass

Summary:
This change add a pass which tries to split a call-site to pass
more constrained arguments if its argument is predicated in the control flow
so that we can expose better context to the later passes (e.g, inliner, jump
threading, or IPA-CP based function cloning, etc.).
As of now we support two cases :

1) If a call site is dominated by an OR condition and if any of its arguments
are predicated on this OR condition, try to split the condition with more
constrained arguments. For example, in the code below, we try to split the
call site since we can predicate the argument (ptr) based on the OR condition.

Split from :
      if (!ptr || c)
        callee(ptr);
to :
      if (!ptr)
        callee(null ptr)  // set the known constant value
      else if (c)
        callee(nonnull ptr)  // set non-null attribute in the argument

2) We can also split a call-site based on constant incoming values of a PHI
For example,
from :
      BB0:
       %c = icmp eq i32 %i1, %i2
       br i1 %c, label %BB2, label %BB1
      BB1:
       br label %BB2
      BB2:
       %p = phi i32 [ 0, %BB0 ], [ 1, %BB1 ]
       call void @bar(i32 %p)
to
      BB0:
       %c = icmp eq i32 %i1, %i2
       br i1 %c, label %BB2-split0, label %BB1
      BB1:
       br label %BB2-split1
      BB2-split0:
       call void @bar(i32 0)
       br label %BB2
      BB2-split1:
       call void @bar(i32 1)
       br label %BB2
      BB2:
       %p = phi i32 [ 0, %BB2-split0 ], [ 1, %BB2-split1 ]

Reviewers: davidxl, huntergr, chandlerc, mcrosier, eraman, davide

Reviewed By: davidxl

Subscribers: sdesmalen, ashutosh.nema, fhahn, mssimpso, aemerson, mgorny, mehdi_amini, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D39137

llvm-svn: 317351
---
 llvm/include/llvm/InitializePasses.h               |   1 +
 llvm/include/llvm/Transforms/Scalar.h              |   8 +
 .../llvm/Transforms/Scalar/CallSiteSplitting.h     |  29 ++
 llvm/lib/Passes/PassBuilder.cpp                    |   9 +-
 llvm/lib/Passes/PassRegistry.def                   |   1 +
 llvm/lib/Transforms/IPO/PassManagerBuilder.cpp     |   6 +
 llvm/lib/Transforms/Scalar/CMakeLists.txt          |   1 +
 llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp   | 492 +++++++++++++++++++++
 llvm/lib/Transforms/Scalar/Scalar.cpp              |   1 +
 llvm/test/Other/new-pm-defaults.ll                 |   1 +
 llvm/test/Other/new-pm-lto-defaults.ll             |   9 +-
 llvm/test/Other/new-pm-thinlto-defaults.ll         |   1 +
 .../CallSiteSplitting/callsite-split-or-phi.ll     | 339 ++++++++++++++
 .../Transforms/CallSiteSplitting/callsite-split.ll | 119 +++++
 14 files changed, 1014 insertions(+), 3 deletions(-)
 create mode 100644 llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h
 create mode 100644 llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
 create mode 100644 llvm/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
 create mode 100644 llvm/test/Transforms/CallSiteSplitting/callsite-split.ll

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index b8183d1..9cdb493 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -80,6 +80,7 @@ void initializeBranchFolderPassPass(PassRegistry&);
 void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&);
 void initializeBranchRelaxationPass(PassRegistry&);
 void initializeBreakCriticalEdgesPass(PassRegistry&);
+void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
 void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
 void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&);
 void initializeCFGPrinterLegacyPassPass(PassRegistry&);
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index a78c897..0cf1115 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -73,6 +73,14 @@ FunctionPass *createDeadCodeEliminationPass();
 //
 FunctionPass *createDeadStoreEliminationPass();
 
+
+//===----------------------------------------------------------------------===//
+//
+// CallSiteSplitting - This pass split call-site based on its known argument
+// values.
+FunctionPass *createCallSiteSplittingPass();
+
+
 //===----------------------------------------------------------------------===//
 //
 // AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm.  This
diff --git a/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h
new file mode 100644
index 0000000..5ab951a
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h
@@ -0,0 +1,29 @@
+//===- CallSiteSplitting..h - Callsite Splitting ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
+#define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/Compiler.h"
+#include <vector>
+
+namespace llvm {
+
+struct CallSiteSplittingPass : PassInfoMixin<CallSiteSplittingPass> {
+  /// \brief Run the pass over the function.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 21d95a0..2088ea0 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -89,6 +89,7 @@
 #include "llvm/Transforms/Scalar/ADCE.h"
 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
 #include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
 #include "llvm/Transforms/Scalar/ConstantHoisting.h"
 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
 #include "llvm/Transforms/Scalar/DCE.h"
@@ -548,6 +549,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   EarlyFPM.addPass(SROA());
   EarlyFPM.addPass(EarlyCSEPass());
   EarlyFPM.addPass(LowerExpectIntrinsicPass());
+  if (Level == O3)
+    EarlyFPM.addPass(CallSiteSplittingPass());
+
   // In SamplePGO ThinLTO backend, we need instcombine before profile annotation
   // to convert bitcast to direct calls so that they can be inlined during the
   // profile annotation prepration step.
@@ -920,13 +924,16 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   MPM.addPass(InferFunctionAttrsPass());
 
   if (Level > 1) {
+    FunctionPassManager EarlyFPM(DebugLogging);
+    EarlyFPM.addPass(CallSiteSplittingPass());
+    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
+
     // Indirect call promotion. This should promote all the targets that are
     // left by the earlier promotion pass that promotes intra-module targets.
     // This two-step promotion is to save the compile time. For LTO, it should
     // produce the same result as if we only do promotion here.
     MPM.addPass(PGOIndirectCallPromotion(
         true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty()));
-
     // Propagate constants at call sites into the functions they call.  This
     // opens opportunities for globalopt (and inlining) by substituting function
     // pointers passed as arguments to direct uses of functions.
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 20d1220..40b8843 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -140,6 +140,7 @@ FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass())
 FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass())
 FUNCTION_PASS("bdce", BDCEPass())
 FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
+FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass())
 FUNCTION_PASS("consthoist", ConstantHoistingPass())
 FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass())
 FUNCTION_PASS("dce", DCEPass())
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 828eb5e..b8ff614 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -467,6 +467,9 @@ void PassManagerBuilder::populateModulePassManager(
 
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
+  if (OptLevel > 2)
+    MPM.add(createCallSiteSplittingPass());
+
   MPM.add(createIPSCCPPass());          // IP SCCP
   MPM.add(createCalledValuePropagationPass());
   MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
@@ -703,6 +706,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   PM.add(createInferFunctionAttrsLegacyPass());
 
   if (OptLevel > 1) {
+    // Split call-site with more constrained arguments.
+    PM.add(createCallSiteSplittingPass());
+
     // Indirect call promotion. This should promote all the targets that are
     // left by the earlier promotion pass that promotes intra-module targets.
     // This two-step promotion is to save the compile time. For LTO, it should
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index d79ae85..6a27fbc 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(LLVMScalarOpts
   ADCE.cpp
   AlignmentFromAssumptions.cpp
   BDCE.cpp
+  CallSiteSplitting.cpp
   ConstantHoisting.cpp
   ConstantProp.cpp
   CorrelatedValuePropagation.cpp
diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
new file mode 100644
index 0000000..251e332
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -0,0 +1,492 @@
+//===- CallSiteSplitting.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that tries to split a call-site to pass
+// more constrained arguments if its argument is predicated in the control flow
+// so that we can expose better context to the later passes (e.g, inliner, jump
+// threading, or IPA-CP based function cloning, etc.).
+// As of now we support two cases :
+//
+// 1) If a call site is dominated by an OR condition and if any of its arguments
+// are predicated on this OR condition, try to split the condition with more
+// constrained arguments. For example, in the code below, we try to split the
+// call site since we can predicate the argument(ptr) based on the OR condition.
+//
+// Split from :
+//   if (!ptr || c)
+//     callee(ptr);
+// to :
+//   if (!ptr)
+//     callee(null)         // set the known constant value
+//   else if (c)
+//     callee(nonnull ptr)  // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+//   Header:
+//    %c = icmp eq i32 %i1, %i2
+//    br i1 %c, label %Tail, label %TBB
+//   TBB:
+//    br label Tail%
+//   Tail:
+//    %p = phi i32 [ 0, %Header], [ 1, %TBB]
+//    call void @bar(i32 %p)
+// to
+//   Header:
+//    %c = icmp eq i32 %i1, %i2
+//    br i1 %c, label %Tail-split0, label %TBB
+//   TBB:
+//    br label %Tail-split1
+//   Tail-split0:
+//    call void @bar(i32 0)
+//    br label %Tail
+//   Tail-split1:
+//    call void @bar(i32 1)
+//    br label %Tail
+//   Tail:
+//    %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-site split");
+
+static void addNonNullAttribute(Instruction *CallI, Instruction *&NewCallI,
+                                Value *Op) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.addParamAttr(ArgNo, Attribute::NonNull);
+    ++ArgNo;
+  }
+}
+
+static void setConstantInArgument(Instruction *CallI, Instruction *&NewCallI,
+                                  Value *Op, Constant *ConstValue) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.setArgument(ArgNo, ConstValue);
+    ++ArgNo;
+  }
+}
+
+static bool createCallSitesOnOrPredicatedArgument(
+    CallSite CS, Instruction *&NewCSTakenFromHeader,
+    Instruction *&NewCSTakenFromNextCond,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *HeaderBB) {
+  assert(BranchInsts.size() <= 2 &&
+         "Unexpected number of blocks in the OR predicated condition");
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *CallSiteBB = Instr->getParent();
+  TerminatorInst *HeaderTI = HeaderBB->getTerminator();
+  bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0);
+
+  for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) {
+    BranchInst *PBI = BranchInsts[I];
+    assert(isa<ICmpInst>(PBI->getCondition()) &&
+           "Unexpected condition in a conditional branch.");
+    ICmpInst *Cmp = cast<ICmpInst>(PBI->getCondition());
+    Value *Arg = Cmp->getOperand(0);
+    assert(isa<Constant>(Cmp->getOperand(1)) &&
+           "Expected op1 to be a constant.");
+    Constant *ConstVal = cast<Constant>(Cmp->getOperand(1));
+    CmpInst::Predicate Pred = Cmp->getPredicate();
+
+    if (PBI->getParent() == HeaderBB) {
+      Instruction *&CallTakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond;
+      Instruction *&CallUntakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader;
+
+      assert(Pred == ICmpInst::ICMP_EQ ||
+             Pred == ICmpInst::ICMP_NE &&
+                 "Unexpected predicate in an OR condition");
+
+      // Set the constant value for agruments in the call predicated based on
+      // the OR condition.
+      Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ
+                                         ? CallTakenFromHeader
+                                         : CallUntakenFromHeader;
+      setConstantInArgument(Instr, CallToSetConst, Arg, ConstVal);
+
+      // Add the NonNull attribute if compared with the null pointer.
+      if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+        Instruction *&CallToSetAttr = Pred == ICmpInst::ICMP_EQ
+                                          ? CallUntakenFromHeader
+                                          : CallTakenFromHeader;
+        addNonNullAttribute(Instr, CallToSetAttr, Arg);
+      }
+      continue;
+    }
+
+    if (Pred == ICmpInst::ICMP_EQ) {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Set the constant value for the call taken from the second block in
+        // the OR condition.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      }
+    } else {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      } else if (Pred == ICmpInst::ICMP_NE) {
+        // Set the constant value for the call in the untaken path from the
+        // header block.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else
+        llvm_unreachable("Unexpected condition");
+    }
+  }
+  return NewCSTakenFromHeader || NewCSTakenFromNextCond;
+}
+
+static bool canSplitCallSite(CallSite CS) {
+  // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+  // without too much effort.
+  Instruction *Instr = CS.getInstruction();
+  if (!isa<CallInst>(Instr))
+    return false;
+
+  // Allow splitting a call-site only when there is no instruction before the
+  // call-site in the basic block. Based on this constraint, we only clone the
+  // call instruction, and we do not move a call-site across any other
+  // instruction.
+  BasicBlock *CallSiteBB = Instr->getParent();
+  if (Instr != CallSiteBB->getFirstNonPHI())
+    return false;
+
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  unsigned NumPreds = std::distance(PII, PIE);
+
+  // Allow only one extra call-site. No more than two from one call-site.
+  if (NumPreds != 2)
+    return false;
+
+  // Cannot split an edge from an IndirectBrInst.
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  if (isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+      isa<IndirectBrInst>(Preds[1]->getTerminator()))
+    return false;
+
+  return CallSiteBB->canSplitPredecessors();
+}
+
+/// Return true if the CS is split into its new predecessors which are directly
+/// hooked to each of its orignial predecessors pointed by PredBB1 and PredBB2.
+/// Note that PredBB1 and PredBB2 are decided in findPredicatedArgument(),
+/// especially for the OR predicated case where PredBB1 will point the header,
+/// and PredBB2 will point the the second compare block. CallInst1 and CallInst2
+/// will be the new call-sites placed in the new predecessors split for PredBB1
+/// and PredBB2, repectively. Therefore, CallInst1 will be the call-site placed
+/// between Header and Tail, and CallInst2 will be the call-site between TBB and
+/// Tail. For example, in the IR below with an OR condition, the call-site can
+/// be split
+///
+/// from :
+///
+///   Header:
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail, %TBB
+///   TBB:
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c %Tail, %End
+///   Tail:
+///     %ca = call i1  @callee (i32* %a, i32* %b)
+///
+///  to :
+///
+///   Header:                          // PredBB1 is Header
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail-split1, %TBB
+///   TBB:                             // PredBB2 is TBB
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c %Tail-split2, %End
+///   Tail-split1:
+///     %ca1 = call @callee (i32* null, i32* %b)         // CallInst1
+///    br %Tail
+///   Tail-split2:
+///     %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+///    br %Tail
+///   Tail:
+///    %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that for an OR predicated case, CallInst1 and CallInst2 should be
+/// created with more constrained arguments in
+/// createCallSitesOnOrPredicatedArgument().
+static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
+                          Instruction *CallInst1, Instruction *CallInst2) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *TailBB = Instr->getParent();
+  assert(Instr == (TailBB->getFirstNonPHI()) && "Unexpected call-site");
+
+  BasicBlock *SplitBlock1 =
+      SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
+  BasicBlock *SplitBlock2 =
+      SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
+
+  assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
+
+  if (!CallInst1)
+    CallInst1 = Instr->clone();
+  if (!CallInst2)
+    CallInst2 = Instr->clone();
+
+  CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
+  CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
+
+  CallSite CS1(CallInst1);
+  CallSite CS2(CallInst2);
+
+  // Handle PHIs used as arguments in the call-site.
+  for (auto &PI : *TailBB) {
+    PHINode *PN = dyn_cast<PHINode>(&PI);
+    if (!PN)
+      break;
+    unsigned ArgNo = 0;
+    for (auto &CI : CS.args()) {
+      if (&*CI == PN) {
+        CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1));
+        CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2));
+      }
+      ++ArgNo;
+    }
+  }
+
+  // Replace users of the original call with a PHI mering call-sites split.
+  if (Instr->getNumUses()) {
+    PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", Instr);
+    PN->addIncoming(CallInst1, SplitBlock1);
+    PN->addIncoming(CallInst2, SplitBlock2);
+    Instr->replaceAllUsesWith(PN);
+  }
+  DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+  DEBUG(dbgs() << "    " << *CallInst1 << " in " << SplitBlock1->getName()
+               << "\n");
+  DEBUG(dbgs() << "    " << *CallInst2 << " in " << SplitBlock2->getName()
+               << "\n");
+  Instr->eraseFromParent();
+  NumCallSiteSplit++;
+}
+
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+  assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+  Value *Op0 = Cmp->getOperand(0);
+  unsigned ArgNo = 0;
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+       ++I, ++ArgNo) {
+    // Don't consider constant or arguments that are already known non-null.
+    if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+      continue;
+
+    if (*I == Op0)
+      return true;
+  }
+  return false;
+}
+
+static void findOrCondRelevantToCallArgument(
+    CallSite CS, BasicBlock *PredBB, BasicBlock *OtherPredBB,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *&HeaderBB) {
+  auto *PBI = dyn_cast<BranchInst>(PredBB->getTerminator());
+  if (!PBI || !PBI->isConditional())
+    return;
+
+  if (PBI->getSuccessor(0) == OtherPredBB ||
+      PBI->getSuccessor(1) == OtherPredBB)
+    if (PredBB == OtherPredBB->getSinglePredecessor()) {
+      assert(!HeaderBB && "Expect to find only a single header block");
+      HeaderBB = PredBB;
+    }
+
+  CmpInst::Predicate Pred;
+  Value *Cond = PBI->getCondition();
+  if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+    return;
+  ICmpInst *Cmp = cast<ICmpInst>(Cond);
+  if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+    if (isCondRelevantToAnyCallArgument(Cmp, CS))
+      BranchInsts.push_back(PBI);
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *Parent = Instr->getParent();
+  if (Instr != Parent->getFirstNonPHI())
+    return false;
+
+  for (auto &BI : *Parent) {
+    if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
+      for (auto &I : CS.args())
+        if (&*I == PN) {
+          assert(PN->getNumIncomingValues() == 2 &&
+                 "Unexpected number of incoming values");
+          if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
+            return false;
+          if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
+            continue;
+          if (isa<Constant>(PN->getIncomingValue(0)) &&
+              isa<Constant>(PN->getIncomingValue(1)))
+            return true;
+        }
+    }
+    break;
+  }
+  return false;
+}
+
+// Return true if an agument in CS is predicated on an 'or' condition.
+// Create new call-site with arguments constrained based on the OR condition.
+static bool findPredicatedOnOrCondition(CallSite CS, BasicBlock *PredBB1,
+                                        BasicBlock *PredBB2,
+                                        Instruction *&NewCallTakenFromHeader,
+                                        Instruction *&NewCallTakenFromNextCond,
+                                        BasicBlock *&HeaderBB) {
+  SmallVector<BranchInst *, 4> BranchInsts;
+  findOrCondRelevantToCallArgument(CS, PredBB1, PredBB2, BranchInsts, HeaderBB);
+  findOrCondRelevantToCallArgument(CS, PredBB2, PredBB1, BranchInsts, HeaderBB);
+  if (BranchInsts.empty() || !HeaderBB)
+    return false;
+
+  // If an OR condition is detected, try to create call sites with constrained
+  // arguments (e.g., NonNull attribute or constant value).
+  return createCallSitesOnOrPredicatedArgument(CS, NewCallTakenFromHeader,
+                                               NewCallTakenFromNextCond,
+                                               BranchInsts, HeaderBB);
+}
+
+static bool findPredicatedArgument(CallSite CS, Instruction *&CallInst1,
+                                   Instruction *&CallInst2,
+                                   BasicBlock *&PredBB1, BasicBlock *&PredBB2) {
+  BasicBlock *CallSiteBB = CS.getInstruction()->getParent();
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  assert(std::distance(PII, PIE) == 2 && "Expect only two predecessors.");
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  BasicBlock *&HeaderBB = PredBB1;
+  if (!findPredicatedOnOrCondition(CS, Preds[0], Preds[1], CallInst1, CallInst2,
+                                   HeaderBB) &&
+      !isPredicatedOnPHI(CS))
+    return false;
+
+  if (!PredBB1)
+    PredBB1 = Preds[0];
+
+  PredBB2 = PredBB1 == Preds[0] ? Preds[1] : Preds[0];
+  return true;
+}
+
+static bool tryToSplitCallSite(CallSite CS) {
+  if (!CS.arg_size())
+    return false;
+
+  BasicBlock *PredBB1 = nullptr;
+  BasicBlock *PredBB2 = nullptr;
+  Instruction *CallInst1 = nullptr;
+  Instruction *CallInst2 = nullptr;
+  if (!canSplitCallSite(CS) ||
+      !findPredicatedArgument(CS, CallInst1, CallInst2, PredBB1, PredBB2)) {
+    assert(!CallInst1 && !CallInst2 && "Unexpected new call-sites cloned.");
+    return false;
+  }
+  splitCallSite(CS, PredBB1, PredBB2, CallInst1, CallInst2);
+  return true;
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+  bool Changed = false;
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+    BasicBlock &BB = *BI++;
+    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+      Instruction *I = &*II++;
+      CallSite CS(cast<Value>(I));
+      if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+        continue;
+
+      Function *Callee = CS.getCalledFunction();
+      if (!Callee || Callee->isDeclaration())
+        continue;
+      Changed |= tryToSplitCallSite(CS);
+    }
+  }
+  return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+  static char ID;
+  CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+    initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    return doCallSiteSplitting(F, TLI);
+  }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+                      "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+                    "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+  return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+  if (!doCallSiteSplitting(F, TLI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  return PA;
+}
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index c1034ac..8a5ae1b 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -35,6 +35,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCELegacyPassPass(Registry);
   initializeBDCELegacyPassPass(Registry);
   initializeAlignmentFromAssumptionsPass(Registry);
+  initializeCallSiteSplittingLegacyPassPass(Registry);
   initializeConstantHoistingLegacyPassPass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 816f753..0810a13 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -76,6 +76,7 @@
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
 ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
 ; CHECK-O-NEXT: Running pass: IPSCCPPass
 ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index fc52f70..878198d 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -29,9 +29,14 @@
 ; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Module
+; CHECK-O2-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo
+; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo
+; CHECK-O2-NEXT: Finished llvm::Function pass manager run.
 ; CHECK-O2-NEXT: PGOIndirectCallPromotion
 ; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis
-; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Function
 ; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O2-NEXT: Running pass: IPSCCPPass
 ; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass
@@ -42,7 +47,7 @@
 ; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}>
 ; CHECK-O-NEXT: Running analysis: AAManager
-; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis
 ; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
 ; CHECK-O-NEXT: Running pass: GlobalSplitPass
diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll
index 7d40ef3..e83f0f87 100644
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -72,6 +72,7 @@
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
 ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
 ; CHECK-O-NEXT: Running pass: IPSCCPPass
 ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass
diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
new file mode 100644
index 0000000..d1d854d
--- /dev/null
+++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
@@ -0,0 +1,339 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s  -passes='function(callsite-splitting)' -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linaro-linux-gnueabi"
+
+;CHECK-LABEL: @test_eq_eq
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_eq
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_eq(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp ne i32* %a, null
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_ne
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_ne(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp ne i32* %a, null
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp ne i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_eq_eq_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq_untaken(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_eq_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_eq_untaken(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp ne i32* %a, null
+  br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_ne_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_ne_untaken(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp ne i32* %a, null
+  br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+  %cmp = icmp ne i32 %v, 1
+  br i1 %cmp, label %End, label %Tail
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_const_phi
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) {
+Header:
+  %tobool1 = icmp eq i32* %a, %b
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_nonconst_phi
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) {
+Header:
+  %tobool1 = icmp eq i32* %a, %b
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, %v2 
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) {
+Header:
+  %tobool1 = icmp eq i32* %a, %b
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, %v2 
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[%v,%Header], [%v2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_fisrtnonphi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_fisrtnonphi(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  store i32 %v, i32* %a
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_3preds_constphi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) {
+Header:
+  br i1 %c1, label %Tail, label %TBB1
+
+TBB1:
+  br i1 %c2, label %Tail, label %TBB2
+
+TBB2:
+  br i1 %c3, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_indirectbr_phi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) {
+Header:
+   %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address
+   indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail]
+
+TBB:
+  %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address
+  indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End]
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+define i32 @callee(i32* %a, i32 %v, i32 %p) {
+entry:
+  %c = icmp ne i32* %a, null
+  br i1 %c, label %BB1, label %BB2
+
+BB1:
+  call void @dummy(i32* %a, i32 %p)
+  br label %End
+
+BB2:
+  call void @dummy2(i32 %v, i32 %p)
+  br label %End
+
+End:
+  ret i32 %p
+}
+
+declare void @dummy(i32*, i32)
+declare void @dummy2(i32, i32)
diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split.ll
new file mode 100644
index 0000000..419fa73
--- /dev/null
+++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s
+; RUN: opt < %s  -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linaro-linux-gnueabi"
+
+%struct.bitmap = type { i32, %struct.bitmap* }
+
+;CHECK-LABEL: @caller
+;CHECK-LABEL: NextCond:
+;CHECK: br {{.*}} label %callee.exit
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
+;CHECK-LABEL: callee.exit:
+;CHECK: call void @dummy2(%struct.bitmap* %a_elt)
+
+define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, null
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, null
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %p = phi i1 [0, %Top], [%c, %NextCond]
+  call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p)
+  br label %End
+
+End:
+  ret void
+}
+
+define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) {
+entry:
+  %tobool = icmp ne %struct.bitmap* %a_elt, null
+  %tobool1 = icmp ne %struct.bitmap* %b_elt, null
+  %or.cond = and i1 %tobool, %tobool1
+  br i1 %or.cond, label %Cond, label %Big
+
+Cond:
+  %cmp = icmp eq %struct.bitmap*  %dst_elt, %a_elt
+  br i1 %cmp, label %Small, label %Big
+
+Small:
+  call void @dummy2(%struct.bitmap* %a_elt)
+  br label %End
+
+Big:
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  br label %End
+
+End:
+  ret void
+}
+
+declare void @dummy2(%struct.bitmap*)
+declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*)
+
+
+;CHECK-LABEL: @caller2
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @dummy4()
+;CHECK-LABEL: CallSiteBB.predBB2.split:
+;CHECK: call void @dummy3()
+;CheCK-LABEL: CallSiteBB:
+;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ]
+;CHECK: call void @foo(i1 %phi.call)
+define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %phi = phi i1 [0, %Top],[1, %NextCond]
+  %u = call i1 @callee2(i1 %phi)
+  call void @foo(i1 %u)
+  br label %End
+
+End:
+  ret void
+}
+
+define i1 @callee2(i1 %b) {
+entry:
+  br i1 %b, label %BB1, label %BB2
+
+BB1:
+  call void @dummy3()
+  br label %End
+
+BB2:
+  call void @dummy4()
+  br label %End
+
+End:
+  ret i1 %b
+}
+
+declare void @dummy3()
+declare void @dummy4()
+declare void @foo(i1)
-- 
2.7.4