#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
-#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;
using namespace omp;
cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
cl::init(false));
+static cl::opt<bool> EnableParallelRegionMerging(
+ "openmp-opt-enable-merging", cl::ZeroOrMore,
+ cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
+ cl::init(false));
+
static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
cl::Hidden);
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
STATISTIC(
NumOpenMPParallelRegionsReplacedInGPUStateMachine,
"Number of OpenMP parallel regions replaced with ID in GPU state machines");
+STATISTIC(NumOpenMPParallelRegionsMerged,
+ "Number of OpenMP parallel regions merged");
#if !defined(NDEBUG)
static constexpr auto TAG = "[" DEBUG_TYPE "]";
// Recollect uses, in case Attributor deleted any.
OMPInfoCache.recollectUses();
- Changed |= deduplicateRuntimeCalls();
Changed |= deleteParallelRegions();
if (HideMemoryTransferLatency)
Changed |= hideMemTransfersLatency();
if (remarksEnabled())
analysisGlobalization();
+ Changed |= deduplicateRuntimeCalls();
+ if (EnableParallelRegionMerging) {
+ if (mergeParallelRegions()) {
+ deduplicateRuntimeCalls();
+ Changed = true;
+ }
+ }
return Changed;
}
}
private:
+  /// Try to merge adjacent parallel regions (i.e., __kmpc_fork_call sites)
+  /// that are only separated by instructions which every thread of the merged
+  /// region can safely execute redundantly.
+  ///
+  /// \returns true if the IR was changed.
+  bool mergeParallelRegions() {
+    // Operand positions in a __kmpc_fork_call: the outlined callback and the
+    // first payload argument that is forwarded to it.
+    const unsigned CallbackCalleeOperand = 2;
+    const unsigned CallbackFirstArgOperand = 3;
+    using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+
+    // Check if there are any __kmpc_fork_call calls to merge.
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
+
+    if (!RFI.Declaration)
+      return false;
+
+    // Check if there are any __kmpc_push_proc_bind calls for explicit
+    // affinities.
+    OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind];
+
+    // Defensively abort if explicit affinities are set.
+    // TODO: Track ICV proc_bind to merge when mergable regions have the same
+    //       affinity.
+    if (ProcBindRFI.Declaration)
+      return false;
+
+    bool Changed = false;
+    LoopInfo *LI = nullptr;
+    DominatorTree *DT = nullptr;
+
+    // Map from a basic block to the __kmpc_fork_call calls it contains.
+    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
+
+    // Set by the Merge helper below before the OpenMPIRBuilder callbacks run;
+    // they delimit the isolated block holding the calls being merged.
+    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
+    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                         BasicBlock &ContinuationIP) {
+      // Splice the pre-isolated [StartBB, EndBB] region into the body of the
+      // new parallel region instead of emitting fresh code.
+      BasicBlock *CGStartBB = CodeGenIP.getBlock();
+      BasicBlock *CGEndBB =
+          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
+      assert(StartBB != nullptr && "StartBB should not be null");
+      CGStartBB->getTerminator()->setSuccessor(0, StartBB);
+      assert(EndBB != nullptr && "EndBB should not be null");
+      EndBB->getTerminator()->setSuccessor(0, CGEndBB);
+    };
+
+    // Values stay shared in the merged region; no privatization is performed.
+    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                      Value &VPtr, Value *&ReplacementValue) -> InsertPointTy {
+      ReplacementValue = &VPtr;
+      return CodeGenIP;
+    };
+
+    // No finalization (e.g., cancellation cleanup) is required.
+    auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+
+    // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
+    // contained in BB and only separated by instructions that can be
+    // redundantly executed in parallel. The block BB is split before the first
+    // call (in MergableCIs) and after the last so the entire region we merge
+    // into a single parallel region is contained in a single basic block
+    // without any other instructions. We use the OpenMPIRBuilder to outline
+    // that block and call the resulting function via __kmpc_fork_call.
+    auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
+      // TODO: Change the interface to allow single CIs expanded, e.g, to
+      // include an outer loop.
+      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
+
+      auto Remark = [&](OptimizationRemark OR) {
+        OR << "Parallel region at "
+           << ore::NV("OpenMPParallelMergeFront",
+                      MergableCIs.front()->getDebugLoc())
+           << " merged with parallel regions at ";
+        for (auto *CI :
+             llvm::make_range(MergableCIs.begin() + 1, MergableCIs.end())) {
+          OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
+          if (CI != MergableCIs.back())
+            OR << ", ";
+        }
+        return OR;
+      };
+
+      emitRemark<OptimizationRemark>(MergableCIs.front(),
+                                     "OpenMPParallelRegionMerging", Remark);
+
+      Function *OriginalFn = BB->getParent();
+      LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
+                        << " parallel regions in " << OriginalFn->getName()
+                        << "\n");
+
+      // Isolate the calls to merge in a separate block.
+      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
+      BasicBlock *AfterBB =
+          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
+      StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
+                           "omp.par.merged");
+
+      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
+      const DebugLoc DL = BB->getTerminator()->getDebugLoc();
+      BB->getTerminator()->eraseFromParent();
+
+      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
+                                               DL);
+      IRBuilder<>::InsertPoint AllocaIP(
+          &OriginalFn->getEntryBlock(),
+          OriginalFn->getEntryBlock().getFirstInsertionPt());
+      // Create the merged parallel region with default proc binding, to
+      // avoid overriding binding settings, and without explicit cancellation.
+      InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.CreateParallel(
+          Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
+          OMP_PROC_BIND_default, /* IsCancellable */ false);
+      BranchInst::Create(AfterBB, AfterIP.getBlock());
+
+      // Perform the actual outlining.
+      OMPInfoCache.OMPBuilder.finalize();
+
+      Function *OutlinedFn = MergableCIs.front()->getCaller();
+
+      // Replace the __kmpc_fork_call calls with direct calls to the outlined
+      // callbacks.
+      SmallVector<Value *, 8> Args;
+      for (auto *CI : MergableCIs) {
+        Value *Callee =
+            CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
+        FunctionType *FT =
+            cast<FunctionType>(Callee->getType()->getPointerElementType());
+        Args.clear();
+        // The outlined body provides the global/bound thread-id pointers
+        // expected by the callback as its first two arguments.
+        Args.push_back(OutlinedFn->getArg(0));
+        Args.push_back(OutlinedFn->getArg(1));
+        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
+             U < E; ++U)
+          Args.push_back(CI->getArgOperand(U));
+
+        CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
+        if (CI->getDebugLoc())
+          NewCI->setDebugLoc(CI->getDebugLoc());
+
+        // Forward parameter attributes from the callback to the callee.
+        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
+             U < E; ++U)
+          for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
+            NewCI->addParamAttr(
+                U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
+
+        // Emit an explicit barrier to replace the implicit fork-join barrier.
+        if (CI != MergableCIs.back()) {
+          // TODO: Remove barrier if the merged parallel region includes the
+          // 'nowait' clause.
+          OMPInfoCache.OMPBuilder.CreateBarrier(
+              InsertPointTy(NewCI->getParent(),
+                            NewCI->getNextNode()->getIterator()),
+              OMPD_parallel);
+        }
+
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region at "
+                    << ore::NV("OpenMPParallelMerge", CI->getDebugLoc())
+                    << " merged with "
+                    << ore::NV("OpenMPParallelMergeFront",
+                               MergableCIs.front()->getDebugLoc());
+        };
+        if (CI != MergableCIs.front())
+          emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
+                                         Remark);
+
+        CI->eraseFromParent();
+      }
+
+      assert(OutlinedFn != OriginalFn && "Outlining failed");
+      CGUpdater.registerOutlinedFunction(*OutlinedFn);
+      CGUpdater.reanalyzeFunction(*OriginalFn);
+
+      NumOpenMPParallelRegionsMerged += MergableCIs.size();
+
+      return true;
+    };
+
+    // Helper function that identifies sequences of __kmpc_fork_call uses in a
+    // basic block.
+    auto DetectPRsCB = [&](Use &U, Function &F) {
+      CallInst *CI = getCallIfRegularCall(U, &RFI);
+      // Ignore uses that are not direct, regular calls to __kmpc_fork_call
+      // (e.g., a use of the declaration through a constant expression);
+      // getCallIfRegularCall returns null for those.
+      if (!CI)
+        return false;
+      BB2PRMap[CI->getParent()].insert(CI);
+
+      return false;
+    };
+
+    BB2PRMap.clear();
+    RFI.foreachUse(SCC, DetectPRsCB);
+    SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
+    // Find mergable parallel regions within a basic block that are
+    // safe to merge, that is any in-between instructions can safely
+    // execute in parallel after merging.
+    // TODO: support merging across basic-blocks.
+    for (auto &It : BB2PRMap) {
+      auto &CIs = It.getSecond();
+      if (CIs.size() < 2)
+        continue;
+
+      BasicBlock *BB = It.getFirst();
+      SmallVector<CallInst *, 4> MergableCIs;
+
+      // Find maximal number of parallel region CIs that are safe to merge.
+      for (Instruction &I : *BB) {
+        if (CIs.count(&I)) {
+          MergableCIs.push_back(cast<CallInst>(&I));
+          continue;
+        }
+
+        // Instructions every thread can execute redundantly do not break a
+        // mergable sequence. The block terminator is not speculatable, so any
+        // pending sequence is flushed below before we leave the block.
+        if (isSafeToSpeculativelyExecute(&I, &I, DT))
+          continue;
+
+        if (MergableCIs.size() > 1) {
+          MergableCIsVector.push_back(MergableCIs);
+          LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
+                            << " parallel regions in block " << BB->getName()
+                            << " of function " << BB->getParent()->getName()
+                            << "\n";);
+        }
+
+        MergableCIs.clear();
+      }
+
+      if (!MergableCIsVector.empty()) {
+        Changed = true;
+
+        for (auto &MergableCIs : MergableCIsVector)
+          Merge(MergableCIs, BB);
+
+        // The calls collected above were merged (and erased). Clear the
+        // vector so sequences from this block are never re-merged against a
+        // later block's BB, which would dereference the freed CallInsts.
+        MergableCIsVector.clear();
+      }
+    }
+
+    if (Changed) {
+      // Update RFI info to set it up for later passes.
+      RFI.clearUsesMap();
+      OMPInfoCache.collectUses(RFI, /* CollectStats */ false);
+
+      // Collect uses for the emitted barrier calls.
+      OMPInformationCache::RuntimeFunctionInfo &BarrierRFI =
+          OMPInfoCache.RFIs[OMPRTL___kmpc_barrier];
+      BarrierRFI.clearUsesMap();
+      OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false);
+    }
+
+    return Changed;
+  }
+
/// Try to delete parallel regions if possible.
bool deleteParallelRegions() {
const unsigned CallbackCalleeOperand = 2;
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-opt-enable-merging < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+
+; void merge_all() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; #pragma omp parallel
+; {
+; a = 3;
+; }
+; }
+;
+; Merge all parallel regions.
+define dso_local void @merge_all() local_unnamed_addr {
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ store i32 1, i32* %1, align 4
+ %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+ %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
+ ret void
+}
+
+define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 3, i32* %2, align 4
+ ret void
+}
+
+define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 2, i32* %2, align 4
+ ret void
+}
+
+
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
+
+declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+
+; void merge_none() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; a = 3;
+; #pragma omp parallel
+; {
+; a = 4;
+; }
+; }
+;
+; Does not merge parallel regions, in-between store
+; instruction is unsafe to execute in parallel.
+define dso_local void @merge_none() local_unnamed_addr {
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ store i32 1, i32* %1, align 4
+ %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+ store i32 3, i32* %1, align 4
+ %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
+ ret void
+}
+
+define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 4, i32* %2, align 4
+ ret void
+}
+
+define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 2, i32* %2, align 4
+ ret void
+}
+
+; void merge_some() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; a = 3;
+; #pragma omp parallel
+; {
+; a = 4;
+; }
+; #pragma omp parallel
+; {
+; a = 5;
+; }
+; }
+;
+; Do not merge first parallel region, due to the
+; unsafe store, but merge the two next parallel
+; regions.
+define dso_local void @merge_some() local_unnamed_addr {
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ store i32 1, i32* %1, align 4
+ %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+ store i32 3, i32* %1, align 4
+ %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
+ %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
+ ret void
+}
+
+define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 5, i32* %2, align 4
+ ret void
+}
+
+define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 4, i32* %2, align 4
+ ret void
+}
+
+define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 2, i32* %2, align 4
+ ret void
+}
+
+; void merge_cancellable_regions(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+; {
+; if(cancel1) {
+; #pragma omp cancel parallel
+; }
+; }
+;
+; #pragma omp parallel
+; {
+; if (cancel2) {
+; #pragma omp cancel parallel
+; }
+; }
+; }
+;
+; Merge correctly cancellable regions.
+define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr {
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ store i32 %0, i32* %3, align 4
+ store i32 %1, i32* %4, align 4
+ %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
+ %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
+ ret void
+}
+
+define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
+ %4 = load i32, i32* %2, align 4
+ %5 = icmp eq i32 %4, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %3
+ ret void
+
+7: ; preds = %3
+ %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+ ret void
+}
+
+define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
+ %4 = load i32, i32* %2, align 4
+ %5 = icmp eq i32 %4, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %3
+ ret void
+
+7: ; preds = %3
+ %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+ ret void
+}
+
+declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!2}
+!2 = !{i64 2, i64 -1, i64 -1, i1 true}
+; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.3 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.3
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.2
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 5, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
+; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP4:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP5:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TMP4]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.1 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.1
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK: 6:
+; CHECK-NEXT: ret void
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK: 6:
+; CHECK-NEXT: ret void
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT: ret void
+;
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+
+; void merge_all() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; #pragma omp parallel
+; {
+; a = 3;
+; }
+; }
+;
+; Merge all parallel regions.
+define dso_local void @merge_all() local_unnamed_addr {
+; %1 is the shared variable 'a' from the C snippet above; both outlined
+; regions receive it as their captured argument.
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+; Nothing but a thread-num query sits between the two fork calls, so the
+; merging optimization is expected to fuse both regions into one fork.
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+; Outlined body of the second parallel region in @merge_all ('a = 3').
+define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 3, i32* %2, align 4
+  ret void
+}
+
+; Outlined body of the first parallel region in @merge_all ('a = 2').
+define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
+
+declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+
+; void merge_none() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; a = 3;
+; #pragma omp parallel
+; {
+; a = 4;
+; }
+; }
+;
+; Does not merge the parallel regions: the in-between store
+; instruction is unsafe to execute in parallel.
+define dso_local void @merge_none() local_unnamed_addr {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+; This store writes the same location the outlined regions write; hoisting
+; it into a merged parallel region would be unsafe, so no merge may happen.
+  store i32 3, i32* %1, align 4
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+; Outlined body of the second parallel region in @merge_none ('a = 4').
+define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 4, i32* %2, align 4
+  ret void
+}
+
+; Outlined body of the first parallel region in @merge_none ('a = 2').
+define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+; void merge_some() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; a = 3;
+; #pragma omp parallel
+; {
+; a = 4;
+; }
+; #pragma omp parallel
+; {
+; a = 5;
+; }
+; }
+;
+; Do not merge the first parallel region, due to the
+; unsafe in-between store, but merge the next two
+; parallel regions.
+define dso_local void @merge_some() local_unnamed_addr {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+; Unsafe in-between store: it blocks merging the first region with the rest.
+  store i32 3, i32* %1, align 4
+; The last two fork calls have only a thread-num query between them, so
+; they are expected to be merged with each other.
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+; Outlined body of the third parallel region in @merge_some ('a = 5').
+define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 5, i32* %2, align 4
+  ret void
+}
+
+; Outlined body of the second parallel region in @merge_some ('a = 4').
+define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 4, i32* %2, align 4
+  ret void
+}
+
+; Outlined body of the first parallel region in @merge_some ('a = 2').
+define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+; void merge_cancellable_regions(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+; {
+; if(cancel1) {
+; #pragma omp cancel parallel
+; }
+; }
+;
+; #pragma omp parallel
+; {
+; if (cancel2) {
+; #pragma omp cancel parallel
+; }
+; }
+; }
+;
+; Correctly merge the cancellable regions.
+define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr {
+; Each region captures its own cancellation flag (%3 for the first fork,
+; %4 for the second); the merged region must therefore take two arguments.
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
+  %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
+  ret void
+}
+
+; Outlined body of the second region: if the flag at %2 is nonzero, cancel
+; the parallel region via __kmpc_cancel; otherwise just return.
+define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
+  %4 = load i32, i32* %2, align 4
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %7
+
+6:                                                ; preds = %3
+  ret void
+
+7:                                                ; preds = %3
+  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+  ret void
+}
+
+; Outlined body of the first region: if the flag at %2 is nonzero, cancel
+; the parallel region via __kmpc_cancel; otherwise just return.
+define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
+  %4 = load i32, i32* %2, align 4
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %7
+
+6:                                                ; preds = %3
+  ret void
+
+7:                                                ; preds = %3
+  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+  ret void
+}
+
+declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!2}
+!2 = !{i64 2, i64 -1, i64 -1, i1 true}
+; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.2
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.5 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.5
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 5, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
+; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP4:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP5:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TMP4]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.6 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.6
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK: 6:
+; CHECK-NEXT: ret void
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK: 6:
+; CHECK-NEXT: ret void
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT: ret void
+;