#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
-#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;
using namespace omp;
cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
cl::init(false));
+static cl::opt<bool> EnableParallelRegionMerging(
+ "openmp-opt-enable-merging", cl::ZeroOrMore,
+ cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
+ cl::init(false));
+
static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
cl::Hidden);
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
STATISTIC(
NumOpenMPParallelRegionsReplacedInGPUStateMachine,
"Number of OpenMP parallel regions replaced with ID in GPU state machines");
+STATISTIC(NumOpenMPParallelRegionsMerged,
+ "Number of OpenMP parallel regions merged");
#if !defined(NDEBUG)
static constexpr auto TAG = "[" DEBUG_TYPE "]";
// Recollect uses, in case Attributor deleted any.
OMPInfoCache.recollectUses();
- Changed |= deduplicateRuntimeCalls();
Changed |= deleteParallelRegions();
if (HideMemoryTransferLatency)
Changed |= hideMemTransfersLatency();
if (remarksEnabled())
analysisGlobalization();
+ Changed |= deduplicateRuntimeCalls();
+ if (EnableParallelRegionMerging) {
+ if (mergeParallelRegions()) {
+ deduplicateRuntimeCalls();
+ Changed = true;
+ }
+ }
return Changed;
}
}
private:
+  /// Try to merge adjacent parallel regions (i.e., __kmpc_fork_call sites)
+  /// that are only separated by instructions which every thread of the merged
+  /// region can safely execute redundantly.
+  ///
+  /// \returns true if the IR was changed.
+  bool mergeParallelRegions() {
+    // Operand positions in a __kmpc_fork_call: the outlined callback and the
+    // first payload argument that is forwarded to it.
+    const unsigned CallbackCalleeOperand = 2;
+    const unsigned CallbackFirstArgOperand = 3;
+    using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+
+    // Check if there are any __kmpc_fork_call calls to merge.
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
+
+    if (!RFI.Declaration)
+      return false;
+
+    // Check if there are any __kmpc_push_proc_bind calls for explicit
+    // affinities.
+    OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind];
+
+    // Defensively abort if explicit affinities are set.
+    // TODO: Track ICV proc_bind to merge when mergable regions have the same
+    //       affinity.
+    if (ProcBindRFI.Declaration)
+      return false;
+
+    bool Changed = false;
+    LoopInfo *LI = nullptr;
+    DominatorTree *DT = nullptr;
+
+    // Map from a basic block to the __kmpc_fork_call calls it contains.
+    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
+
+    // Set by the Merge helper below before the OpenMPIRBuilder callbacks run;
+    // they delimit the isolated block holding the calls being merged.
+    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
+    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                         BasicBlock &ContinuationIP) {
+      // Splice the pre-isolated [StartBB, EndBB] region into the body of the
+      // new parallel region instead of emitting fresh code.
+      BasicBlock *CGStartBB = CodeGenIP.getBlock();
+      BasicBlock *CGEndBB =
+          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
+      assert(StartBB != nullptr && "StartBB should not be null");
+      CGStartBB->getTerminator()->setSuccessor(0, StartBB);
+      assert(EndBB != nullptr && "EndBB should not be null");
+      EndBB->getTerminator()->setSuccessor(0, CGEndBB);
+    };
+
+    // Values stay shared in the merged region; no privatization is performed.
+    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                      Value &VPtr, Value *&ReplacementValue) -> InsertPointTy {
+      ReplacementValue = &VPtr;
+      return CodeGenIP;
+    };
+
+    // No finalization (e.g., cancellation cleanup) is required.
+    auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+
+    // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
+    // contained in BB and only separated by instructions that can be
+    // redundantly executed in parallel. The block BB is split before the first
+    // call (in MergableCIs) and after the last so the entire region we merge
+    // into a single parallel region is contained in a single basic block
+    // without any other instructions. We use the OpenMPIRBuilder to outline
+    // that block and call the resulting function via __kmpc_fork_call.
+    auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
+      // TODO: Change the interface to allow single CIs expanded, e.g, to
+      // include an outer loop.
+      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
+
+      auto Remark = [&](OptimizationRemark OR) {
+        OR << "Parallel region at "
+           << ore::NV("OpenMPParallelMergeFront",
+                      MergableCIs.front()->getDebugLoc())
+           << " merged with parallel regions at ";
+        for (auto *CI :
+             llvm::make_range(MergableCIs.begin() + 1, MergableCIs.end())) {
+          OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
+          if (CI != MergableCIs.back())
+            OR << ", ";
+        }
+        return OR;
+      };
+
+      emitRemark<OptimizationRemark>(MergableCIs.front(),
+                                     "OpenMPParallelRegionMerging", Remark);
+
+      Function *OriginalFn = BB->getParent();
+      LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
+                        << " parallel regions in " << OriginalFn->getName()
+                        << "\n");
+
+      // Isolate the calls to merge in a separate block.
+      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
+      BasicBlock *AfterBB =
+          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
+      StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
+                           "omp.par.merged");
+
+      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
+      const DebugLoc DL = BB->getTerminator()->getDebugLoc();
+      BB->getTerminator()->eraseFromParent();
+
+      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
+                                               DL);
+      IRBuilder<>::InsertPoint AllocaIP(
+          &OriginalFn->getEntryBlock(),
+          OriginalFn->getEntryBlock().getFirstInsertionPt());
+      // Create the merged parallel region with default proc binding, to
+      // avoid overriding binding settings, and without explicit cancellation.
+      InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.CreateParallel(
+          Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
+          OMP_PROC_BIND_default, /* IsCancellable */ false);
+      BranchInst::Create(AfterBB, AfterIP.getBlock());
+
+      // Perform the actual outlining.
+      OMPInfoCache.OMPBuilder.finalize();
+
+      Function *OutlinedFn = MergableCIs.front()->getCaller();
+
+      // Replace the __kmpc_fork_call calls with direct calls to the outlined
+      // callbacks.
+      SmallVector<Value *, 8> Args;
+      for (auto *CI : MergableCIs) {
+        Value *Callee =
+            CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
+        FunctionType *FT =
+            cast<FunctionType>(Callee->getType()->getPointerElementType());
+        Args.clear();
+        // The outlined body provides the global/bound thread-id pointers
+        // expected by the callback as its first two arguments.
+        Args.push_back(OutlinedFn->getArg(0));
+        Args.push_back(OutlinedFn->getArg(1));
+        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
+             U < E; ++U)
+          Args.push_back(CI->getArgOperand(U));
+
+        CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
+        if (CI->getDebugLoc())
+          NewCI->setDebugLoc(CI->getDebugLoc());
+
+        // Forward parameter attributes from the callback to the callee.
+        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
+             U < E; ++U)
+          for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
+            NewCI->addParamAttr(
+                U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
+
+        // Emit an explicit barrier to replace the implicit fork-join barrier.
+        if (CI != MergableCIs.back()) {
+          // TODO: Remove barrier if the merged parallel region includes the
+          // 'nowait' clause.
+          OMPInfoCache.OMPBuilder.CreateBarrier(
+              InsertPointTy(NewCI->getParent(),
+                            NewCI->getNextNode()->getIterator()),
+              OMPD_parallel);
+        }
+
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region at "
+                    << ore::NV("OpenMPParallelMerge", CI->getDebugLoc())
+                    << " merged with "
+                    << ore::NV("OpenMPParallelMergeFront",
+                               MergableCIs.front()->getDebugLoc());
+        };
+        if (CI != MergableCIs.front())
+          emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
+                                         Remark);
+
+        CI->eraseFromParent();
+      }
+
+      assert(OutlinedFn != OriginalFn && "Outlining failed");
+      CGUpdater.registerOutlinedFunction(*OutlinedFn);
+      CGUpdater.reanalyzeFunction(*OriginalFn);
+
+      NumOpenMPParallelRegionsMerged += MergableCIs.size();
+
+      return true;
+    };
+
+    // Helper function that identifies sequences of __kmpc_fork_call uses in a
+    // basic block.
+    auto DetectPRsCB = [&](Use &U, Function &F) {
+      CallInst *CI = getCallIfRegularCall(U, &RFI);
+      // Ignore uses that are not direct, regular calls to __kmpc_fork_call
+      // (e.g., a use of the declaration through a constant expression);
+      // getCallIfRegularCall returns null for those.
+      if (!CI)
+        return false;
+      BB2PRMap[CI->getParent()].insert(CI);
+
+      return false;
+    };
+
+    BB2PRMap.clear();
+    RFI.foreachUse(SCC, DetectPRsCB);
+    SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
+    // Find mergable parallel regions within a basic block that are
+    // safe to merge, that is any in-between instructions can safely
+    // execute in parallel after merging.
+    // TODO: support merging across basic-blocks.
+    for (auto &It : BB2PRMap) {
+      auto &CIs = It.getSecond();
+      if (CIs.size() < 2)
+        continue;
+
+      BasicBlock *BB = It.getFirst();
+      SmallVector<CallInst *, 4> MergableCIs;
+
+      // Find maximal number of parallel region CIs that are safe to merge.
+      for (Instruction &I : *BB) {
+        if (CIs.count(&I)) {
+          MergableCIs.push_back(cast<CallInst>(&I));
+          continue;
+        }
+
+        // Instructions every thread can execute redundantly do not break a
+        // mergable sequence. The block terminator is not speculatable, so any
+        // pending sequence is flushed below before we leave the block.
+        if (isSafeToSpeculativelyExecute(&I, &I, DT))
+          continue;
+
+        if (MergableCIs.size() > 1) {
+          MergableCIsVector.push_back(MergableCIs);
+          LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
+                            << " parallel regions in block " << BB->getName()
+                            << " of function " << BB->getParent()->getName()
+                            << "\n";);
+        }
+
+        MergableCIs.clear();
+      }
+
+      if (!MergableCIsVector.empty()) {
+        Changed = true;
+
+        for (auto &MergableCIs : MergableCIsVector)
+          Merge(MergableCIs, BB);
+
+        // The calls collected above were merged (and erased). Clear the
+        // vector so sequences from this block are never re-merged against a
+        // later block's BB, which would dereference the freed CallInsts.
+        MergableCIsVector.clear();
+      }
+    }
+
+    if (Changed) {
+      // Update RFI info to set it up for later passes.
+      RFI.clearUsesMap();
+      OMPInfoCache.collectUses(RFI, /* CollectStats */ false);
+
+      // Collect uses for the emitted barrier calls.
+      OMPInformationCache::RuntimeFunctionInfo &BarrierRFI =
+          OMPInfoCache.RFIs[OMPRTL___kmpc_barrier];
+      BarrierRFI.clearUsesMap();
+      OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false);
+    }
+
+    return Changed;
+  }
+
/// Try to delete parallel regions if possible.
bool deleteParallelRegions() {
const unsigned CallbackCalleeOperand = 2;
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-opt-enable-merging < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+
+; void merge_all() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; #pragma omp parallel
+; {
+; a = 3;
+; }
+; }
+;
+; Merge all parallel regions.
+define dso_local void @merge_all() local_unnamed_addr {
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ store i32 1, i32* %1, align 4
+ %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+ %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
+ ret void
+}
+
+define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 3, i32* %2, align 4
+ ret void
+}
+
+define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 2, i32* %2, align 4
+ ret void
+}
+
+
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
+
+declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+
+; void merge_none() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; a = 3;
+; #pragma omp parallel
+; {
+; a = 4;
+; }
+; }
+;
+; Does not merge parallel regions, in-between store
+; instruction is unsafe to execute in parallel.
+define dso_local void @merge_none() local_unnamed_addr {
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ store i32 1, i32* %1, align 4
+ %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+ store i32 3, i32* %1, align 4
+ %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
+ ret void
+}
+
+define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 4, i32* %2, align 4
+ ret void
+}
+
+define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 2, i32* %2, align 4
+ ret void
+}
+
+; void merge_some() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; a = 3;
+; #pragma omp parallel
+; {
+; a = 4;
+; }
+; #pragma omp parallel
+; {
+; a = 5;
+; }
+; }
+;
+; Do not merge first parallel region, due to the
+; unsafe store, but merge the two next parallel
+; regions.
+define dso_local void @merge_some() local_unnamed_addr {
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ store i32 1, i32* %1, align 4
+ %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+ store i32 3, i32* %1, align 4
+ %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
+ %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
+ ret void
+}
+
+define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 5, i32* %2, align 4
+ ret void
+}
+
+define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 4, i32* %2, align 4
+ ret void
+}
+
+define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+ store i32 2, i32* %2, align 4
+ ret void
+}
+
+; void merge_cancellable_regions(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+; {
+; if(cancel1) {
+; #pragma omp cancel parallel
+; }
+; }
+;
+; #pragma omp parallel
+; {
+; if (cancel2) {
+; #pragma omp cancel parallel
+; }
+; }
+; }
+;
+; Merge correctly cancellable regions.
+define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr {
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ store i32 %0, i32* %3, align 4
+ store i32 %1, i32* %4, align 4
+ %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
+ %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
+ ret void
+}
+
+define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
+ %4 = load i32, i32* %2, align 4
+ %5 = icmp eq i32 %4, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %3
+ ret void
+
+7: ; preds = %3
+ %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+ ret void
+}
+
+define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
+ %4 = load i32, i32* %2, align 4
+ %5 = icmp eq i32 %4, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %3
+ ret void
+
+7: ; preds = %3
+ %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+ ret void
+}
+
+declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!2}
+!2 = !{i64 2, i64 -1, i64 -1, i1 true}
+; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.3 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.3
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.2
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 5, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
+; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP4:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP5:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TMP4]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.1 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.1
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK: 6:
+; CHECK-NEXT: ret void
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK: 6:
+; CHECK-NEXT: ret void
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT: ret void
+;
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+
+; void merge_all() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; #pragma omp parallel
+; {
+; a = 3;
+; }
+; }
+;
+; Merge all parallel regions.
+define dso_local void @merge_all() local_unnamed_addr {
+; %1 is the shared variable 'a' from the C snippet above; both outlined
+; regions receive it as their captured argument.
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+; Nothing but a thread-num query sits between the two fork calls, so the
+; merging optimization is expected to fuse both regions into one fork.
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+; Outlined body of the second parallel region in @merge_all ('a = 3').
+define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 3, i32* %2, align 4
+  ret void
+}
+
+; Outlined body of the first parallel region in @merge_all ('a = 2').
+define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
+
+declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+
+; void merge_none() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; a = 3;
+; #pragma omp parallel
+; {
+; a = 4;
+; }
+; }
+;
+; Does not merge the parallel regions: the in-between store
+; instruction is unsafe to execute in parallel.
+define dso_local void @merge_none() local_unnamed_addr {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+; This store writes the same location the outlined regions write; hoisting
+; it into a merged parallel region would be unsafe, so no merge may happen.
+  store i32 3, i32* %1, align 4
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+; Outlined body of the second parallel region in @merge_none ('a = 4').
+define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 4, i32* %2, align 4
+  ret void
+}
+
+; Outlined body of the first parallel region in @merge_none ('a = 2').
+define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+; void merge_some() {
+; int a = 1;
+; #pragma omp parallel
+; {
+; a = 2;
+; }
+; a = 3;
+; #pragma omp parallel
+; {
+; a = 4;
+; }
+; #pragma omp parallel
+; {
+; a = 5;
+; }
+; }
+;
+; Do not merge the first parallel region, due to the
+; unsafe in-between store, but merge the next two
+; parallel regions.
+define dso_local void @merge_some() local_unnamed_addr {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+; Unsafe in-between store: it blocks merging the first region with the rest.
+  store i32 3, i32* %1, align 4
+; The last two fork calls have only a thread-num query between them, so
+; they are expected to be merged with each other.
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+; Outlined body of the third parallel region in @merge_some ('a = 5').
+define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 5, i32* %2, align 4
+  ret void
+}
+
+; Outlined body of the second parallel region in @merge_some ('a = 4').
+define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 4, i32* %2, align 4
+  ret void
+}
+
+; Outlined body of the first parallel region in @merge_some ('a = 2').
+define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+; void merge_cancellable_regions(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+; {
+; if(cancel1) {
+; #pragma omp cancel parallel
+; }
+; }
+;
+; #pragma omp parallel
+; {
+; if (cancel2) {
+; #pragma omp cancel parallel
+; }
+; }
+; }
+;
+; Correctly merge the cancellable regions.
+define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr {
+; Each region captures its own cancellation flag (%3 for the first fork,
+; %4 for the second); the merged region must therefore take two arguments.
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
+  %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
+  ret void
+}
+
+; Outlined body of the second region: if the flag at %2 is nonzero, cancel
+; the parallel region via __kmpc_cancel; otherwise just return.
+define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
+  %4 = load i32, i32* %2, align 4
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %7
+
+6:                                                ; preds = %3
+  ret void
+
+7:                                                ; preds = %3
+  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+  ret void
+}
+
+; Outlined body of the first region: if the flag at %2 is nonzero, cancel
+; the parallel region via __kmpc_cancel; otherwise just return.
+define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
+  %4 = load i32, i32* %2, align 4
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %7
+
+6:                                                ; preds = %3
+  ret void
+
+7:                                                ; preds = %3
+  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+  ret void
+}
+
+declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!2}
+!2 = !{i64 2, i64 -1, i64 -1, i1 true}
+; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.2
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.5 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.5
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 5, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
+; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: [[TMP4:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP5:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TMP4]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.6 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK: .split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.6
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK: 6:
+; CHECK-NEXT: ret void
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK: 6:
+; CHECK-NEXT: ret void
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT: ret void
+;