Value *Start, Value *Stop, Value *Step,
bool IsSigned, bool InclusiveStop);
+ /// Modifies the canonical loop to be a statically-scheduled workshare loop.
+ ///
+ /// This takes a canonical loop described by \p CLI, such as the one
+ /// created by \c createCanonicalLoop, and emits additional instructions to
+ /// turn it into a workshare loop. In particular, it emits a call to an
+ /// OpenMP runtime function in the preheader to obtain the loop bounds to be
+ /// used in the current thread, updates the relevant instructions in the
+ /// canonical loop and emits a call to an OpenMP runtime finalization
+ /// function after the loop.
+ ///
+ /// \param Loc The source location description, the insertion location
+ /// is not used.
+ /// \param CLI A descriptor of the canonical loop to workshare.
+ /// \param AllocaIP An insertion point for Alloca instructions usable in the
+ /// preheader of the loop.
+ /// \param NeedsBarrier Indicates whether a barrier must be inserted after
+ /// the loop.
+ /// \param Chunk The size of loop chunk considered as a unit when
+ /// scheduling. If \p nullptr, defaults to 1.
+ ///
+ /// \returns Updated CanonicalLoopInfo, or nullptr if updating the builder
+ /// to \p Loc fails.
+ CanonicalLoopInfo *createStaticWorkshareLoop(const LocationDescription &Loc,
+ CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP,
+ bool NeedsBarrier,
+ Value *Chunk = nullptr);
+
/// Generator for '#omp flush'
///
/// \param Loc The location where the flush directive was encountered
/// | Cond---\
/// | | |
/// | Body |
-/// | | |
+/// | | | |
+/// | <...> |
+/// | | | |
/// \--Latch |
/// |
/// Exit
/// After
///
/// Code in the header, condition block, latch and exit block must not have any
-/// side-effect.
+/// side-effect. The body block is the single entry point into the loop body,
+/// which may contain arbitrary control flow as long as all control paths
+/// eventually branch to the latch block.
///
/// Defined outside OpenMPIRBuilder because one cannot forward-declare nested
/// classes.
/// statements/cancellations).
BasicBlock *getAfter() const { return After; }
- /// Returns the llvm::Value containing the number of loop iterations. I must
+ /// Returns the llvm::Value containing the number of loop iterations. It must
/// be valid in the preheader and always interpreted as an unsigned integer of
/// any bit-width.
Value *getTripCount() const {
return createCanonicalLoop(Builder.saveIP(), BodyGen, TripCount);
}
+// Selects the OpenMP runtime entry point used to initialize the bounds of a
+// statically scheduled loop, based on the bit width of the integer type `Ty`.
+// The runtime only provides i32 and i64 variants; any other width is
+// unreachable. The unsigned flavors are requested because integers are always
+// interpreted as unsigned, similarly to CanonicalLoopInfo.
+static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
+                                                  OpenMPIRBuilder &OMPBuilder) {
+  switch (Ty->getIntegerBitWidth()) {
+  case 32:
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
+  case 64:
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
+  default:
+    llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+  }
+}
+
+// Sets the number of loop iterations to the given value. This value must be
+// valid in the condition block (i.e., defined in the preheader) and is
+// interpreted as an unsigned integer.
+//
+// The first instruction of the condition block is expected to be the
+// comparison of the induction variable with the trip count; its second
+// operand is replaced by `TripCount`. This is a file-local helper and
+// therefore has internal linkage.
+static void setCanonicalLoopTripCount(CanonicalLoopInfo *CLI,
+                                      Value *TripCount) {
+  Instruction *CmpI = &CLI->getCond()->front();
+  assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
+  CmpI->setOperand(1, TripCount);
+  CLI->assertOK();
+}
+
+// Turns the canonical loop described by `CLI` into a statically scheduled
+// workshare loop:
+//  - allocates storage for the loop bounds at `AllocaIP`,
+//  - calls the runtime "init" function at the end of the preheader and uses
+//    the bounds it produces to set the per-thread trip count,
+//  - offsets the uses of the induction variable in the body by the lower
+//    bound returned by the runtime,
+//  - calls the runtime "fini" function in the exit block, followed by a
+//    barrier when `NeedsBarrier` is set.
+// `Chunk` defaults to 1 when null. Returns `CLI`, or nullptr when
+// updateToLocation(Loc) fails.
+CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
+    const LocationDescription &Loc, CanonicalLoopInfo *CLI,
+    InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
+  // Set up the source location value for OpenMP runtime.
+  if (!updateToLocation(Loc))
+    return nullptr;
+
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  Value *SrcLoc = getOrCreateIdent(SrcLocStr);
+
+  // Declare useful OpenMP runtime functions.
+  Value *IV = CLI->getIndVar();
+  Type *IVTy = IV->getType();
+  FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
+  FunctionCallee StaticFini =
+      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
+
+  // Allocate space for computed loop bounds as expected by the "init" function.
+  Builder.restoreIP(AllocaIP);
+  Type *I32Type = Type::getInt32Ty(M.getContext());
+  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
+  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
+  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
+  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
+
+  // At the end of the preheader, prepare for calling the "init" function by
+  // storing the current loop bounds into the allocated space. A canonical loop
+  // always iterates from 0 to trip-count with step 1. Note that "init" expects
+  // and produces an inclusive upper bound.
+  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
+  Constant *Zero = ConstantInt::get(IVTy, 0);
+  Constant *One = ConstantInt::get(IVTy, 1);
+  Builder.CreateStore(Zero, PLowerBound);
+  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
+  Builder.CreateStore(UpperBound, PUpperBound);
+  Builder.CreateStore(One, PStride);
+
+  if (!Chunk)
+    Chunk = One;
+
+  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
+
+  // TODO: extract scheduling type and map it to OMP constant. This is currently
+  // happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
+  constexpr int StaticSchedType = 34;
+  Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType);
+
+  // Call the "init" function and update the trip count of the loop with the
+  // value it produced. The bounds are loaded back with an explicit element
+  // type so that the loads do not depend on the pointee type of the allocas.
+  Builder.CreateCall(StaticInit,
+                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
+                      PUpperBound, PStride, One, Chunk});
+  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
+  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
+  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
+  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
+  setCanonicalLoopTripCount(CLI, TripCount);
+
+  // Update all uses of the induction variable except the one in the condition
+  // block that compares it with the actual upper bound, and the increment in
+  // the latch block.
+  // TODO: this can eventually move to CanonicalLoopInfo or to a new
+  // CanonicalLoopInfoUpdater interface.
+  Builder.SetInsertPoint(CLI->getBody(), CLI->getBody()->getFirstInsertionPt());
+  Value *UpdatedIV = Builder.CreateAdd(IV, LowerBound);
+  IV->replaceUsesWithIf(UpdatedIV, [&](Use &U) {
+    auto *Instr = dyn_cast<Instruction>(U.getUser());
+    return !Instr ||
+           (Instr->getParent() != CLI->getCond() &&
+            Instr->getParent() != CLI->getLatch() && Instr != UpdatedIV);
+  });
+
+  // In the "exit" block, call the "fini" function.
+  Builder.SetInsertPoint(CLI->getExit(),
+                         CLI->getExit()->getTerminator()->getIterator());
+  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
+
+  // Add the barrier if requested. The barrier has to follow the "fini" call in
+  // the exit block, so describe its location with the builder's current
+  // insertion point (keeping the debug location of Loc) rather than passing
+  // Loc itself, whose insertion point is documented as unused here.
+  if (NeedsBarrier)
+    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
+                  /* CheckCancelFlag */ false);
+
+  CLI->assertOK();
+  return CLI;
+}
+
void CanonicalLoopInfo::eraseFromParent() {
assert(IsValid && "can only erase previously valid loop cfg");
IsValid = false;
EXPECT_FALSE(verifyModule(*M, &errs()));
}
+// Builds a canonical loop over [10, 52) with step 2, converts it into a
+// statically scheduled workshare loop and checks the emitted allocas, the
+// bounds stored before the runtime "init" call, the lower-bound offset applied
+// to the induction variable and the recomputed trip count.
+TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  IRBuilder<> Builder(BB);
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  Type *LCTy = Type::getInt32Ty(Ctx);
+  Value *StartVal = ConstantInt::get(LCTy, 10);
+  Value *StopVal = ConstantInt::get(LCTy, 52);
+  Value *StepVal = ConstantInt::get(LCTy, 2);
+  auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {};
+
+  CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
+      Loc, LoopBodyGen, StartVal, StopVal, StepVal,
+      /*IsSigned=*/false, /*InclusiveStop=*/false);
+
+  Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+  InsertPointTy AllocaIP = Builder.saveIP();
+
+  CLI = OMPBuilder.createStaticWorkshareLoop(Loc, CLI, AllocaIP,
+                                             /*NeedsBarrier=*/true);
+  // The four bound allocas are expected at the very beginning of BB, in the
+  // order in which they are created.
+  auto AllocaIter = BB->begin();
+  ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
+  AllocaInst *PLastIter = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PLowerBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PUpperBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PStride = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  EXPECT_NE(PLastIter, nullptr);
+  EXPECT_NE(PLowerBound, nullptr);
+  EXPECT_NE(PUpperBound, nullptr);
+  EXPECT_NE(PStride, nullptr);
+
+  // The preheader starts with the stores of the original bounds and stride.
+  auto PreheaderIter = CLI->getPreheader()->begin();
+  ASSERT_GE(
+      std::distance(CLI->getPreheader()->begin(), CLI->getPreheader()->end()),
+      7);
+  StoreInst *LowerBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *UpperBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *StrideStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  ASSERT_NE(LowerBoundStore, nullptr);
+  ASSERT_NE(UpperBoundStore, nullptr);
+  ASSERT_NE(StrideStore, nullptr);
+
+  auto *OrigLowerBound =
+      dyn_cast<ConstantInt>(LowerBoundStore->getValueOperand());
+  auto *OrigUpperBound =
+      dyn_cast<ConstantInt>(UpperBoundStore->getValueOperand());
+  auto *OrigStride = dyn_cast<ConstantInt>(StrideStore->getValueOperand());
+  ASSERT_NE(OrigLowerBound, nullptr);
+  ASSERT_NE(OrigUpperBound, nullptr);
+  ASSERT_NE(OrigStride, nullptr);
+  EXPECT_EQ(OrigLowerBound->getValue(), 0);
+  EXPECT_EQ(OrigUpperBound->getValue(), 20);
+  EXPECT_EQ(OrigStride->getValue(), 1);
+
+  // Check that the loop IV is updated to account for the lower bound returned
+  // by the OpenMP runtime call.
+  BinaryOperator *Add = dyn_cast<BinaryOperator>(&CLI->getBody()->front());
+  // Stop here if the body does not start with a binary operator; the checks
+  // below dereference Add.
+  ASSERT_NE(Add, nullptr);
+  EXPECT_EQ(Add->getOperand(0), CLI->getIndVar());
+  auto *LoadedLowerBound = dyn_cast<LoadInst>(Add->getOperand(1));
+  ASSERT_NE(LoadedLowerBound, nullptr);
+  EXPECT_EQ(LoadedLowerBound->getPointerOperand(), PLowerBound);
+
+  // Check that the trip count is updated to account for the lower and upper
+  // bounds returned by the OpenMP runtime call.
+  auto *AddOne = dyn_cast<Instruction>(CLI->getTripCount());
+  ASSERT_NE(AddOne, nullptr);
+  ASSERT_TRUE(AddOne->isBinaryOp());
+  auto *One = dyn_cast<ConstantInt>(AddOne->getOperand(1));
+  ASSERT_NE(One, nullptr);
+  EXPECT_EQ(One->getValue(), 1);
+  auto *Difference = dyn_cast<Instruction>(AddOne->getOperand(0));
+  ASSERT_NE(Difference, nullptr);
+  ASSERT_TRUE(Difference->isBinaryOp());
+  EXPECT_EQ(Difference->getOperand(1), LoadedLowerBound);
+  auto *LoadedUpperBound = dyn_cast<LoadInst>(Difference->getOperand(0));
+  ASSERT_NE(LoadedUpperBound, nullptr);
+  EXPECT_EQ(LoadedUpperBound->getPointerOperand(), PUpperBound);
+
+  // The original loop iterator should only be used in the condition, in the
+  // increment and in the statement that adds the lower bound to it.
+  Value *IV = CLI->getIndVar();
+  EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3);
+}
+
TEST_F(OpenMPIRBuilderTest, MasterDirective) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);