namespace {
// The store to load forwarding and load CSE rely on three conditions:
//
-// 1) store/load and load need to have mathematically equivalent affine access
-// functions (checked after full composition of load/store operands); this
-// implies that they access the same single memref element for all iterations of
-// the common surrounding loop,
+// 1) store/load providing a replacement value and load being replaced need to
+// have mathematically equivalent affine access functions (checked after full
+// composition of load/store operands); this implies that they access the same
+// single memref element for all iterations of the common surrounding loop,
//
// 2) the store/load op should dominate the load op,
//
-// 3) among all op's that satisfy both (1) and (2), for store to load
-// forwarding, the one that does not dominate any store op that has a
-// dependence into the load, is provably the last writer to the particular
-// memref location being loaded at the load op, and its store value can be
-// forwarded to the load; for load CSE, any op that does not dominate any store
-// op that have a dependence into the load can be forwarded and the first one
-// found is chosen. Note that the only dependences that are to be considered are
-// those that are satisfied at the block* of the innermost common surrounding
-// loop of the <store/load, load> being considered.
-//
-// (* A dependence being satisfied at a block: a dependence that is satisfied by
-// virtue of the destination operation appearing textually / lexically after
-// the source operation within the body of a 'affine.for' operation; thus, a
-// dependence is always either satisfied by a loop or by a block).
+// 3) no operation that may write to the memory read by the load being
+// replaced can occur after the execution of the operation (load or store)
+// providing the replacement value and before the load being replaced (and
+// thus potentially overwrite the memory read by the load).
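+//
+// As a hypothetical minimal illustration (%A, %v, %i are placeholder names),
+// all three conditions hold between the store and the load below, so the
+// loaded value can be replaced with %v:
+//
+//   affine.for %i = 0 to 10 {
+//     affine.store %v, %A[%i] : memref<10xf32>
+//     %x = affine.load %A[%i] : memref<10xf32>  // forwardable to %v
+//   }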
//
// The above conditions are simple to check and powerful for most cases in
// practice; they are sufficient, but not necessary, since they
: public AffineScalarReplacementBase<AffineScalarReplacement> {
void runOnFunction() override;
- LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp);
- void loadCSE(AffineReadOpInterface loadOp);
-
- // A list of memref's that are potentially dead / could be eliminated.
- SmallPtrSet<Value, 4> memrefsToErase;
- // Load ops whose results were replaced by those forwarded from stores
- // dominating stores or loads..
- SmallVector<Operation *, 8> loadOpsToErase;
+ LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp,
+ SmallVectorImpl<Operation *> &loadOpsToErase,
+ SmallPtrSetImpl<Value> &memrefsToErase,
+ DominanceInfo &domInfo);
- DominanceInfo *domInfo = nullptr;
+ void loadCSE(AffineReadOpInterface loadOp,
+ SmallVectorImpl<Operation *> &loadOpsToErase,
+ DominanceInfo &domInfo);
};
} // end anonymous namespace
return std::make_unique<AffineScalarReplacement>();
}
-// Check if the store may be reaching the load.
-static bool storeMayReachLoad(Operation *storeOp, Operation *loadOp,
- unsigned minSurroundingLoops) {
- MemRefAccess srcAccess(storeOp);
- MemRefAccess destAccess(loadOp);
- FlatAffineConstraints dependenceConstraints;
- unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *storeOp);
- unsigned d;
- // Dependences at loop depth <= minSurroundingLoops do NOT matter.
- for (d = nsLoops + 1; d > minSurroundingLoops; d--) {
- DependenceResult result = checkMemrefAccessDependence(
- srcAccess, destAccess, d, &dependenceConstraints,
- /*dependenceComponents=*/nullptr);
- if (hasDependence(result))
- break;
- }
- if (d <= minSurroundingLoops)
- return false;
+/// Returns true if all operations that may execute after `start`
+/// (noninclusive) and prior to `memOp` (e.g. on a control flow/op path
+/// between the two operations) are guaranteed not to have the memory effect
+/// `EffectType` on `memOp`. `memOp` is an operation that reads or writes to
+/// a memref. For example, if `EffectType` is MemoryEffects::Write, this method
+/// checks that there is no write to the memory between `start` and `memOp`
+/// that would change the value read by `memOp`.
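+///
+/// A minimal usage sketch (assuming `storeOp` and `loadOp` access the same
+/// memref and `storeOp` dominates `loadOp`):
+///   if (hasNoInterveningEffect<MemoryEffects::Write>(storeOp, loadOp)) {
+///     // Nothing between the two operations may clobber the loaded memory.
+///   }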
+template <typename EffectType, typename T>
+static bool hasNoInterveningEffect(Operation *start, T memOp) {
+ Value memref = memOp.getMemRef();
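+ // Whether the memref is directly produced by an alloc/alloca op; two such
+ // distinct allocations are known not to alias.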
+ bool isOriginalAllocation = memref.getDefiningOp<memref::AllocaOp>() ||
+ memref.getDefiningOp<memref::AllocOp>();
+
+ // A boolean representing whether an intervening operation could have impacted
+ // memOp.
+ bool hasSideEffect = false;
+
+ // Check whether the effect on memOp can be caused by a given operation op.
+ std::function<void(Operation *)> checkOperation = [&](Operation *op) {
+ // If the effect has alreay been found, early exit,
+ if (hasSideEffect)
+ return;
+
+ if (auto memEffect = dyn_cast<MemoryEffectOpInterface>(op)) {
+ SmallVector<MemoryEffects::EffectInstance, 1> effects;
+ memEffect.getEffects(effects);
+
+ bool opMayHaveEffect = false;
+ for (auto effect : effects) {
+ // If op causes EffectType on a potentially aliasing location for
+ // memOp, mark as having the effect.
+ if (isa<EffectType>(effect.getEffect())) {
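+ // If both the loaded memref and the value affected by `op` are
+ // distinct, directly-allocated memrefs, they cannot alias, so the
+ // effect cannot apply to `memref`.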
+ if (isOriginalAllocation && effect.getValue() &&
+ (effect.getValue().getDefiningOp<memref::AllocaOp>() ||
+ effect.getValue().getDefiningOp<memref::AllocOp>())) {
+ if (effect.getValue() != memref)
+ continue;
+ }
+ opMayHaveEffect = true;
+ break;
+ }
+ }
- return true;
+ if (!opMayHaveEffect)
+ return;
+
+ // If the side effect comes from an affine read or write, try to
+ // prove that the side-effecting `op` cannot reach `memOp`.
+ if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
+ MemRefAccess srcAccess(op);
+ MemRefAccess destAccess(memOp);
+ // Dependence analysis is only correct if both ops operate on the same
+ // memref.
+ if (srcAccess.memref == destAccess.memref) {
+ FlatAffineConstraints dependenceConstraints;
+
+ // Number of loops containing the start op and the ending operation.
+ unsigned minSurroundingLoops =
+ getNumCommonSurroundingLoops(*start, *memOp);
+
+ // Number of loops containing the operation `op` which has the
+ // potential memory side effect and can occur on a path between
+ // `start` and `memOp`.
+ unsigned nsLoops = getNumCommonSurroundingLoops(*op, *memOp);
+
+ // For ease, let's consider the case that `op` is a store and we're
+ // looking for other potential stores such as `op` that overwrite memory
+ // after `start` and before it is read in `memOp`. In this case, we only
+ // need to consider other potential stores with depth >
+ // minSurroundingLoops, since `start` would overwrite any store with a
+ // smaller number of surrounding loops before.
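+ //
+ // E.g., in the hypothetical nest below, the inner store (`op`) clobbers
+ // %A[%i] between `start` and `memOp` within the same iteration of %i,
+ // and the dependence is found by the check at d = nsLoops + 1:
+ //
+ //   affine.for %i = 0 to 10 {
+ //     affine.store %v0, %A[%i]   // `start`
+ //     affine.for %j = 0 to 10 {
+ //       affine.store %v1, %A[%i] // `op`
+ //     }
+ //     %x = affine.load %A[%i]    // `memOp`
+ //   }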
+ unsigned d;
+ for (d = nsLoops + 1; d > minSurroundingLoops; d--) {
+ DependenceResult result = checkMemrefAccessDependence(
+ srcAccess, destAccess, d, &dependenceConstraints,
+ /*dependenceComponents=*/nullptr);
+ if (hasDependence(result)) {
+ hasSideEffect = true;
+ return;
+ }
+ }
+
+ // No side effect was seen, simply return.
+ return;
+ }
+ }
+ hasSideEffect = true;
+ return;
+ }
+
+ if (op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
+ // Recurse into the regions for this op and check whether the internal
+ // operations may have the side effect `EffectType` on memOp.
+ for (Region &region : op->getRegions())
+ for (Block &block : region)
+ for (Operation &op : block)
+ checkOperation(&op);
+ return;
+ }
+
+ // Otherwise, conservatively assume generic operations have the effect
+ // on `memOp`.
+ hasSideEffect = true;
+ return;
+ };
+
+ // Check all paths from ancestor op `parent` to the operation `to` for the
+ // effect. It is known that `to` must be contained within `parent`.
+ auto until = [&](Operation *parent, Operation *to) {
+ // TODO: check only the paths from `parent` to `to`.
+ // Currently we fall back and check the entire parent op, rather than
+ // just the paths from `parent` to `to`, stopping after reaching `to`.
+ // This is conservatively correct, but could be made more aggressive.
+ assert(parent->isAncestor(to));
+ checkOperation(parent);
+ };
+
+ // Check all paths from operation `from` to operation `untilOp` for the
+ // given memory effect.
+ std::function<void(Operation *, Operation *)> recur =
+ [&](Operation *from, Operation *untilOp) {
+ assert(
+ from->getParentRegion()->isAncestor(untilOp->getParentRegion()) &&
+ "Checking for side effect between two operations without a common "
+ "ancestor");
+
+ // If the operations are in different regions, recursively consider all
+ // paths from `from` to the parent of `untilOp`, and all paths from the
+ // parent of `untilOp` to `untilOp`.
+ if (from->getParentRegion() != untilOp->getParentRegion()) {
+ recur(from, untilOp->getParentOp());
+ until(untilOp->getParentOp(), untilOp);
+ return;
+ }
+
+ // Now, assuming that `from` and `untilOp` exist in the same region,
+ // perform a CFG traversal to check all the relevant operations.
+
+ // Additional blocks to consider.
+ SmallVector<Block *, 2> todoBlocks;
+ {
+ // First consider the parent block of `from` and check all operations
+ // after `from`.
+ for (auto iter = ++from->getIterator(), end = from->getBlock()->end();
+ iter != end && &*iter != untilOp; ++iter) {
+ checkOperation(&*iter);
+ }
+
+ // If the block containing `from` doesn't also contain `untilOp`, add
+ // the block's successors to the list of blocks to check.
+ if (untilOp->getBlock() != from->getBlock())
+ for (Block *succ : from->getBlock()->getSuccessors())
+ todoBlocks.push_back(succ);
+ }
+
+ SmallPtrSet<Block *, 4> done;
+ // Traverse the CFG until hitting `untilOp`.
+ while (!todoBlocks.empty()) {
+ Block *blk = todoBlocks.pop_back_val();
+ if (done.count(blk))
+ continue;
+ done.insert(blk);
+ for (auto &op : *blk) {
+ if (&op == untilOp)
+ break;
+ checkOperation(&op);
+ if (&op == blk->getTerminator())
+ for (Block *succ : blk->getSuccessors())
+ todoBlocks.push_back(succ);
+ }
+ }
+ };
+ recur(start, memOp);
+ return !hasSideEffect;
}
-// This is a straightforward implementation not optimized for speed. Optimize
-// if needed.
-LogicalResult
-AffineScalarReplacement::forwardStoreToLoad(AffineReadOpInterface loadOp) {
- // First pass over the use list to get the minimum number of surrounding
- // loops common between the load op and the store op, with min taken across
- // all store ops.
- SmallVector<Operation *, 8> storeOps;
- unsigned minSurroundingLoops = getNestingDepth(loadOp);
+/// Attempt to eliminate loadOp by replacing it with a value stored into memory
+/// which the load is guaranteed to retrieve. This check involves three
+/// components: 1) the store and load must be to the same location, 2) the
+/// store must dominate (and therefore must always occur prior to) the load,
+/// and 3) no other operation will overwrite the memory loaded between the
+/// given load and store. If such a value exists, the replaced `loadOp` will be
+/// added to `loadOpsToErase` and its memref will be added to `memrefsToErase`.
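+///
+/// For example (a hypothetical snippet; %A, %v, %c0 are placeholder names):
+///   affine.store %v, %A[%c0] : memref<4xf32>
+///   %x = affine.load %A[%c0] : memref<4xf32>  // %x is replaced by %v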
+LogicalResult AffineScalarReplacement::forwardStoreToLoad(
+ AffineReadOpInterface loadOp, SmallVectorImpl<Operation *> &loadOpsToErase,
+ SmallPtrSetImpl<Value> &memrefsToErase, DominanceInfo &domInfo) {
+ // The store op candidate for forwarding that satisfies all conditions
+ // to replace the load, if any.
+ Operation *lastWriteStoreOp = nullptr;
+
for (auto *user : loadOp.getMemRef().getUsers()) {
auto storeOp = dyn_cast<AffineWriteOpInterface>(user);
if (!storeOp)
continue;
- unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *storeOp);
- minSurroundingLoops = std::min(nsLoops, minSurroundingLoops);
- storeOps.push_back(storeOp);
- }
-
- // The list of store op candidates for forwarding that satisfy conditions
- // (1) and (2) above - they will be filtered later when checking (3).
- SmallVector<Operation *, 8> fwdingCandidates;
-
- // Store ops that have a dependence into the load (even if they aren't
- // forwarding candidates). Each forwarding candidate will be checked for a
- // dominance on these. 'fwdingCandidates' are a subset of depSrcStores.
- SmallVector<Operation *, 8> depSrcStores;
-
- for (auto *storeOp : storeOps) {
- if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
- continue;
-
- // Stores that *may* be reaching the load.
- depSrcStores.push_back(storeOp);
+ MemRefAccess srcAccess(storeOp);
+ MemRefAccess destAccess(loadOp);
// 1. Check if the store and the load have mathematically equivalent
// affine access functions; this implies that they statically refer to the
// same single memref element. For example, this filters out cases like:
//     store %A[%M]
//     load %A[%N]
// Use the AffineValueMap difference based memref access equality checking.
- MemRefAccess srcAccess(storeOp);
- MemRefAccess destAccess(loadOp);
if (srcAccess != destAccess)
continue;
// 2. The store has to dominate the load op to be a candidate.
- if (!domInfo->dominates(storeOp, loadOp))
+ if (!domInfo.dominates(storeOp, loadOp))
+ continue;
+
+ // 3. Ensure there is no intermediate operation which could overwrite the
+ // value in memory.
+ if (!hasNoInterveningEffect<MemoryEffects::Write>(storeOp, loadOp))
continue;
// We now have a candidate for forwarding.
- fwdingCandidates.push_back(storeOp);
+ assert(lastWriteStoreOp == nullptr &&
+ "multiple simultaneous replacement stores");
+ lastWriteStoreOp = storeOp;
}
- // 3. Of all the store ops that meet the above criteria, the store op
- // that does not dominate any of the ops in 'depSrcStores' (if such exists)
- // will not have any of those latter ops on its paths to `loadOp`. It would
- // thus be the unique store providing the value to the load. This condition is
- // however conservative for eg:
- //
- // for ... {
- // store
- // load
- // store
- // load
- // }
- //
- Operation *lastWriteStoreOp = nullptr;
- for (auto *storeOp : fwdingCandidates) {
- if (llvm::all_of(depSrcStores, [&](Operation *depStore) {
- return !domInfo->properlyDominates(storeOp, depStore);
- })) {
- lastWriteStoreOp = storeOp;
- break;
- }
- }
if (!lastWriteStoreOp)
return failure();
// loadA will be replaced with loadB if:
// 1) loadA and loadB have mathematically equivalent affine access functions.
// 2) loadB dominates loadA.
-// 3) loadB does not dominate any of the store ops that have a dependence into
-// loadA.
-void AffineScalarReplacement::loadCSE(AffineReadOpInterface loadOp) {
- // The list of load op candidates for forwarding that satisfy conditions
- // (1) and (2) above - they will be filtered later when checking (3).
- SmallVector<Operation *, 8> fwdingCandidates;
- SmallVector<Operation *, 8> storeOps;
- unsigned minSurroundingLoops = getNestingDepth(loadOp);
- MemRefAccess memRefAccess(loadOp);
- // First pass over the use list to get 1) the minimum number of surrounding
- // loops common between the load op and an load op candidate, with min taken
- // across all load op candidates; 2) load op candidates; 3) store ops.
- // We take min across all load op candidates instead of all load ops to make
- // sure later dependence check is performed at loop depths that do matter.
- for (auto *user : loadOp.getMemRef().getUsers()) {
- if (auto storeOp = dyn_cast<AffineWriteOpInterface>(user)) {
- storeOps.push_back(storeOp);
- } else if (auto aLoadOp = dyn_cast<AffineReadOpInterface>(user)) {
- MemRefAccess otherMemRefAccess(aLoadOp);
- // No need to consider Load ops that have been replaced in previous store
- // to load forwarding or loadCSE. If loadA or storeA can be forwarded to
- // loadB, then loadA or storeA can be forwarded to loadC iff loadB can be
- // forwarded to loadC.
- // If loadB is visited before loadC and replace with loadA, we do not put
- // loadB in candidates list, only loadA. If loadC is visited before loadB,
- // loadC may be replaced with loadB, which will be replaced with loadA
- // later.
- if (aLoadOp != loadOp && !llvm::is_contained(loadOpsToErase, aLoadOp) &&
- memRefAccess == otherMemRefAccess &&
- domInfo->dominates(aLoadOp, loadOp)) {
- fwdingCandidates.push_back(aLoadOp);
- unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *aLoadOp);
- minSurroundingLoops = std::min(nsLoops, minSurroundingLoops);
- }
+// 3) There is no write between loadA and loadB.
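+//
+// For example (hypothetical, with no write to %A between the two loads):
+//   %1 = affine.load %A[%i] : memref<10xf32>  // loadB
+//   %2 = affine.load %A[%i] : memref<10xf32>  // loadA, replaced by %1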
+void AffineScalarReplacement::loadCSE(
+ AffineReadOpInterface loadA, SmallVectorImpl<Operation *> &loadOpsToErase,
+ DominanceInfo &domInfo) {
+ SmallVector<AffineReadOpInterface, 4> loadCandidates;
+ for (auto *user : loadA.getMemRef().getUsers()) {
+ auto loadB = dyn_cast<AffineReadOpInterface>(user);
+ if (!loadB || loadB == loadA)
+ continue;
+
+ MemRefAccess srcAccess(loadB);
+ MemRefAccess destAccess(loadA);
+
+ // 1. The accesses have to be to the same location.
+ if (srcAccess != destAccess) {
+ continue;
}
- }
- // No forwarding candidate.
- if (fwdingCandidates.empty())
- return;
+ // 2. loadB has to dominate loadA to be a candidate.
+ if (!domInfo.dominates(loadB, loadA))
+ continue;
- // Store ops that have a dependence into the load.
- SmallVector<Operation *, 8> depSrcStores;
+ // 3. There is no write between loadA and loadB.
+ if (!hasNoInterveningEffect<MemoryEffects::Write>(loadB.getOperation(),
+ loadA))
+ continue;
- for (auto *storeOp : storeOps) {
- if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
+ // Check if the two loaded values have the same type. This is needed for
+ // affine vector loads, where accesses to the same location can have
+ // different vector result types.
+ if (loadB.getValue().getType() != loadA.getValue().getType())
continue;
- // Stores that *may* be reaching the load.
- depSrcStores.push_back(storeOp);
+ loadCandidates.push_back(loadB);
}
- // 3. Of all the load op's that meet the above criteria, return the first load
- // found that does not dominate any op in 'depSrcStores' and has the same
- // shape as the load to be replaced (if one exists). The shape check is needed
- // for affine vector loads.
- Operation *firstLoadOp = nullptr;
- Value oldVal = loadOp.getValue();
- for (auto *loadOp : fwdingCandidates) {
- if (llvm::all_of(depSrcStores,
- [&](Operation *depStore) {
- return !domInfo->properlyDominates(loadOp, depStore);
- }) &&
- cast<AffineReadOpInterface>(loadOp).getValue().getType() ==
- oldVal.getType()) {
- firstLoadOp = loadOp;
+ // Of the legal load candidates, use the one that dominates all others
+ // to minimize the subsequent need for loadCSE.
+ Value loadB;
+ for (AffineReadOpInterface option : loadCandidates) {
+ if (llvm::all_of(loadCandidates, [&](AffineReadOpInterface otherLoad) {
+ return otherLoad == option ||
+ domInfo.dominates(option.getOperation(),
+ otherLoad.getOperation());
+ })) {
+ loadB = option.getValue();
break;
}
}
- if (!firstLoadOp)
- return;
- // Perform the actual load to load forwarding.
- Value loadVal = cast<AffineReadOpInterface>(firstLoadOp).getValue();
- loadOp.getValue().replaceAllUsesWith(loadVal);
- // Record this to erase later.
- loadOpsToErase.push_back(loadOp);
+ if (loadB) {
+ loadA.getValue().replaceAllUsesWith(loadB);
+ // Record this to erase later.
+ loadOpsToErase.push_back(loadA);
+ }
}
void AffineScalarReplacement::runOnFunction() {
- // Only supports single block functions at the moment.
FuncOp f = getFunction();
- if (!llvm::hasSingleElement(f)) {
- markAllAnalysesPreserved();
- return;
- }
- domInfo = &getAnalysis<DominanceInfo>();
+ // Load ops whose results were replaced by those forwarded from stores or
+ // other loads.
+ SmallVector<Operation *, 8> opsToErase;
+
+ // A list of memrefs that are potentially dead / could be eliminated.
+ SmallPtrSet<Value, 4> memrefsToErase;
- loadOpsToErase.clear();
- memrefsToErase.clear();
+ auto &domInfo = getAnalysis<DominanceInfo>();
- // Walk all load's and perform store to load forwarding and loadCSE.
+ // Walk all loads and perform store to load forwarding, falling back to
+ // load CSE when forwarding fails.
f.walk([&](AffineReadOpInterface loadOp) {
- // Do store to load forwarding first, if no success, try loadCSE.
- if (failed(forwardStoreToLoad(loadOp)))
- loadCSE(loadOp);
+ if (failed(
+ forwardStoreToLoad(loadOp, opsToErase, memrefsToErase, domInfo))) {
+ loadCSE(loadOp, opsToErase, domInfo);
+ }
});
- // Erase all load op's whose results were replaced with store or load fwd'ed
- // ones.
- for (auto *loadOp : loadOpsToErase)
- loadOp->erase();
+ // Erase all load ops whose results were replaced with store or load
+ // fwd'ed ones.
+ for (auto *op : opsToErase)
+ op->erase();
// Check if the store fwd'ed memrefs are now left with only stores and can
// thus be completely deleted. Note: the canonicalize pass should be able