unsigned Generation = 0;
int MatchingId = -1;
bool IsAtomic = false;
+
+ // TODO: Remove this flag. It would be strictly stronger to add a record
+ // to the AvailableInvariants table when passing the invariant load
+ // instead.
bool IsInvariant = false;
LoadValue() = default;
LoadMapAllocator>;
LoadHTType AvailableLoads;
+
+ // A scoped hash table mapping memory locations (represented as typed
+ // addresses) to generation numbers at which that memory location became
+ // (henceforth indefinitely) invariant.
+ using InvariantMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<MemoryLocation, unsigned>>;
+ using InvariantHTType =
+ ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+ InvariantMapAllocator>;
+ InvariantHTType AvailableInvariants;
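+
+ // Scoping sketch (illustrative, names Loc/Gen are placeholders): facts
+ // recorded while visiting a dominator tree node are popped when the walk
+ // leaves that node, e.g.
+ //   { InvariantHTType::ScopeTy S(AvailableInvariants);
+ //     AvailableInvariants.insert(Loc, Gen); // visible below this point
+ //   }                                       // popped here
+ // so an invariant fact recorded in a basic block is visible exactly in
+ // the blocks that block dominates.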
/// \brief A scoped hash table of the current values of read-only calls.
class NodeScope {
public:
NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls)
- : Scope(AvailableValues), LoadScope(AvailableLoads),
- CallScope(AvailableCalls) {}
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
NodeScope(const NodeScope &) = delete;
NodeScope &operator=(const NodeScope &) = delete;
private:
ScopedHTType::ScopeTy Scope;
LoadHTType::ScopeTy LoadScope;
+ InvariantHTType::ScopeTy InvariantScope;
CallHTType::ScopeTy CallScope;
};
class StackNode {
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
- DomTreeNode::iterator child, DomTreeNode::iterator end)
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+ unsigned cg, DomTreeNode *n, DomTreeNode::iterator child,
+ DomTreeNode::iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
- EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+ EndIter(end),
+ Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls)
{}
StackNode(const StackNode &) = delete;
StackNode &operator=(const StackNode &) = delete;
ExpectedType);
}
+ /// Return true if the instruction is known to only operate on memory
+ /// provably invariant in the given "generation".
+ bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
+
bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
Instruction *EarlierInst, Instruction *LaterInst);
return MSSA->dominates(LaterDef, EarlierMA);
}
+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+ // A location loaded from with an invariant_load is assumed to *never* change
+ // within the visible scope of the compilation.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ auto MemLocOpt = MemoryLocation::getOrNone(I);
+ if (!MemLocOpt)
+ // "Target" intrinsic forms of loads aren't currently known to
+ // MemoryLocation::getOrNone. TODO: handle these as well.
+ return false;
+ MemoryLocation MemLoc = *MemLocOpt;
+ if (!AvailableInvariants.count(MemLoc))
+ return false;
+
+ // Did this memory location become invariant at or before the generation
+ // we are asking about?
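+ // For example: if invariant.start for %p was seen at generation 1 and a
+ // load of %p was recorded at generation 2 (GenAt == 2), the recorded value
+ // may be reused at any later generation, since nothing may legally write
+ // %p once the region is invariant (1 <= 2).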
+ return AvailableInvariants.lookup(MemLoc) <= GenAt;
+}
+
bool EarlyCSE::processNode(DomTreeNode *Node) {
bool Changed = false;
BasicBlock *BB = Node->getBlock();
continue;
}
- // Skip invariant.start intrinsics since they only read memory, and we can
- // forward values across it. Also, we dont need to consume the last store
- // since the semantics of invariant.start allow us to perform DSE of the
- // last store, if there was a store following invariant.start. Consider:
+ // We can skip all invariant.start intrinsics since they only read
+ // memory, and we can forward values across them. For an invariant.start
+ // without a matching invariant.end, the invariantness never ends, so we
+ // can start a scope in the current generation that remains valid for all
+ // future generations. Also, we don't need to consume the last store,
+ // since the semantics of invariant.start allow us to perform DSE of the
+ // last store if there was a store following invariant.start. Consider:
//
// store 30, i8* p
// invariant.start(p)
// store 40, i8* p
// We can DSE the store of 30, since the store of 40 to the invariant
// location p causes undefined behaviour.
- if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+ if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+ // If the returned token has any uses, a matching invariant.end may
+ // terminate the region, so conservatively don't start a scope.
+ if (!Inst->use_empty())
+ continue;
+ auto *CI = cast<CallInst>(Inst);
+ MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI);
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
continue;
+ }
if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
if (auto *CondI =
!MemInst.isVolatile() && MemInst.isUnordered() &&
// We can't replace an atomic load with one which isn't also atomic.
InVal.IsAtomic >= MemInst.isAtomic() &&
- (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+ (InVal.IsInvariant ||
+ isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
isSameMemGeneration(InVal.Generation, CurrentGeneration,
InVal.DefInst, Inst))) {
Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing stores with ordering of any kind.
!MemInst.isVolatile() && MemInst.isUnordered() &&
- isSameMemGeneration(InVal.Generation, CurrentGeneration,
- InVal.DefInst, Inst)) {
+ (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst))) {
// It is okay to have a LastStore to a different pointer here if MemorySSA
// tells us that the load and store are from the same memory generation.
// In that case, LastStore should keep its present value since we're
// Process the root node.
nodesToProcess.push_back(new StackNode(
- AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
- DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+ AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+ CurrentGeneration, DT.getRootNode(),
+ DT.getRootNode()->begin(), DT.getRootNode()->end()));
// Save the current generation.
unsigned LiveOutGeneration = CurrentGeneration;
// Push the next child onto the stack.
DomTreeNode *child = NodeToProcess->nextChild();
nodesToProcess.push_back(
- new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
- NodeToProcess->childGeneration(), child, child->begin(),
- child->end()));
+ new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls, NodeToProcess->childGeneration(),
+ child, child->begin(), child->end()));
} else {
// It has been processed, and there are no more children to process,
// so delete it and pop it off the stack.
; Check that we do load-load forwarding over invariant.start, since it does not
; clobber memory
-define i8 @test1(i8 *%P) {
- ; CHECK-LABEL: @test1(
+define i8 @test_bypass1(i8 *%P) {
+ ; CHECK-LABEL: @test_bypass1(
; CHECK-NEXT: %V1 = load i8, i8* %P
; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
; CHECK-NEXT: ret i8 0
-
%V1 = load i8, i8* %P
%i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
%V2 = load i8, i8* %P
; Trivial Store->load forwarding over invariant.start
-define i8 @test2(i8 *%P) {
- ; CHECK-LABEL: @test2(
+define i8 @test_bypass2(i8 *%P) {
+ ; CHECK-LABEL: @test_bypass2(
; CHECK-NEXT: store i8 42, i8* %P
; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
; CHECK-NEXT: ret i8 42
-
store i8 42, i8* %P
%i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
%V1 = load i8, i8* %P
; We can DSE over invariant.start calls, since the first store to
; %P is valid, and the second store writes to an invariant location, which
; is undefined behaviour, making it effectively unreachable.
-define void @test3(i8* %P) {
-
-; CHECK-LABEL: @test3(
+define void @test_bypass3(i8* %P) {
+; CHECK-LABEL: @test_bypass3(
; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
; CHECK-NEXT: store i8 60, i8* %P
-
store i8 50, i8* %P
%i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
store i8 60, i8* %P
; FIXME: Now the first store can actually be eliminated, since there is no read within
; the invariant region, between start and end.
-define void @test4(i8* %P) {
-; CHECK-LABEL: @test4(
+define void @test_bypass4(i8* %P) {
+; CHECK-LABEL: @test_bypass4(
; CHECK-NEXT: store i8 50, i8* %P
; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
; CHECK-NEXT: call void @llvm.invariant.end.p0i8({}* %i, i64 1, i8* %P)
store i8 60, i8* %P
ret void
}
+
+
+declare void @clobber()
+declare {}* @llvm.invariant.start.p0i32(i64 %size, i32* nocapture %ptr)
+declare void @llvm.invariant.end.p0i32({}*, i64, i32* nocapture) nounwind
+
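+; Both loads happen inside the invariant region, so the second load can be
+; forwarded from the first even across the clobbering call.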
+define i32 @test_before_load(i32* %p) {
+; CHECK-LABEL: @test_before_load
+; CHECK: ret i32 0
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
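+; The first load executes at the same generation in which the invariant
+; region begins, so it remains available to the load after the clobber.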
+define i32 @test_before_clobber(i32* %p) {
+; CHECK-LABEL: @test_before_clobber
+; CHECK: ret i32 0
+ %v1 = load i32, i32* %p
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
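+; Even when the first available load is itself separated from the
+; invariant.start by a clobber, both loads are within the invariant region
+; and must observe the same value.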
+define i32 @test_unanalyzable_load(i32* %p) {
+; CHECK-LABEL: @test_unanalyzable_load
+; CHECK: ret i32 0
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
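+; Negative test: the first load predates the invariant region, and a clobber
+; intervenes before the region begins, so the two loads may differ.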
+define i32 @test_negative_after_clobber(i32* %p) {
+; CHECK-LABEL: @test_negative_after_clobber
+; CHECK: ret i32 %sub
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
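+; The invariant region begins in the entry block, which dominates the merge
+; block, so the scope is still available after the control-flow merge.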
+define i32 @test_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge
+; CHECK: ret i32 0
+ %v1 = load i32, i32* %p
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ br i1 %cnd, label %merge, label %taken
+
+taken:
+ call void @clobber()
+ br label %merge
+merge:
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
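+; Negative test: the invariant region only begins in the merge block, after
+; the clobber on the taken path may already have run.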
+define i32 @test_negative_after_mergeclobber(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_negative_after_mergeclobber
+; CHECK: ret i32 %sub
+ %v1 = load i32, i32* %p
+ br i1 %cnd, label %merge, label %taken
+
+taken:
+ call void @clobber()
+ br label %merge
+merge:
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
+; In theory, this version could work, but EarlyCSE is incapable of
+; merging facts along distinct paths.
+define i32 @test_false_negative_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_false_negative_merge
+; CHECK: ret i32 %sub
+ %v1 = load i32, i32* %p
+ br i1 %cnd, label %merge, label %taken
+
+taken:
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ br label %merge
+merge:
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
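+; As in test_unanalyzable_load, but with a control-flow merge inside the
+; invariant region; the scope started in the dominating entry block still
+; applies.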
+define i32 @test_merge_unanalyzable_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge_unanalyzable_load
+; CHECK: ret i32 0
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ %v1 = load i32, i32* %p
+ br i1 %cnd, label %merge, label %taken
+
+taken:
+ call void @clobber()
+ br label %merge
+merge:
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
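+; The store writes back a value which provably still resides at the
+; invariant location, so it is dead and can be removed.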
+define void @test_dse_before_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_before_load
+; CHECK-NOT: store
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ store i32 %v1, i32* %p
+ ret void
+}
+
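+; Same as above, but the available load is recorded at the same generation
+; in which the invariant region begins, which is still sufficient.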
+define void @test_dse_after_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_after_load
+; CHECK-NOT: store
+ %v1 = load i32, i32* %p
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ store i32 %v1, i32* %p
+ ret void
+}
+
+
+; In this case, we have a false negative since MemoryLocation is implicitly
+; typed due to the use of a Value to represent the address. Note that other
+; passes will canonicalize away the bitcasts in this example.
+define i32 @test_false_negative_types(i32* %p) {
+; CHECK-LABEL: @test_false_negative_types
+; CHECK: ret i32 %sub
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %pf = bitcast i32* %p to float*
+ %v2f = load float, float* %pf
+ %v2 = bitcast float %v2f to i32
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
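+; Negative test: the 3-byte invariant region does not cover the 4-byte load,
+; so the recorded memory location does not match.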
+define i32 @test_negative_size1(i32* %p) {
+; CHECK-LABEL: @test_negative_size1
+; CHECK: ret i32 %sub
+ call {}* @llvm.invariant.start.p0i32(i64 3, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
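+; Negative test: a zero-sized invariant region covers nothing.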
+define i32 @test_negative_size2(i32* %p) {
+; CHECK-LABEL: @test_negative_size2
+; CHECK: ret i32 %sub
+ call {}* @llvm.invariant.start.p0i32(i64 0, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
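+; Negative test: the invariant.start token is used by an invariant.end, so
+; the region may end and no scope is started.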
+define i32 @test_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_negative_scope
+; CHECK: ret i32 %sub
+ %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
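+; Here the region provably covers both loads, but we conservatively refuse
+; to start a scope whenever the token has uses.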
+define i32 @test_false_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_false_negative_scope
+; CHECK: ret i32 %sub
+ %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
+; An invariant load de facto starts an invariant.start scope of the
+; appropriate size.
+define i32 @test_invariant_load_scope(i32* %p) {
+; CHECK-LABEL: @test_invariant_load_scope
+; CHECK: ret i32 0
+ %v1 = load i32, i32* %p, !invariant.load !{}
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}