AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
cl::Hidden, cl::init(true));
+static cl::opt<bool> RematDerivedAtUses("rs4gc-remat-derived-at-uses",
+ cl::Hidden, cl::init(true)); // On by default; when false, rematerializeLiveValuesAtUses() returns without doing anything.
+
/// The IR fed into RewriteStatepointsForGC may have had attributes and
/// metadata implying dereferenceability that are no longer valid/correct after
/// RewriteStatepointsForGC has run. This is because semantically, after
}
}
+// Try to rematerialize derived pointers immediately before their uses
+// (instead of rematerializing after every statepoint they are live through).
+// This can be beneficial when a derived pointer is live across many
+// statepoints, but uses are rare. Does nothing if RematDerivedAtUses is false.
+static void rematerializeLiveValuesAtUses(
+ RematCandTy &RematerizationCandidates, // in/out: candidates rewritten here are erased from it
+ MutableArrayRef<PartiallyConstructedSafepointRecord> Records, // in/out: rewritten candidates are removed from each LiveSet
+ PointerToBaseTy &PointerToBase) { // in/out: gains an entry for every rematerialized chain
+ if (!RematDerivedAtUses)
+ return;
+
+ SmallVector<Instruction *, 32> LiveValuesToBeDeleted; // Candidates whose uses were all rewritten below.
+
+ LLVM_DEBUG(dbgs() << "Rematerialize derived pointers at uses, "
+ << "Num statepoints: " << Records.size() << '\n');
+
+ for (auto &It : RematerizationCandidates) {
+ Instruction *Cand = cast<Instruction>(It.first);
+ auto &Record = It.second;
+
+ if (Record.Cost >= RematerializationThreshold)
+ continue; // Too expensive to recompute.
+
+ if (Cand->user_empty())
+ continue; // Nothing to rewrite.
+
+ if (Cand->hasOneUse())
+ if (auto *U = dyn_cast<Instruction>(Cand->getUniqueUndroppableUser()))
+ if (U->getParent() == Cand->getParent())
+ continue; // The sole user is in Cand's own basic block; leave it alone.
+
+ // Rematerialization before PHI nodes is not implemented, so skip any
+ if (llvm::any_of(Cand->users(),
+ [](const auto *U) { return isa<PHINode>(U); }))
+ continue; // candidate that has at least one PHI user.
+
+ LLVM_DEBUG(dbgs() << "Trying cand " << *Cand << " ... ");
+
+ // Count of rematerialization instructions we introduce is equal to number
+ // of candidate uses.
+ // Count of rematerialization instructions we eliminate is equal to number
+ // of statepoints it is live through.
+ // Consider transformation profitable if latter is greater than former
+ // (in other words, we create fewer instructions than we eliminate).
+ unsigned NumLiveStatepoints = llvm::count_if(
+ Records, [Cand](const auto &R) { return R.LiveSet.contains(Cand); });
+ unsigned NumUses = Cand->getNumUses();
+
+ LLVM_DEBUG(dbgs() << "Num uses: " << NumUses << " Num live statepoints: "
+ << NumLiveStatepoints << " ");
+
+ if (NumLiveStatepoints < NumUses) {
+ LLVM_DEBUG(dbgs() << "not profitable\n");
+ continue;
+ }
+
+ // On a tie, rematerialize at uses only if rematerialization is 'free'
+ // (Cost == 0), as it generally shortens live ranges.
+ // TODO: Short (size ==1) chains only?
+ if (NumLiveStatepoints == NumUses && Record.Cost > 0) {
+ LLVM_DEBUG(dbgs() << "not profitable\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "looks profitable\n");
+
+ // ChainToBase may contain another remat candidate (as a sub chain) which
+ // has been rewritten by now. Need to recollect chain to have up to date
+ // value.
+ // TODO: sort records in findRematerializationCandidates() in
+ // decreasing chain size order?
+ if (Record.ChainToBase.size() > 1) {
+ Record.ChainToBase.clear();
+ findRematerializableChainToBasePointer(Record.ChainToBase, Cand);
+ }
+
+ // Current rematerialization algorithm is very simple: we rematerialize
+ // immediately before EVERY use, even if there are several uses in same
+ // block or if use is local to Cand Def. The reason is that this allows
+ // us to avoid recomputing liveness without complicated analysis:
+ // - If we did not eliminate all uses of original Candidate, we do not
+ // know exactly in what BBs it is still live.
+ // - If we rematerialize once per BB, we need to find proper insertion
+ // place (first use in block, but after Def) and analyze if there is
+ // statepoint between uses in the block.
+ while (!Cand->user_empty()) { // Each iteration rewrites one user, so the user list drains.
+ Instruction *UserI = cast<Instruction>(*Cand->user_begin());
+ Instruction *RematChain = rematerializeChain(
+ Record.ChainToBase, UserI, Record.RootOfChain, PointerToBase[Cand]);
+ UserI->replaceUsesOfWith(Cand, RematChain);
+ PointerToBase[RematChain] = PointerToBase[Cand]; // The new chain has the same base as Cand.
+ }
+ LiveValuesToBeDeleted.push_back(Cand);
+ }
+
+ LLVM_DEBUG(dbgs() << "Rematerialized " << LiveValuesToBeDeleted.size()
+ << " derived pointers\n");
+ for (auto *Cand : LiveValuesToBeDeleted) { // Bookkeeping is done here, after the loop over RematerizationCandidates has finished.
+ assert(Cand->use_empty() && "Unexpected user remain");
+ RematerizationCandidates.erase(Cand);
+ for (auto &R : Records) {
+ assert(!R.LiveSet.contains(Cand) ||
+ R.LiveSet.contains(PointerToBase[Cand])); // Wherever Cand was live, its base must be live too.
+ R.LiveSet.remove(Cand);
+ }
+ }
+
+ // Recollect chains of the candidates that were not rematerialized here -
+ // we might have rewritten their sub-chains.
+ if (!LiveValuesToBeDeleted.empty()) {
+ for (auto &P : RematerizationCandidates) {
+ auto &R = P.second;
+ if (R.ChainToBase.size() > 1) {
+ R.ChainToBase.clear();
+ findRematerializableChainToBasePointer(R.ChainToBase, P.first);
+ }
+ }
+ }
+}
+
// From the statepoint live set pick values that are cheaper to recompute than
// to relocate. Remove these values from the live set, rematerialize them after
// statepoint and record them in "Info" structure. Note that similar to
// In order to reduce live set of statepoint we might choose to rematerialize
// some values instead of relocating them. This is purely an optimization and
// does not influence correctness.
+ // First try rematerialization at uses, then after statepoints.
+ rematerializeLiveValuesAtUses(RematerizationCandidates, Records,
+ PointerToBase);
for (size_t i = 0; i < Records.size(); i++)
rematerializeLiveValues(ToUpdate[i], Records[i], PointerToBase,
RematerizationCandidates, TTI);
; CHECK-NEXT: [[DERIVED:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE:%.*]], i32 16
; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE]]) ]
; CHECK-NEXT: [[BASE_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 0, i32 0)
-; CHECK-NEXT: [[DERIVED_REMAT:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED]], i32 16
; CHECK-NEXT: br i1 [[COND:%.*]], label [[HERE:%.*]], label [[THERE:%.*]]
; CHECK: here:
-; CHECK-NEXT: [[STATEPOINT_TOKEN3:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ]
-; CHECK-NEXT: [[BASE_RELOCATED4:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN3]], i32 0, i32 0)
-; CHECK-NEXT: [[DERIVED_REMAT1:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED4]], i32 16
+; CHECK-NEXT: [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ]
+; CHECK-NEXT: [[BASE_RELOCATED2:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN1]], i32 0, i32 0)
; CHECK-NEXT: br label [[MERGE:%.*]]
; CHECK: there:
-; CHECK-NEXT: [[STATEPOINT_TOKEN5:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ]
-; CHECK-NEXT: [[BASE_RELOCATED6:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN5]], i32 0, i32 0)
-; CHECK-NEXT: [[DERIVED_REMAT2:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED6]], i32 16
+; CHECK-NEXT: [[STATEPOINT_TOKEN3:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ]
+; CHECK-NEXT: [[BASE_RELOCATED4:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN3]], i32 0, i32 0)
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
-; CHECK-NEXT: [[DOT0:%.*]] = phi ptr addrspace(1) [ [[DERIVED_REMAT1]], [[HERE]] ], [ [[DERIVED_REMAT2]], [[THERE]] ]
-; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(1) [[DOT0]], align 4
+; CHECK-NEXT: [[DOT0:%.*]] = phi ptr addrspace(1) [ [[BASE_RELOCATED2]], [[HERE]] ], [ [[BASE_RELOCATED4]], [[THERE]] ]
+; CHECK-NEXT: [[DERIVED_REMAT:%.*]] = getelementptr i32, ptr addrspace(1) [[DOT0]], i32 16
+; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(1) [[DERIVED_REMAT]], align 4
; CHECK-NEXT: ret i32 [[RET]]
;
entry:
; CHECK-NEXT: [[DERIVED:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE:%.*]], i32 16
; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE]]) ]
; CHECK-NEXT: [[BASE_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 0, i32 0)
-; CHECK-NEXT: [[DERIVED_REMAT:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED]], i32 16
; CHECK-NEXT: br i1 [[COND:%.*]], label [[HERE:%.*]], label [[THERE:%.*]]
; CHECK: here:
-; CHECK-NEXT: call void @use_obj(ptr addrspace(1) [[DERIVED_REMAT]])
+; CHECK-NEXT: [[DERIVED_REMAT3:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED]], i32 16
+; CHECK-NEXT: call void @use_obj(ptr addrspace(1) [[DERIVED_REMAT3]])
; CHECK-NEXT: [[STATEPOINT_TOKEN4:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ]
; CHECK-NEXT: [[BASE_RELOCATED5:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN4]], i32 0, i32 0)
+; CHECK-NEXT: [[DERIVED_REMAT2:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED5]], i32 16
+; CHECK-NEXT: call void @use_obj(ptr addrspace(1) [[DERIVED_REMAT2]])
; CHECK-NEXT: [[DERIVED_REMAT1:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED5]], i32 16
-; CHECK-NEXT: call void @use_obj(ptr addrspace(1) [[DERIVED_REMAT1]])
; CHECK-NEXT: [[DUMMY:%.*]] = load i32, ptr addrspace(1) [[DERIVED_REMAT1]], align 4
; CHECK-NEXT: [[STATEPOINT_TOKEN6:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED5]]) ]
; CHECK-NEXT: [[BASE_RELOCATED7:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN6]], i32 0, i32 0)
-; CHECK-NEXT: [[DERIVED_REMAT2:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED7]], i32 16
; CHECK-NEXT: br label [[MERGE:%.*]]
; CHECK: there:
; CHECK-NEXT: [[STATEPOINT_TOKEN8:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ]
; CHECK-NEXT: [[BASE_RELOCATED9:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN8]], i32 0, i32 0)
-; CHECK-NEXT: [[DERIVED_REMAT3:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED9]], i32 16
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
-; CHECK-NEXT: [[DOT0:%.*]] = phi ptr addrspace(1) [ [[DERIVED_REMAT2]], [[HERE]] ], [ [[DERIVED_REMAT3]], [[THERE]] ]
-; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(1) [[DOT0]], align 4
+; CHECK-NEXT: [[DOT0:%.*]] = phi ptr addrspace(1) [ [[BASE_RELOCATED7]], [[HERE]] ], [ [[BASE_RELOCATED9]], [[THERE]] ]
+; CHECK-NEXT: [[DERIVED_REMAT:%.*]] = getelementptr i32, ptr addrspace(1) [[DOT0]], i32 16
+; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(1) [[DERIVED_REMAT]], align 4
; CHECK-NEXT: ret i32 [[RET]]
;
entry:
; CHECK-NEXT: [[V0:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE:%.*]], i64 16
; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(ptr addrspace(1) [[BASE]]), "gc-live"(ptr addrspace(1) [[BASE]]) ]
; CHECK-NEXT: [[BASE_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 0, i32 0)
-; CHECK-NEXT: [[V0_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE_RELOCATED]], i64 16
; CHECK-NEXT: br i1 [[COND1:%.*]], label [[BLOCK3:%.*]], label [[COMMON_RET:%.*]]
; CHECK: block3:
+; CHECK-NEXT: [[V0_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE_RELOCATED]], i64 16
; CHECK-NEXT: [[V4:%.*]] = getelementptr i8, ptr addrspace(1) [[V0_REMAT]], i64 70
-; CHECK-NEXT: [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(ptr addrspace(1) [[BASE_RELOCATED]]), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ]
-; CHECK-NEXT: [[BASE_RELOCATED3:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN2]], i32 0, i32 0)
-; CHECK-NEXT: [[V0_REMAT1:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE_RELOCATED3]], i64 16
-; CHECK-NEXT: [[V4_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[V0_REMAT1]], i64 70
+; CHECK-NEXT: [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(ptr addrspace(1) [[BASE_RELOCATED]]), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ]
+; CHECK-NEXT: [[BASE_RELOCATED2:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN1]], i32 0, i32 0)
+; CHECK-NEXT: [[V0_REMAT_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE_RELOCATED2]], i64 16
+; CHECK-NEXT: [[V4_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[V0_REMAT_REMAT]], i64 70
; CHECK-NEXT: [[V5:%.*]] = load atomic i8, ptr addrspace(1) [[V4_REMAT]] unordered, align 2
; CHECK-NEXT: br label [[COMMON_RET]]
; CHECK: common.ret: