Report dangling probes for frames that have real samples collected. Dangling probes are the probes associated with an empty block. When reported, the sample count on a dangling probe will not be trusted by the compiler, and we will rely on the counts inference algorithm to give the probe a reasonable count. This fixes a bug where previously only those dangling probes that themselves had samples collected were reported.
This patch also fixes two existing issues. Pseudo probes are stored in `Address2ProbesMap` and pointers to them are used in `PseudoProbeInlineTree`. Previously `std::vector` was used to store the probes, so those pointers could be invalidated whenever a vector reallocated as it grew. This patch changes `std::vector` to `std::list`, whose elements keep stable addresses as new probes are appended.
The other issue is that all outlined functions previously shared the same inline frame, because the `Index` value used as the dummy inline site identifier was left unchanged.
Good results seen for SPEC2017 in general regarding profile quality.
Reviewed By: wenlei, wlei
Differential Revision: https://reviews.llvm.org/D100235
// A list of NUM_INLINED_FUNCTIONS entries describing each of the inlined
// callees. Each record contains:
// INLINE SITE
-// GUID of the inlinee (uint64)
// ID of the callsite probe (ULEB128)
// FUNCTION BODY
// A FUNCTION BODY entry describing the inlined function.
--- /dev/null
+PERF_RECORD_MMAP2 595196/595196: [0x201000(0x1000) @ 0 00:1d 224227621 1042948]: r-xp /home/inline-cs-pseudoprobe.perfbin
+
+ 20180e
+ 5541f689495641d7
+ 0x201858/0x20180e/P/-/-/0 0x20182b/0x20184d/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0 0x20182b/0x201800/P/-/-/0
--- /dev/null
+; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-dangling-pseudoprobe.perfscript --binary=%S/Inputs/inline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
+; RUN: FileCheck %s --input-file %t
+
+; CHECK: [main:2 @ foo]:58:0
+; CHECK-NEXT: 2: 15
+; CHECK-NEXT: 3: 14
+; CHECK-NEXT: 5: 14
+; CHECK-NEXT: 6: 15
+; CHECK-NEXT: !CFGChecksum: 138950591924
+; CHECK:[main:2 @ foo:8 @ bar]:1:0
+; CHECK-NEXT: 2: 18446744073709551615
+; CHECK-NEXT: 3: 18446744073709551615
+; CHECK-NEXT: 4: 1
+; CHECK-NEXT: !CFGChecksum: 72617220756
+
+
+; CHECK-UNWINDER: Binary(inline-cs-pseudoprobe.perfbin)'s Range Counter:
+; CHECK-UNWINDER-EMPTY:
+; CHECK-UNWINDER-NEXT: (800, 82b): 14
+; CHECK-UNWINDER-NEXT: (84d, 858): 1
+
+; CHECK-UNWINDER: Binary(inline-cs-pseudoprobe.perfbin)'s Branch Counter:
+; CHECK-UNWINDER-EMPTY:
+; CHECK-UNWINDER-NEXT: (82b, 800): 14
+; CHECK-UNWINDER-NEXT: (82b, 84d): 1
+; CHECK-UNWINDER-NEXT: (858, 80e): 1
+
+; clang -O3 -fexperimental-new-pass-manager -fuse-ld=lld -fpseudo-probe-for-profiling
+; -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -Xclang -mdisable-tail-calls
+; -g test.c -o a.out
+
+#include <stdio.h>
+
+int bar(int x, int y) {
+ if (x % 3) {
+ return x - y;
+ }
+ return x + y;
+}
+
+void foo() {
+ int s, i = 0;
+ while (i++ < 4000 * 4000)
+ if (i % 91) s = bar(i, s); else s += 30;
+ printf("sum is %d\n", s);
+}
+
+int main() {
+ foo();
+ return 0;
+}
; Uses the data from recursion-compression.test; refer to it for the unmerged output
-; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=-1 --csprof-cold-thres=8
-; RUN: FileCheck %s --input-file %t
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t1 --compress-recursion=-1 --csprof-cold-thres=8
+; RUN: FileCheck %s --input-file %t1
; Test --csprof-trim-cold-context=0
-; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=-1 --csprof-cold-thres=100 --csprof-trim-cold-context=0
-; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-KEEP-COLD
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t2 --compress-recursion=-1 --csprof-cold-thres=100 --csprof-trim-cold-context=0
+; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-KEEP-COLD
; Test --csprof-merge-cold-context=0
-; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=-1 --csprof-cold-thres=10 --csprof-merge-cold-context=0
-; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNMERGED
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t3 --compress-recursion=-1 --csprof-cold-thres=10 --csprof-merge-cold-context=0
+; RUN: FileCheck %s --input-file %t3 --check-prefix=CHECK-UNMERGED
; CHECK: [fa]:14:4
; CHECK-NEXT: 1: 4
+; CHECK-NEXT: 2: 18446744073709551615
; CHECK-NEXT: 3: 4
; CHECK-NEXT: 4: 2
; CHECK-NEXT: 5: 1
; CHECK-KEEP-COLD-NEXT: !Attributes: 0
; CHECK-KEEP-COLD-NEXT:[fa]:14:4
; CHECK-KEEP-COLD-NEXT: 1: 4
+; CHECK-KEEP-COLD-NEXT: 2: 18446744073709551615
; CHECK-KEEP-COLD-NEXT: 3: 4
; CHECK-KEEP-COLD-NEXT: 4: 2
; CHECK-KEEP-COLD-NEXT: 5: 1
// Extract the top frame probes by looking up each address among the range in
// the Address2ProbesMap
extractProbesFromRange(RangeCounter, ProbeCounter, Binary);
+ std::unordered_map<PseudoProbeInlineTree *, FunctionSamples *> FrameSamples;
for (auto PI : ProbeCounter) {
const PseudoProbe *Probe = PI.first;
uint64_t Count = PI.second;
+ // Ignore dangling probes since they will be reported later if needed.
+ if (Probe->isDangling())
+ continue;
FunctionSamples &FunctionProfile =
getFunctionProfileForLeafProbe(ContextStrStack, Probe, Binary);
-
- // Use InvalidProbeCount(UINT64_MAX) to mark sample count for a dangling
- // probe. Dangling probes are the probes associated to an empty block. With
- // this place holder, sample count on dangling probe will not be trusted by
- // the compiler and it will rely on the counts inference algorithm to get
- // the probe a reasonable count.
- if (Probe->isDangling()) {
- FunctionProfile.addBodySamplesForProbe(
- Probe->Index, FunctionSamples::InvalidProbeCount);
- continue;
- }
+ // Record the current frame and FunctionProfile whenever samples are
+ // collected for non-dangling probes. This is for reporting all of the
+ // dangling probes of the frame later.
+ FrameSamples[Probe->getInlineTreeNode()] = &FunctionProfile;
FunctionProfile.addBodySamplesForProbe(Probe->Index, Count);
FunctionProfile.addTotalSamples(Count);
if (Probe->isEntry()) {
FunctionProfile.getContext().getNameWithoutContext(), Count);
}
}
+
+ // Report dangling probes for frames that have real samples collected.
+ // Dangling probes are the probes associated to an empty block. With this
+ // place holder, sample count on a dangling probe will not be trusted by the
+ // compiler and we will rely on the counts inference algorithm to get the
+ // probe a reasonable count. Use InvalidProbeCount to mark sample count for
+ // a dangling probe.
+ for (auto &I : FrameSamples) {
+ auto *FunctionProfile = I.second;
+ for (auto *Probe : I.first->getProbes()) {
+ if (Probe->isDangling()) {
+ FunctionProfile->addBodySamplesForProbe(
+ Probe->Index, FunctionSamples::InvalidProbeCount);
+ }
+ }
+ }
}
}
// A list of NUM_INLINED_FUNCTIONS entries describing each of the
// inlined callees. Each record contains:
// INLINE SITE
- // GUID of the inlinee (uint64)
// Index of the callsite probe (ULEB128)
// FUNCTION BODY
// A FUNCTION BODY entry describing the inlined function.
uint32_t Index = 0;
// A DFS-based decoding
while (Data < End) {
- // Read inline site for inlinees
- if (Root != Cur) {
+ if (Root == Cur) {
+ // Use a sequential id for top level inliner.
+ Index = Root->getChildren().size();
+ } else {
+ // Read inline site for inlinees
Index = readUnsignedNumber<uint32_t>();
}
// Switch/add to a new tree node(inlinee)
Addr = readUnencodedNumber<int64_t>();
}
// Populate Address2ProbesMap
- std::vector<PseudoProbe> &ProbeVec = Address2ProbesMap[Addr];
- ProbeVec.emplace_back(Addr, Cur->GUID, Index, PseudoProbeType(Kind), Attr,
- Cur);
- Cur->addProbes(&ProbeVec.back());
+ auto &Probes = Address2ProbesMap[Addr];
+ Probes.emplace_back(Addr, Cur->GUID, Index, PseudoProbeType(Kind), Attr,
+ Cur);
+ Cur->addProbes(&Probes.back());
LastAddr = Addr;
}
auto It = Address2ProbesMap.find(Address);
if (It == Address2ProbesMap.end())
return nullptr;
- const std::vector<PseudoProbe> &Probes = It->second;
+ const auto &Probes = It->second;
const PseudoProbe *CallProbe = nullptr;
for (const auto &Probe : Probes) {
return std::get<0>(Site) ^ std::get<1>(Site);
}
};
- std::unordered_map<InlineSite, std::unique_ptr<PseudoProbeInlineTree>,
- InlineSiteHash>
- Children;
+ using InlinedProbeTreeMap =
+ std::unordered_map<InlineSite, std::unique_ptr<PseudoProbeInlineTree>,
+ InlineSiteHash>;
+ InlinedProbeTreeMap Children;
public:
// Inlinee function GUID
return Ret.first->second.get();
}
+ InlinedProbeTreeMap &getChildren() { return Children; }
+ std::vector<PseudoProbe *> &getProbes() { return ProbeVector; }
void addProbes(PseudoProbe *Probe) { ProbeVector.push_back(Probe); }
// Return false if it's a dummy inline site
bool hasInlineSite() const { return std::get<0>(ISite) != 0; }
// GUID to PseudoProbeFuncDesc map
using GUIDProbeFunctionMap = std::unordered_map<uint64_t, PseudoProbeFuncDesc>;
// Address to pseudo probes map.
-using AddressProbesMap = std::unordered_map<uint64_t, std::vector<PseudoProbe>>;
+using AddressProbesMap = std::unordered_map<uint64_t, std::list<PseudoProbe>>;
/*
A pseudo probe has the format like below:
bool isDirectCall() const { return Type == PseudoProbeType::DirectCall; }
bool isCall() const { return isIndirectCall() || isDirectCall(); }
+ PseudoProbeInlineTree *getInlineTreeNode() const { return InlineTree; }
+
// Get the inlined context by traversing current inline tree backwards,
// each tree node has its InlineSite which is taken as the context.
// \p ContextStack is populated in root to leaf order