// This file is a part of XRay, a function call tracing system.
//
//===----------------------------------------------------------------------===//
-#include "gtest/gtest.h"
-
#include "xray_function_call_trie.h"
+#include "gtest/gtest.h"
+#include <cstdint>
+#include <limits>
namespace __xray {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 1);
- Trie.exitFunction(1, 2);
+ uint64_t TSC = 1;
+ uint16_t CPU = 0;
+ Trie.enterFunction(1, TSC++, CPU++);
+ Trie.exitFunction(1, TSC++, CPU++);
+ const auto &R = Trie.getRoots();
- // We need a way to pull the data out. At this point, until we get a data
- // collection service implemented, we're going to export the data as a list of
- // roots, and manually walk through the structure ourselves.
+ ASSERT_EQ(R.size(), 1u);
+ ASSERT_EQ(R.front()->FId, 1);
+ ASSERT_EQ(R.front()->CallCount, 1u);
+ ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
+}
+
+TEST(FunctionCallTrieTest, HandleTSCOverflow) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ // Enter at the largest representable TSC, then exit after the counter has
+ // wrapped around to 1. The trie must account the delta across the wrap
+ // instead of computing a huge underflowed duration.
+ Trie.enterFunction(1, std::numeric_limits<uint64_t>::max(), 0);
+ Trie.exitFunction(1, 1, 0);
const auto &R = Trie.getRoots();
ASSERT_EQ(R.size(), 1u);
ASSERT_EQ(R.front()->FId, 1);
- ASSERT_EQ(R.front()->CallCount, 1);
+ ASSERT_EQ(R.front()->CallCount, 1u);
+ // Wrapped delta: (uint64 max - EntryTSC) + TSC == (max - max) + 1 == 1.
ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
}
+TEST(FunctionCallTrieTest, MaximalCumulativeTime) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ // Entry TSC 1 and exit TSC 0 is treated as an overflow, producing the
+ // largest delta the accounting can yield:
+ // (uint64 max - EntryTSC) + TSC == (max - 1) + 0 == max - 1.
+ Trie.enterFunction(1, 1, 0);
+ Trie.exitFunction(1, 0, 0);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_EQ(R.size(), 1u);
+ ASSERT_EQ(R.front()->FId, 1);
+ ASSERT_EQ(R.front()->CallCount, 1u);
+ ASSERT_EQ(R.front()->CumulativeLocalTime,
+ std::numeric_limits<uint64_t>::max() - 1);
+}
+
TEST(FunctionCallTrieTest, MissingFunctionEntry) {
profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.exitFunction(1, 1);
+ Trie.exitFunction(1, 1, 0);
const auto &R = Trie.getRoots();
ASSERT_TRUE(R.empty());
profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(2, 1);
- Trie.enterFunction(3, 3);
- Trie.exitFunction(1, 5);
+ Trie.enterFunction(2, 1, 0);
+ Trie.enterFunction(3, 3, 0);
+ Trie.exitFunction(1, 5, 0);
const auto &R = Trie.getRoots();
ASSERT_FALSE(R.empty());
profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 1);
+ Trie.enterFunction(1, 1, 0);
const auto &R = Trie.getRoots();
ASSERT_FALSE(R.empty());
FunctionCallTrie Trie(A);
// Enter and exit FId = 1.
- Trie.enterFunction(1, 1);
- Trie.exitFunction(1, 2);
+ Trie.enterFunction(1, 1, 0);
+ Trie.exitFunction(1, 2, 0);
// Enter and exit FId = 2.
- Trie.enterFunction(2, 3);
- Trie.exitFunction(2, 4);
+ Trie.enterFunction(2, 3, 0);
+ Trie.exitFunction(2, 4, 0);
const auto &R = Trie.getRoots();
ASSERT_FALSE(R.empty());
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 0);
- Trie.enterFunction(2, 100);
- Trie.enterFunction(3, 200);
- Trie.exitFunction(3, 300);
- Trie.exitFunction(1, 400);
+ Trie.enterFunction(1, 0, 0);
+ Trie.enterFunction(2, 100, 0);
+ Trie.enterFunction(3, 200, 0);
+ Trie.exitFunction(3, 300, 0);
+ Trie.exitFunction(1, 400, 0);
// What we should see at this point is all the functions in the trie in a
// specific order (1 -> 2 -> 3) with the appropriate count(s) and local
// Now that we've established the preconditions, we check for specific aspects
// of the nodes.
- EXPECT_EQ(F3.CallCount, 1);
- EXPECT_EQ(F2.CallCount, 1);
- EXPECT_EQ(F1.CallCount, 1);
- EXPECT_EQ(F3.CumulativeLocalTime, 100);
- EXPECT_EQ(F2.CumulativeLocalTime, 300);
- EXPECT_EQ(F1.CumulativeLocalTime, 100);
+ EXPECT_EQ(F3.CallCount, 1u);
+ EXPECT_EQ(F2.CallCount, 1u);
+ EXPECT_EQ(F1.CallCount, 1u);
+ EXPECT_EQ(F3.CumulativeLocalTime, 100u);
+ EXPECT_EQ(F2.CumulativeLocalTime, 300u);
+ EXPECT_EQ(F1.CumulativeLocalTime, 100u);
}
TEST(FunctionCallTrieTest, DeepCallStack) {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
for (int i = 0; i < 32; ++i)
- Trie.enterFunction(i + 1, i);
- Trie.exitFunction(1, 33);
+ Trie.enterFunction(i + 1, i, 0);
+ Trie.exitFunction(1, 33, 0);
// Here, validate that we have a 32-level deep function call path from the
// root (1) down to the leaf (33).
auto F = R[0];
for (int i = 0; i < 32; ++i) {
EXPECT_EQ(F->FId, i + 1);
- EXPECT_EQ(F->CallCount, 1);
+ EXPECT_EQ(F->CallCount, 1u);
if (F->Callees.empty() && i != 31)
FAIL() << "Empty callees for FId " << F->FId;
if (i != 31)
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 0);
- Trie.enterFunction(2, 1);
- Trie.exitFunction(2, 2);
- Trie.enterFunction(3, 3);
- Trie.exitFunction(3, 4);
- Trie.exitFunction(1, 5);
+ Trie.enterFunction(1, 0, 0);
+ Trie.enterFunction(2, 1, 0);
+ Trie.exitFunction(2, 2, 0);
+ Trie.enterFunction(3, 3, 0);
+ Trie.exitFunction(3, 4, 0);
+ Trie.exitFunction(1, 5, 0);
// We want to make a deep copy and compare notes.
auto B = FunctionCallTrie::InitAllocators();
FunctionCallTrie T1(A);
// 1 -> 2 -> 3
- T0.enterFunction(1, 0);
- T0.enterFunction(2, 1);
- T0.enterFunction(3, 2);
- T0.exitFunction(3, 3);
- T0.exitFunction(2, 4);
- T0.exitFunction(1, 5);
+ T0.enterFunction(1, 0, 0);
+ T0.enterFunction(2, 1, 0);
+ T0.enterFunction(3, 2, 0);
+ T0.exitFunction(3, 3, 0);
+ T0.exitFunction(2, 4, 0);
+ T0.exitFunction(1, 5, 0);
// 1 -> 2 -> 3
- T1.enterFunction(1, 0);
- T1.enterFunction(2, 1);
- T1.enterFunction(3, 2);
- T1.exitFunction(3, 3);
- T1.exitFunction(2, 4);
- T1.exitFunction(1, 5);
+ T1.enterFunction(1, 0, 0);
+ T1.enterFunction(2, 1, 0);
+ T1.enterFunction(3, 2, 0);
+ T1.exitFunction(3, 3, 0);
+ T1.exitFunction(2, 4, 0);
+ T1.exitFunction(1, 5, 0);
// We use a different allocator here to make sure that we're able to transfer
// data into a FunctionCallTrie which uses a different allocator. This
ASSERT_EQ(Merged.getRoots().size(), 1u);
const auto &R0 = *Merged.getRoots()[0];
EXPECT_EQ(R0.FId, 1);
- EXPECT_EQ(R0.CallCount, 2);
- EXPECT_EQ(R0.CumulativeLocalTime, 10);
+ EXPECT_EQ(R0.CallCount, 2u);
+ EXPECT_EQ(R0.CumulativeLocalTime, 10u);
EXPECT_EQ(R0.Callees.size(), 1u);
const auto &F1 = *R0.Callees[0].NodePtr;
EXPECT_EQ(F1.FId, 2);
- EXPECT_EQ(F1.CallCount, 2);
- EXPECT_EQ(F1.CumulativeLocalTime, 6);
+ EXPECT_EQ(F1.CallCount, 2u);
+ EXPECT_EQ(F1.CumulativeLocalTime, 6u);
EXPECT_EQ(F1.Callees.size(), 1u);
const auto &F2 = *F1.Callees[0].NodePtr;
EXPECT_EQ(F2.FId, 3);
- EXPECT_EQ(F2.CallCount, 2);
- EXPECT_EQ(F2.CumulativeLocalTime, 2);
+ EXPECT_EQ(F2.CallCount, 2u);
+ EXPECT_EQ(F2.CumulativeLocalTime, 2u);
EXPECT_EQ(F2.Callees.size(), 0u);
}
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
+#include <limits>
#include <memory> // For placement new.
#include <utility>
struct Node {
Node *Parent;
NodeIdPairArray Callees;
- int64_t CallCount;
- int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
+ uint64_t CallCount;
+ uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
int32_t FId;
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+ Node(Node *P, NodeIdPairAllocatorType &A, uint64_t CC, uint64_t CLT,
int32_t F) XRAY_NEVER_INSTRUMENT : Parent(P),
Callees(A),
CallCount(CC),
+// A shadow-stack entry remembers where (and when) a function was entered so
+// that the matching exit can compute the elapsed TSC delta.
struct ShadowStackEntry {
uint64_t EntryTSC;
Node *NodePtr;
+ // CPU the function was entered on. Currently only recorded at entry; no
+ // reader is visible in this change — presumably reserved for the planned
+ // CPU-migration accounting (see the TODO in exitFunction). Confirm before
+ // relying on it.
+ uint16_t EntryCPU;
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- ShadowStackEntry(uint64_t T, Node *N) XRAY_NEVER_INSTRUMENT : EntryTSC{T},
- NodePtr{N} {}
+ ShadowStackEntry(uint64_t T, Node *N, uint16_t C) XRAY_NEVER_INSTRUMENT
+ : EntryTSC{T},
+ NodePtr{N},
+ EntryCPU{C} {}
};
using NodeArray = Array<Node>;
ShadowStack(*A.ShadowStackAllocator),
NodeIdPairAllocator(A.NodeIdPairAllocator) {}
- void enterFunction(const int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+ void enterFunction(const int32_t FId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
DCHECK_NE(FId, 0);
// This function primarily deals with ensuring that the ShadowStack is
// consistent and ready for when an exit event is encountered.
if (UNLIKELY(ShadowStack.empty())) {
auto NewRoot =
- Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+ Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0u, 0u, FId);
if (UNLIKELY(NewRoot == nullptr))
return;
Roots.Append(NewRoot);
- ShadowStack.AppendEmplace(TSC, NewRoot);
+ ShadowStack.AppendEmplace(TSC, NewRoot, CPU);
return;
}
[FId](const NodeIdPair &NR) { return NR.FId == FId; });
if (Callee != nullptr) {
CHECK_NE(Callee->NodePtr, nullptr);
- ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
+ ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU);
return;
}
// This means we've never seen this stack before, create a new node here.
auto NewNode =
- Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+ Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0u, 0u, FId);
if (UNLIKELY(NewNode == nullptr))
return;
DCHECK_NE(NewNode, nullptr);
TopNode->Callees.AppendEmplace(NewNode, FId);
- ShadowStack.AppendEmplace(TSC, NewNode);
+ ShadowStack.AppendEmplace(TSC, NewNode, CPU);
DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
return;
}
- void exitFunction(int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+ void exitFunction(int32_t FId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
// When we exit a function, we look up the ShadowStack to see whether we've
// entered this function before. We do as little processing here as we can,
// since most of the hard work would have already been done at function
const auto &Top = ShadowStack.back();
auto TopNode = Top.NodePtr;
DCHECK_NE(TopNode, nullptr);
- auto LocalTime = TSC - Top.EntryTSC;
+
+ // We may encounter overflow on the TSC we're provided, which may end up
+ // being less than the TSC when we first entered the function.
+ //
+ // To get the accurate measurement of cycles, we need to check whether
+ // we've overflowed (TSC < Top.EntryTSC) and then account the difference
+ // between the entry TSC and the max for the TSC counter (max of uint64_t)
+ // then add the value of TSC. We can prove that the maximum delta we will
+ // get is at most the 64-bit unsigned value, since the difference between
+ // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits<uint64_t>::max()
+ // - 1) + 1.
+ //
+ // NOTE: This assumes that TSCs are synchronised across CPUs.
+ // TODO: Count the number of times we've seen CPU migrations.
+ uint64_t LocalTime =
+ Top.EntryTSC > TSC
+ ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC
+ : TSC - Top.EntryTSC;
TopNode->CallCount++;
TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
CumulativeTreeTime += LocalTime;
auto R = O.Roots.find_element(
[&](const Node *Node) { return Node->FId == Root->FId; });
if (R == nullptr) {
- TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
- 0, Root->FId);
+ TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0u,
+ 0u, Root->FId);
if (UNLIKELY(TargetRoot == nullptr))
return;
});
if (TargetCallee == nullptr) {
auto NewTargetNode = O.Nodes.AppendEmplace(
- NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+ NT.TargetNode, *O.NodeIdPairAllocator, 0u, 0u, Callee.FId);
if (UNLIKELY(NewTargetNode == nullptr))
return;