// This file is a part of XRay, a function call tracing system.
//
//===----------------------------------------------------------------------===//
-#include "gtest/gtest.h"
-
#include "xray_function_call_trie.h"
+#include "gtest/gtest.h"
+#include <cstdint>
+#include <limits>
namespace __xray {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 1);
- Trie.exitFunction(1, 2);
+ uint64_t TSC = 1;
+ uint16_t CPU = 0;
+ Trie.enterFunction(1, TSC++, CPU++);
+ Trie.exitFunction(1, TSC++, CPU++);
+ const auto &R = Trie.getRoots();
- // We need a way to pull the data out. At this point, until we get a data
- // collection service implemented, we're going to export the data as a list of
- // roots, and manually walk through the structure ourselves.
+ ASSERT_EQ(R.size(), 1u);
+ ASSERT_EQ(R.front()->FId, 1);
+ ASSERT_EQ(R.front()->CallCount, 1u);
+ ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
+}
+
+TEST(FunctionCallTrieTest, HandleTSCOverflow) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ // Enter at the largest representable TSC, then exit after the counter has
+ // wrapped around to 1. The trie must account the delta across the wrap
+ // instead of computing a huge underflowed duration.
+ Trie.enterFunction(1, std::numeric_limits<uint64_t>::max(), 0);
+ Trie.exitFunction(1, 1, 0);
const auto &R = Trie.getRoots();
ASSERT_EQ(R.size(), 1u);
ASSERT_EQ(R.front()->FId, 1);
- ASSERT_EQ(R.front()->CallCount, 1);
+ ASSERT_EQ(R.front()->CallCount, 1u);
+ // Wrapped delta: (uint64 max - EntryTSC) + TSC == (max - max) + 1 == 1.
ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
}
+TEST(FunctionCallTrieTest, MaximalCumulativeTime) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ // Entry TSC 1 and exit TSC 0 is treated as an overflow, producing the
+ // largest delta the accounting can yield:
+ // (uint64 max - EntryTSC) + TSC == (max - 1) + 0 == max - 1.
+ Trie.enterFunction(1, 1, 0);
+ Trie.exitFunction(1, 0, 0);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_EQ(R.size(), 1u);
+ ASSERT_EQ(R.front()->FId, 1);
+ ASSERT_EQ(R.front()->CallCount, 1u);
+ ASSERT_EQ(R.front()->CumulativeLocalTime,
+ std::numeric_limits<uint64_t>::max() - 1);
+}
+
TEST(FunctionCallTrieTest, MissingFunctionEntry) {
profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.exitFunction(1, 1);
+ Trie.exitFunction(1, 1, 0);
const auto &R = Trie.getRoots();
ASSERT_TRUE(R.empty());
profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(2, 1);
- Trie.enterFunction(3, 3);
- Trie.exitFunction(1, 5);
+ Trie.enterFunction(2, 1, 0);
+ Trie.enterFunction(3, 3, 0);
+ Trie.exitFunction(1, 5, 0);
const auto &R = Trie.getRoots();
ASSERT_FALSE(R.empty());
profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 1);
+ Trie.enterFunction(1, 1, 0);
const auto &R = Trie.getRoots();
ASSERT_FALSE(R.empty());
FunctionCallTrie Trie(A);
// Enter and exit FId = 1.
- Trie.enterFunction(1, 1);
- Trie.exitFunction(1, 2);
+ Trie.enterFunction(1, 1, 0);
+ Trie.exitFunction(1, 2, 0);
// Enter and exit FId = 2.
- Trie.enterFunction(2, 3);
- Trie.exitFunction(2, 4);
+ Trie.enterFunction(2, 3, 0);
+ Trie.exitFunction(2, 4, 0);
const auto &R = Trie.getRoots();
ASSERT_FALSE(R.empty());
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 0);
- Trie.enterFunction(2, 100);
- Trie.enterFunction(3, 200);
- Trie.exitFunction(3, 300);
- Trie.exitFunction(1, 400);
+ Trie.enterFunction(1, 0, 0);
+ Trie.enterFunction(2, 100, 0);
+ Trie.enterFunction(3, 200, 0);
+ Trie.exitFunction(3, 300, 0);
+ Trie.exitFunction(1, 400, 0);
// What we should see at this point is all the functions in the trie in a
// specific order (1 -> 2 -> 3) with the appropriate count(s) and local
// Now that we've established the preconditions, we check for specific aspects
// of the nodes.
- EXPECT_EQ(F3.CallCount, 1);
- EXPECT_EQ(F2.CallCount, 1);
- EXPECT_EQ(F1.CallCount, 1);
- EXPECT_EQ(F3.CumulativeLocalTime, 100);
- EXPECT_EQ(F2.CumulativeLocalTime, 300);
- EXPECT_EQ(F1.CumulativeLocalTime, 100);
+ EXPECT_EQ(F3.CallCount, 1u);
+ EXPECT_EQ(F2.CallCount, 1u);
+ EXPECT_EQ(F1.CallCount, 1u);
+ EXPECT_EQ(F3.CumulativeLocalTime, 100u);
+ EXPECT_EQ(F2.CumulativeLocalTime, 300u);
+ EXPECT_EQ(F1.CumulativeLocalTime, 100u);
}
TEST(FunctionCallTrieTest, DeepCallStack) {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
for (int i = 0; i < 32; ++i)
- Trie.enterFunction(i + 1, i);
- Trie.exitFunction(1, 33);
+ Trie.enterFunction(i + 1, i, 0);
+ Trie.exitFunction(1, 33, 0);
// Here, validate that we have a 32-level deep function call path from the
// root (1) down to the leaf (33).
auto F = R[0];
for (int i = 0; i < 32; ++i) {
EXPECT_EQ(F->FId, i + 1);
- EXPECT_EQ(F->CallCount, 1);
+ EXPECT_EQ(F->CallCount, 1u);
if (F->Callees.empty() && i != 31)
FAIL() << "Empty callees for FId " << F->FId;
if (i != 31)
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 0);
- Trie.enterFunction(2, 1);
- Trie.exitFunction(2, 2);
- Trie.enterFunction(3, 3);
- Trie.exitFunction(3, 4);
- Trie.exitFunction(1, 5);
+ Trie.enterFunction(1, 0, 0);
+ Trie.enterFunction(2, 1, 0);
+ Trie.exitFunction(2, 2, 0);
+ Trie.enterFunction(3, 3, 0);
+ Trie.exitFunction(3, 4, 0);
+ Trie.exitFunction(1, 5, 0);
// We want to make a deep copy and compare notes.
auto B = FunctionCallTrie::InitAllocators();
FunctionCallTrie T1(A);
// 1 -> 2 -> 3
- T0.enterFunction(1, 0);
- T0.enterFunction(2, 1);
- T0.enterFunction(3, 2);
- T0.exitFunction(3, 3);
- T0.exitFunction(2, 4);
- T0.exitFunction(1, 5);
+ T0.enterFunction(1, 0, 0);
+ T0.enterFunction(2, 1, 0);
+ T0.enterFunction(3, 2, 0);
+ T0.exitFunction(3, 3, 0);
+ T0.exitFunction(2, 4, 0);
+ T0.exitFunction(1, 5, 0);
// 1 -> 2 -> 3
- T1.enterFunction(1, 0);
- T1.enterFunction(2, 1);
- T1.enterFunction(3, 2);
- T1.exitFunction(3, 3);
- T1.exitFunction(2, 4);
- T1.exitFunction(1, 5);
+ T1.enterFunction(1, 0, 0);
+ T1.enterFunction(2, 1, 0);
+ T1.enterFunction(3, 2, 0);
+ T1.exitFunction(3, 3, 0);
+ T1.exitFunction(2, 4, 0);
+ T1.exitFunction(1, 5, 0);
// We use a different allocator here to make sure that we're able to transfer
// data into a FunctionCallTrie which uses a different allocator. This
ASSERT_EQ(Merged.getRoots().size(), 1u);
const auto &R0 = *Merged.getRoots()[0];
EXPECT_EQ(R0.FId, 1);
- EXPECT_EQ(R0.CallCount, 2);
- EXPECT_EQ(R0.CumulativeLocalTime, 10);
+ EXPECT_EQ(R0.CallCount, 2u);
+ EXPECT_EQ(R0.CumulativeLocalTime, 10u);
EXPECT_EQ(R0.Callees.size(), 1u);
const auto &F1 = *R0.Callees[0].NodePtr;
EXPECT_EQ(F1.FId, 2);
- EXPECT_EQ(F1.CallCount, 2);
- EXPECT_EQ(F1.CumulativeLocalTime, 6);
+ EXPECT_EQ(F1.CallCount, 2u);
+ EXPECT_EQ(F1.CumulativeLocalTime, 6u);
EXPECT_EQ(F1.Callees.size(), 1u);
const auto &F2 = *F1.Callees[0].NodePtr;
EXPECT_EQ(F2.FId, 3);
- EXPECT_EQ(F2.CallCount, 2);
- EXPECT_EQ(F2.CumulativeLocalTime, 2);
+ EXPECT_EQ(F2.CallCount, 2u);
+ EXPECT_EQ(F2.CumulativeLocalTime, 2u);
EXPECT_EQ(F2.Callees.size(), 0u);
}
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
+#include <limits>
#include <memory> // For placement new.
#include <utility>
struct Node {
Node *Parent;
NodeIdPairArray Callees;
- int64_t CallCount;
- int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
+ uint64_t CallCount;
+ uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
int32_t FId;
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+ Node(Node *P, NodeIdPairAllocatorType &A, uint64_t CC, uint64_t CLT,
int32_t F) XRAY_NEVER_INSTRUMENT : Parent(P),
Callees(A),
CallCount(CC),
+// A shadow-stack entry remembers where (and when) a function was entered so
+// that the matching exit can compute the elapsed TSC delta.
struct ShadowStackEntry {
uint64_t EntryTSC;
Node *NodePtr;
+ // CPU the function was entered on. Currently only recorded at entry; no
+ // reader is visible in this change — presumably reserved for the planned
+ // CPU-migration accounting (see the TODO in exitFunction). Confirm before
+ // relying on it.
+ uint16_t EntryCPU;
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- ShadowStackEntry(uint64_t T, Node *N) XRAY_NEVER_INSTRUMENT : EntryTSC{T},
- NodePtr{N} {}
+ ShadowStackEntry(uint64_t T, Node *N, uint16_t C) XRAY_NEVER_INSTRUMENT
+ : EntryTSC{T},
+ NodePtr{N},
+ EntryCPU{C} {}
};
using NodeArray = Array<Node>;
ShadowStack(*A.ShadowStackAllocator),
NodeIdPairAllocator(A.NodeIdPairAllocator) {}
- void enterFunction(const int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+ void enterFunction(const int32_t FId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
DCHECK_NE(FId, 0);
// This function primarily deals with ensuring that the ShadowStack is
// consistent and ready for when an exit event is encountered.
if (UNLIKELY(ShadowStack.empty())) {
auto NewRoot =
- Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+ Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0u, 0u, FId);
if (UNLIKELY(NewRoot == nullptr))
return;
Roots.Append(NewRoot);
- ShadowStack.AppendEmplace(TSC, NewRoot);
+ ShadowStack.AppendEmplace(TSC, NewRoot, CPU);
return;
}
[FId](const NodeIdPair &NR) { return NR.FId == FId; });
if (Callee != nullptr) {
CHECK_NE(Callee->NodePtr, nullptr);
- ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
+ ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU);
return;
}
// This means we've never seen this stack before, create a new node here.
auto NewNode =
- Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+ Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0u, 0u, FId);
if (UNLIKELY(NewNode == nullptr))
return;
DCHECK_NE(NewNode, nullptr);
TopNode->Callees.AppendEmplace(NewNode, FId);
- ShadowStack.AppendEmplace(TSC, NewNode);
+ ShadowStack.AppendEmplace(TSC, NewNode, CPU);
DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
return;
}
- void exitFunction(int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+ void exitFunction(int32_t FId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
// When we exit a function, we look up the ShadowStack to see whether we've
// entered this function before. We do as little processing here as we can,
// since most of the hard work would have already been done at function
const auto &Top = ShadowStack.back();
auto TopNode = Top.NodePtr;
DCHECK_NE(TopNode, nullptr);
- auto LocalTime = TSC - Top.EntryTSC;
+
+ // We may encounter overflow on the TSC we're provided, which may end up
+ // being less than the TSC when we first entered the function.
+ //
+ // To get the accurate measurement of cycles, we need to check whether
+ // we've overflowed (TSC < Top.EntryTSC) and then account the difference
+ // between the entry TSC and the max for the TSC counter (max of uint64_t)
+ // then add the value of TSC. We can prove that the maximum delta we will
+ // get is at most the 64-bit unsigned value, since the difference between
+ // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits<uint64_t>::max()
+ // - 1) + 1.
+ //
+ // NOTE: This assumes that TSCs are synchronised across CPUs.
+ // TODO: Count the number of times we've seen CPU migrations.
+ uint64_t LocalTime =
+ Top.EntryTSC > TSC
+ ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC
+ : TSC - Top.EntryTSC;
TopNode->CallCount++;
TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
CumulativeTreeTime += LocalTime;
auto R = O.Roots.find_element(
[&](const Node *Node) { return Node->FId == Root->FId; });
if (R == nullptr) {
- TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
- 0, Root->FId);
+ TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0u,
+ 0u, Root->FId);
if (UNLIKELY(TargetRoot == nullptr))
return;
});
if (TargetCallee == nullptr) {
auto NewTargetNode = O.Nodes.AppendEmplace(
- NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+ NT.TargetNode, *O.NodeIdPairAllocator, 0u, 0u, Callee.FId);
if (UNLIKELY(NewTargetNode == nullptr))
return;