From 600eeecbf4556e733b1ab22570bbb2700a420fdb Mon Sep 17 00:00:00 2001
From: Ilia Cherniavskii
Date: Thu, 28 Mar 2019 17:42:47 -0700
Subject: [PATCH] Add external callbacks into RecordFunction (#17844)

Summary:
Add a way to insert external callbacks into PT's RecordFunction
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17844

Differential Revision: D14399664

Pulled By: ilia-cher

fbshipit-source-id: 76654799811fefd3ffed4abfb46ed95b492cebab
---
 test/cpp/jit/test.cpp                   |   3 +-
 test/cpp/jit/test_misc.h                |  66 +++++++++++++++++++++
 tools/build_variables.py                |   1 +
 torch/CMakeLists.txt                    |   1 +
 torch/csrc/autograd/profiler.cpp        |  62 ++++++-------------
 torch/csrc/autograd/profiler.h          |  32 ++--------
 torch/csrc/autograd/record_function.cpp |  83 ++++++++++++++++++++++++++
 torch/csrc/autograd/record_function.h   | 102 ++++++++++++++++++++++++++++++++
 8 files changed, 278 insertions(+), 72 deletions(-)
 create mode 100644 torch/csrc/autograd/record_function.cpp
 create mode 100644 torch/csrc/autograd/record_function.h

diff --git a/test/cpp/jit/test.cpp b/test/cpp/jit/test.cpp
index 1c4823d..20cab91 100644
--- a/test/cpp/jit/test.cpp
+++ b/test/cpp/jit/test.cpp
@@ -63,7 +63,8 @@ namespace jit {
   _(ATenNativeBatchNorm)        \
   _(NoneSchemaMatch)            \
   _(ClassParser)                \
-  _(PeepholeOptimize)
+  _(PeepholeOptimize)           \
+  _(RecordFunction)
 
 #define TH_FORALL_TESTS_CUDA(_) \
   _(ArgumentSpec)               \
diff --git a/test/cpp/jit/test_misc.h b/test/cpp/jit/test_misc.h
index 10cd871..7a314ad 100644
--- a/test/cpp/jit/test_misc.h
+++ b/test/cpp/jit/test_misc.h
@@ -575,6 +575,72 @@ void testTopologicalIndex() {
   }
 }
 
+void invokeTestRecordFunction(at::Tensor& t) {
+  autograd::profiler::GetPackedInputsCallback inputs_cb = [t]() {
+    Stack st;
+    pack(st, t);
+    return st;
+  };
+  autograd::profiler::RecordFunction guard("test", inputs_cb);
+  t.add_(torch::ones_like(t));
+}
+
+std::string getFullName(const autograd::profiler::RecordFunction* fn_ptr) {
+  std::string full_name = "";
+  while (fn_ptr != nullptr) {
+    if (!full_name.empty()) {
+      full_name = std::string(fn_ptr->name().str()) + "::" + full_name;
+    } else {
+      full_name = fn_ptr->name().str();
+    }
+    fn_ptr = fn_ptr->parent();
+  }
+  return full_name;
+}
+
+void invokeTestRecordFunctionNested() {
+  autograd::profiler::RecordFunction guard("inner");
+}
+
+void testRecordFunction() {
+  std::vector<std::vector<int64_t>> input_sizes;
+  autograd::profiler::pushCallback([&input_sizes](
+      const autograd::profiler::RecordFunction& fn) {
+    for (const auto& input : fn.inputs()) {
+      if (input.isTensor()) {
+        std::vector<int64_t> t = input.toTensor().sizes().vec();
+        input_sizes.push_back(t);
+      }
+    }
+  });
+
+  auto t = torch::randn({1, 2, 3}, at::kCPU);
+  invokeTestRecordFunction(t);
+
+  autograd::profiler::popCallback();
+
+  AT_CHECK(input_sizes.size() == 1);
+  AT_CHECK(input_sizes[0] == at::IntArrayRef({1, 2, 3}));
+
+  // test nested RecordFunctions
+  std::vector<std::string> nested_names;
+  autograd::profiler::pushCallback([&nested_names](
+      const autograd::profiler::RecordFunction& fn) {
+    nested_names.push_back(getFullName(&fn));
+  });
+
+  {
+    autograd::profiler::RecordFunction guard("outer");
+    invokeTestRecordFunctionNested();
+  }
+
+  autograd::profiler::popCallback();
+  AT_CHECK(nested_names.size() == 2);
+  AT_CHECK(nested_names[0] == "outer");
+  AT_CHECK(nested_names[1] == "outer::inner");
+}
+
 void testAutogradProfiler() {
   constexpr int batch_size = 4;
   constexpr int input_size = 256;
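The tests above exercise the whole flow end to end. As a compact standalone
sketch, an external observer would use the new API roughly as follows
(installObserver/removeObserver are hypothetical helper names, not part of
this patch):

  #include <torch/csrc/autograd/record_function.h>
  #include <cstdio>

  using namespace torch::autograd::profiler;

  void installObserver() {
    pushCallback(
        // start callback: runs when a RecordFunction guard is constructed
        [](const RecordFunction& fn) {
          printf("enter %s, seq=%lld\n", fn.name().str(), (long long)fn.seqNr());
        },
        // end callback: runs when the guard is destructed
        [](const RecordFunction& fn) {
          printf("exit %s\n", fn.name().str());
        });
  }

  void removeObserver() {
    popCallback();  // pops both the start and end callbacks together
  }
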
diff --git a/tools/build_variables.py b/tools/build_variables.py
index 503eb6a..a293a56 100644
--- a/tools/build_variables.py
+++ b/tools/build_variables.py
@@ -45,6 +45,7 @@ libtorch_sources = [
     "torch/csrc/autograd/grad_mode.cpp",
     "torch/csrc/autograd/input_buffer.cpp",
     "torch/csrc/autograd/profiler.cpp",
+    "torch/csrc/autograd/record_function.cpp",
     "torch/csrc/autograd/saved_variable.cpp",
     "torch/csrc/autograd/variable.cpp",
     "torch/csrc/Exceptions.cpp",
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index c2d7783..5db5260 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -117,6 +117,7 @@ set(TORCH_SRCS
   ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp
+  ${TORCH_SRC_DIR}/csrc/autograd/record_function.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/variable.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp
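The profiler changes below replace Event's owned_name_/name_ptr_ pair with the
StringView type introduced by this patch (defined in record_function.h further
down). As a rough sketch of its semantics, under the definitions in that
header:

  // Borrows the pointer; no allocation, suitable for string literals.
  StringView borrowed("literal");
  // Owns a copy via shared_ptr<std::string>; safe for dynamically built names.
  StringView owned(std::string("dynamic"));
  // The pointer stays valid for as long as any copy of `owned` is alive.
  const char* p = owned.str();
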
diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp
index 555ca22..b68dd28 100644
--- a/torch/csrc/autograd/profiler.cpp
+++ b/torch/csrc/autograd/profiler.cpp
@@ -34,88 +34,51 @@ RangeEventList& getEventList() {
 }
 
 void mark(std::string name, bool include_cuda /* = true */) {
-  if (state == ProfilerState::Disabled) {
-    return;
-  }
   if (state == ProfilerState::NVTX) {
     cuda_stubs->nvtxMarkA(name.c_str());
   } else {
     getEventList().record(
         EventKind::Mark,
-        std::move(name),
+        StringView(std::move(name)),
         thread_id,
         include_cuda && state == ProfilerState::CUDA);
   }
 }
 
-const char* c_str(const char *str) { return str; }
-// NB: non-const to disallow temporaries (lifetime issues)
-const char* c_str(std::string& str) { return str.c_str(); }
-
-template<typename T>
-void pushRangeImpl(T name, const char* msg="", int64_t sequence_nr=-1) {
-  if (state == ProfilerState::Disabled) {
-    return;
-  }
+void pushRangeImpl(const StringView& name, const char* msg="", int64_t sequence_nr=-1) {
   if (state == ProfilerState::NVTX) {
     if(sequence_nr >= 0) {
       std::stringstream s;
-      s << name << msg << sequence_nr;
+      s << name.str() << msg << sequence_nr;
       cuda_stubs->nvtxRangePushA(s.str().c_str());
     } else {
-      cuda_stubs->nvtxRangePushA(c_str(name));
+      cuda_stubs->nvtxRangePushA(name.str());
     }
   } else {
     getEventList().record(
         EventKind::PushRange,
-        std::move(name),
+        name,
         thread_id,
         state == ProfilerState::CUDA);
   }
 }
 
 void pushRange(std::string name) {
-  pushRangeImpl(std::move(name));
+  pushRangeImpl(StringView(std::move(name)));
 }
 
 void popRange() {
-  if (state == ProfilerState::Disabled) {
-    return;
-  }
   if (state == ProfilerState::NVTX) {
     cuda_stubs->nvtxRangePop();
   } else {
     getEventList().record(
         EventKind::PopRange,
-        "",
+        StringView(""),
         thread_id,
         state == ProfilerState::CUDA);
   }
 }
 
-RecordFunction::RecordFunction(Function* fn) {
-  // typeid(*fn).name() would avoid an additional string allocation.
-  // However, typeid(*fn).name() would cause nvtx annotations for all user-defined
-  // (Python-side) custom autograd function backward() methods to have the same name,
-  // because they route through the same C++ side class.
-  // fn->name() ensures that nvtx annotations for custom function backward() methods
-  // receive a relevant, demangled name.
-  pushRangeImpl(fn->name(), ", stashed seq=", fn->sequence_nr());
-}
-
-RecordFunction::RecordFunction(std::string name) {
-  pushRangeImpl(std::move(name));
-}
-
-RecordFunction::RecordFunction(const char* name) {
-  pushRangeImpl(name);
-}
-
-RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
-{
-  pushRangeImpl(name, ", seq=", current_sequence_nr);
-}
-
 void enableProfiler(ProfilerState new_state) {
   AT_ASSERT(new_state != ProfilerState::Disabled);
   if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled())
@@ -123,6 +86,14 @@ void enableProfiler(ProfilerState new_state) {
   if (state != ProfilerState::Disabled && new_state != state) {
     throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
   }
+
+  pushCallback([](const RecordFunction& fn) {
+    auto* msg = (fn.seqNr() >= 0) ? ", seq = " : "";
+    pushRangeImpl(fn.name(), msg, fn.seqNr());
+  },
+  [](const RecordFunction& /* unused */) {
+    popRange();
+  });
   state = new_state;
 
   if(state == ProfilerState::CUDA) {
@@ -151,7 +122,10 @@ thread_event_lists disableProfiler() {
   }
   ProfilerState old_state = state;
   mark("__stop_profile");
+
+  popCallback();
   state = ProfilerState::Disabled;
+
   if (old_state == ProfilerState::NVTX) {
     return thread_event_lists();
   } else {
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 8b9e841..8d06c09 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -17,6 +17,7 @@
 #include <ctime>
 #endif
 
+#include <torch/csrc/autograd/record_function.h>
 #include <ATen/ATen.h>
 
 typedef struct CUevent_st* CUDAEventStub;
@@ -97,13 +98,8 @@ enum class EventKind : uint16_t {
 };
 
 struct TORCH_API Event final {
-  Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
-  : owned_name_(new std::string(std::move(name)))
-  , name_ptr_(owned_name_->c_str())
-  , kind_(kind)
-  , thread_id_(thread_id) { record(record_cuda); }
-  Event(EventKind kind, const char* name, uint16_t thread_id, bool record_cuda)
-  : name_ptr_(name)
+  Event(EventKind kind, StringView name, uint16_t thread_id, bool record_cuda)
+  : name_(std::move(name))
   , kind_(kind)
   , thread_id_(thread_id) { record(record_cuda); }
 
@@ -117,7 +113,7 @@ struct TORCH_API Event final {
     throw std::runtime_error("unknown EventKind");
   }
   const char* name() const {
-    return name_ptr_;
+    return name_.str();
   }
   uint16_t thread_id() const {
     return thread_id_;
@@ -134,11 +130,7 @@ struct TORCH_API Event final {
   }
 private:
   int64_t cpu_ns_ = 0; // signed to allow for negative intervals, initialized for safety.
-  // std::string is a very large object (usually around 32B),
-  // and this field is used only for user-created ranges, so
-  // it's better to save on size of Events.
-  std::unique_ptr<std::string> owned_name_;
-  const char * name_ptr_;
+  StringView name_;
   EventKind kind_;
   uint16_t thread_id_;
   int device_ = -1;
@@ -203,20 +195,6 @@ TORCH_API void mark(std::string name, bool include_cuda = true);
 TORCH_API void pushRange(std::string name);
 TORCH_API void popRange();
 
-struct TORCH_API RecordFunction {
-  explicit RecordFunction(Function* fn);
-
-  explicit RecordFunction(std::string name);
-
-  explicit RecordFunction(const char* name);
-
-  explicit RecordFunction(const char* name, int64_t current_sequence_nr);
-
-  ~RecordFunction() {
-    popRange();
-  }
-};
-
 using thread_event_lists = std::vector<std::vector<Event>>;
 // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure
 // that no autograd functions are being executed when these functions are used.
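One behavior worth noting before the new files: input packing is lazy. The
GetPackedInputsCallback handed to a RecordFunction only runs the first time a
registered callback asks for fn.inputs(), so operators pay nothing for input
capture unless an observer actually wants it. A sketch under that API (addOne
is a hypothetical caller, mirroring the new test):

  at::Tensor addOne(at::Tensor t) {
    torch::autograd::profiler::RecordFunction guard(
        "addOne",
        /*cb=*/[t]() {
          std::vector<c10::IValue> inputs;
          inputs.emplace_back(t);  // runs only if a callback calls fn.inputs()
          return inputs;
        });
    return t.add(torch::ones_like(t));
  }
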
diff --git a/torch/csrc/autograd/record_function.cpp b/torch/csrc/autograd/record_function.cpp
new file mode 100644
index 0000000..57f83e9
--- /dev/null
+++ b/torch/csrc/autograd/record_function.cpp
@@ -0,0 +1,83 @@
+#include <torch/csrc/autograd/record_function.h>
+#include <torch/csrc/autograd/function.h>
+
+namespace torch { namespace autograd { namespace profiler {
+
+namespace {
+bool has_callbacks = false;
+std::vector<RecordFunctionCallback> start_callbacks;
+std::vector<RecordFunctionCallback> end_callbacks;
+thread_local RecordFunction* thread_local_func_ = nullptr;
+}
+
+void pushCallback(RecordFunctionCallback start, RecordFunctionCallback end) {
+  start_callbacks.push_back(start);
+  end_callbacks.push_back(end);
+  has_callbacks = true;
+}
+
+void pushCallback(RecordFunctionCallback start) {
+  pushCallback(start, [](const RecordFunction&){});
+}
+
+void popCallback() {
+  if (start_callbacks.empty()) {
+    throw std::runtime_error("Empty callbacks stack");
+  }
+  start_callbacks.pop_back();
+  end_callbacks.pop_back();
+  has_callbacks = !start_callbacks.empty();
+}
+
+RecordFunction::RecordFunction(Function* fn, GetPackedInputsCallback cb) {
+  if (!has_callbacks) {
+    return;
+  }
+  fn_ = fn;
+  name_ = StringView(fn->name());
+  sequence_nr_ = fn->sequence_nr();
+  inputs_cb_ = cb;
+  processCallbacks();
+}
+
+RecordFunction::RecordFunction(
+    std::string name, int64_t sequence_nr, GetPackedInputsCallback cb) {
+  if (!has_callbacks) {
+    return;
+  }
+  name_ = StringView(std::move(name));
+  sequence_nr_ = sequence_nr;
+  inputs_cb_ = cb;
+  processCallbacks();
+}
+
+RecordFunction::RecordFunction(
+    const char* name, int64_t sequence_nr, GetPackedInputsCallback cb) {
+  if (!has_callbacks) {
+    return;
+  }
+  name_ = StringView(name);
+  sequence_nr_ = sequence_nr;
+  inputs_cb_ = cb;
+  processCallbacks();
+}
+
+void RecordFunction::processCallbacks() {
+  parent_ = thread_local_func_;
+  thread_local_func_ = this;
+
+  for (const auto& cb : start_callbacks) {
+    cb(*this);
+  }
+}
+
+RecordFunction::~RecordFunction() {
+  if (has_callbacks) {
+    for (const auto& cb : end_callbacks) {
+      cb(*this);
+    }
+    thread_local_func_ = parent_;
+  }
+}
+
+}}}
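record_function.cpp above keeps the innermost active RecordFunction in a
thread_local pointer, and record_function.h below exposes it through parent().
That is what lets a callback reconstruct the nesting path on the current
thread, as getFullName() in the new test does. A minimal sketch:

  {
    torch::autograd::profiler::RecordFunction outer("outer");
    {
      torch::autograd::profiler::RecordFunction inner("inner");
      // Here inner.parent() == &outer; a start callback firing for "inner"
      // can walk parent() to produce "outer::inner".
    }  // end callbacks fire for "inner"; thread-local restored to &outer
  }    // end callbacks fire for "outer"
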
diff --git a/torch/csrc/autograd/record_function.h b/torch/csrc/autograd/record_function.h
new file mode 100644
index 0000000..eef1a67
--- /dev/null
+++ b/torch/csrc/autograd/record_function.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <ATen/core/ivalue.h>
+#include <torch/csrc/WindowsTorchApiMacro.h>
+#include <functional>
+
+namespace torch { namespace autograd {
+
+struct Function;
+
+namespace profiler {
+
+struct TORCH_API StringView {
+  StringView() : StringView(nullptr) {}
+  explicit StringView(const char* str_ptr)
+    : owned_str_ptr_(nullptr), str_ptr_(str_ptr) {}
+  explicit StringView(std::string str)
+    : owned_str_ptr_(std::make_shared<std::string>(std::move(str))),
+      str_ptr_(owned_str_ptr_->c_str()) {}
+
+  inline const char* str() const {
+    return str_ptr_;
+  }
+ private:
+  std::shared_ptr<std::string> owned_str_ptr_;
+  const char* str_ptr_;
+};
+
+using GetPackedInputsCallback = std::function<std::vector<c10::IValue>()>;
+
+struct TORCH_API RecordFunction {
+  explicit RecordFunction(Function* fn, GetPackedInputsCallback cb = nullptr);
+
+  explicit RecordFunction(
+      std::string name,
+      int64_t current_sequence_nr = -1,
+      GetPackedInputsCallback cb = nullptr);
+
+  explicit RecordFunction(
+      const char* name,
+      int64_t current_sequence_nr = -1,
+      GetPackedInputsCallback cb = nullptr);
+
+  explicit RecordFunction(
+      std::string name,
+      GetPackedInputsCallback cb) : RecordFunction(name, -1, cb) {}
+
+  explicit RecordFunction(
+      const char* name,
+      GetPackedInputsCallback cb) : RecordFunction(name, -1, cb) {}
+
+  virtual ~RecordFunction();
+
+  inline Function* func() const {
+    return fn_;
+  }
+
+  inline const StringView& name() const {
+    return name_;
+  }
+
+  inline int64_t seqNr() const {
+    return sequence_nr_;
+  }
+
+  const std::vector<c10::IValue>& inputs() const {
+    if (inputs_cb_ && !inputs_initialized_) {
+      inputs_ = inputs_cb_();
+      inputs_initialized_ = true;
+    }
+    return inputs_;
+  }
+
+  inline const RecordFunction* parent() const {
+    return parent_;
+  }
+
+ private:
+  void processCallbacks();
+
+  Function* fn_ = nullptr;
+  StringView name_;
+  int64_t sequence_nr_ = -1;
+
+  RecordFunction* parent_ = nullptr;
+
+  GetPackedInputsCallback inputs_cb_ = nullptr;
+  mutable bool inputs_initialized_ = false;
+  // initialized lazily by inputs_cb_
+  mutable std::vector<c10::IValue> inputs_;
+};
+
+// WARNING: calls to pushCallback/popCallback are not thread safe
+// and must not overlap with other code execution
+using RecordFunctionCallback = std::function<void(const RecordFunction&)>;
+TORCH_API void pushCallback(RecordFunctionCallback, RecordFunctionCallback);
+TORCH_API void pushCallback(RecordFunctionCallback);
+TORCH_API void popCallback();
+
+} // namespace profiler
+}} // namespace torch::autograd
-- 
2.7.4