[Pytorch Profiler] Introduce scopes to enableProfiler (#62417)
authorKimish Patel <kimishpatel@fb.com>
Sat, 14 Aug 2021 04:37:57 +0000 (21:37 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Sat, 14 Aug 2021 04:40:15 +0000 (21:40 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62417

This diff adds an option to make enableProfiler enable callbacks only
for certain RecordScopes.
Why?
Profiling has some overhead when we repeatedly execute callbacks for
alls copes. On mobile side when we often have small quantized models
this overhead can be large. We observed that by only profiling top level
op and skipping profiling of other atend ops called within we can limit
this overhead. For example, instead of profling at::conv2d -> at::convolution ->
at::convolution_ and further more if ops like transpose etc. are called,
skipping profiling of those. Of course this limits the visibility, but
at the least this way we get a choice.

Test Plan: Imported from OSS

Reviewed By: ilia-cher

Differential Revision: D29993659

fbshipit-source-id: 852d3ae7822f0d94dc6e507bd4019b60d488ef69

test/cpp/jit/test_misc.cpp
torch/csrc/autograd/init.cpp
torch/csrc/autograd/profiler_kineto.cpp
torch/csrc/autograd/profiler_kineto.h

index 5ee8816..8ecedd3 100644 (file)
@@ -2505,6 +2505,76 @@ TEST(RecordDebugHandles, Basic) {
   ASSERT_EQ(my_events, 2);
 }
 
+TEST(RecordDebugHandles, ScopedCallbacks) {
+  // Enable the profiler in this thread
+  torch::autograd::profiler::prepareProfiler(
+      torch::autograd::profiler::ProfilerConfig(
+          torch::autograd::profiler::ProfilerState::KINETO, false, false),
+      {torch::autograd::profiler::ActivityType::CPU});
+  torch::autograd::profiler::enableProfiler(
+      torch::autograd::profiler::ProfilerConfig(
+          torch::autograd::profiler::ProfilerState::KINETO, false, false),
+      {torch::autograd::profiler::ActivityType::CPU});
+
+  {
+    auto a = torch::rand({128, 128});
+    auto b = torch::rand({128, 128});
+    auto c = a + b;
+  }
+  auto profiler_results_ptr = torch::autograd::profiler::disableProfiler();
+  ASSERT_TRUE(profiler_results_ptr->events().size() > 0);
+
+  // Enable the profiler in this thread
+  torch::autograd::profiler::prepareProfiler(
+      torch::autograd::profiler::ProfilerConfig(
+          torch::autograd::profiler::ProfilerState::KINETO, false, false),
+      {torch::autograd::profiler::ActivityType::CPU});
+  torch::autograd::profiler::enableProfiler(
+      torch::autograd::profiler::ProfilerConfig(
+          torch::autograd::profiler::ProfilerState::KINETO, false, false),
+      {torch::autograd::profiler::ActivityType::CPU},
+      {at::RecordScope::USER_SCOPE});
+  {
+    auto a = torch::rand({128, 128});
+    auto b = torch::rand({128, 128});
+    auto c = a + b;
+  }
+  profiler_results_ptr = torch::autograd::profiler::disableProfiler();
+  ASSERT_TRUE(profiler_results_ptr->events().size() == 0);
+
+  torch::autograd::profiler::prepareProfiler(
+      torch::autograd::profiler::ProfilerConfig(
+          torch::autograd::profiler::ProfilerState::KINETO, false, false),
+      {torch::autograd::profiler::ActivityType::CPU});
+  torch::autograd::profiler::enableProfiler(
+      torch::autograd::profiler::ProfilerConfig(
+          torch::autograd::profiler::ProfilerState::KINETO, false, false),
+      {torch::autograd::profiler::ActivityType::CPU},
+      {at::RecordScope::USER_SCOPE});
+  {
+    RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
+    auto a = torch::rand({128, 128});
+    auto b = torch::rand({128, 128});
+    auto c = a + b;
+  }
+  {
+    RECORD_USER_SCOPE_WITH_INPUTS("not_my_function", {});
+    auto a = torch::rand({128, 128});
+    auto b = torch::rand({128, 128});
+    auto c = a + b;
+  }
+  profiler_results_ptr = torch::autograd::profiler::disableProfiler();
+  const auto& kineto_events = profiler_results_ptr->events();
+  for (const auto& e : kineto_events) {
+    if (e.name() == "my_function") {
+      ASSERT_EQ(e.debugHandle(), 42);
+    } else if (e.name() == "not_my_function") {
+      ASSERT_EQ(e.debugHandle(), -1);
+    }
+  }
+  ASSERT_TRUE(profiler_results_ptr->events().size() == 2);
+}
+
 TEST(IValueKWargsTest, Basic) {
   const auto text = R"(
     def foo(a : int, b : int, c : int = 4):
index dc51241..2eacbf1 100644 (file)
@@ -18,6 +18,7 @@
 #include <c10/core/ScalarType.h>
 
 #include <set>
+#include <unordered_set>
 
 struct DisableTorchDispatch {
   DisableTorchDispatch() : guard_(c10::DispatchKey::Python) {
@@ -223,7 +224,11 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
 #endif // USE_KINETO
     ;
 
-  m.def("_enable_profiler", enableProfiler);
+  m.def("_enable_profiler",
+        &enableProfiler,
+        py::arg("config"),
+        py::arg("activities"),
+        py::arg("scopes") = std::unordered_set<at::RecordScope>());
   m.def("_disable_profiler", disableProfiler);
   m.def("_prepare_profiler", prepareProfiler);
 
index e92461a..526813d 100644 (file)
@@ -287,7 +287,7 @@ KinetoThreadLocalState* getProfilerTLSState() {
   return static_cast<KinetoThreadLocalState*>(state);
 }
 
-void pushProfilingCallbacks() {
+void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
   auto state_ptr = getProfilerTLSState();
   TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set");
   auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback(
@@ -388,7 +388,8 @@ void pushProfilingCallbacks() {
         }
       })
     .needsInputs(state_ptr->config().report_input_shapes)
-    .needsIds(true));
+    .needsIds(true)
+    .scopes(scopes));
   state_ptr->setCallbackHandle(handle);
 }
 
@@ -497,7 +498,8 @@ void prepareProfiler(
 
 void enableProfiler(
     const ProfilerConfig& config,
-    const std::set<ActivityType>& activities) {
+    const std::set<ActivityType>& activities,
+    const std::unordered_set<at::RecordScope>& scopes) {
   if (config.state != ProfilerState::NVTX) {
     TORCH_CHECK(
         config.state == ProfilerState::KINETO ||
@@ -514,7 +516,7 @@ void enableProfiler(
   c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state);
 
   if (activities.count(ActivityType::CPU) || config.state == ProfilerState::NVTX) {
-    pushProfilingCallbacks();
+    pushProfilingCallbacks(scopes);
   }
 
 #ifdef USE_KINETO
index 8a878a0..310554a 100644 (file)
@@ -331,7 +331,8 @@ struct TORCH_API ProfilerResult {
 
 TORCH_API void enableProfiler(
     const ProfilerConfig& config,
-    const std::set<ActivityType>& activities);
+    const std::set<ActivityType>& activities,
+    const std::unordered_set<at::RecordScope>& scopes = {});
 
 TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();