[Pytorch Profiler] Add support for adding module hierarchy to (#61792)

author Kimish Patel <kimishpatel@fb.com>

Sat, 14 Aug 2021 04:37:57 +0000 (21:37 -0700)

committer Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>

Sat, 14 Aug 2021 04:39:10 +0000 (21:39 -0700)
author Kimish Patel <kimishpatel@fb.com>
Sat, 14 Aug 2021 04:37:57 +0000 (21:37 -0700)
committer Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Sat, 14 Aug 2021 04:39:10 +0000 (21:39 -0700)
diff --git a/test/cpp/c10d/ProcessGroupNCCLTest.cpp b/test/cpp/c10d/ProcessGroupNCCLTest.cpp

index 2eed50c..c8ef529 100644 (file)
--- a/test/cpp/c10d/ProcessGroupNCCLTest.cpp
+++ b/test/cpp/c10d/ProcessGroupNCCLTest.cpp
@@ -177,7 +177,7 @@ class AllreduceNCCLTest : public NCCLTest {
      // Make sure enabling profile does not make any issue. Note, in single
      // process multi-device mode we do not expect any events be populated for
      // collective operations, since profiling for that mode is not supported.
-    enableProfilerLegacy({ProfilerState::CPU});
+    enableProfilerLegacy(ProfilerConfig(ProfilerState::CPU));
      auto results = pg_->allreduce(tensors_);
      disableProfilerLegacy();
      return results;
diff --git a/test/test_profiler.py b/test/test_profiler.py

index 8770639..28d9671 100644 (file)
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -319,6 +319,70 @@ class TestProfiler(TestCase):
                  ]
              )
  
+    @unittest.skipIf(not kineto_available(), "Kineto is required")
+    def test_module_hierarchy(self):
+        class A(nn.Module):
+            def __init__(self):
+                super(A, self).__init__()
+
+            def my_new_method(self, x):
+                return x * 3
+
+            def forward_impl_(self, x, y):
+                return self.my_new_method(x) + y
+
+            def forward(self, x, y):
+                y = y - 2
+                return self.forward_impl_(x, y)
+
+        class B(nn.Module):
+            def __init__(self):
+                super(B, self).__init__()
+
+            def forward(self, x):
+                return x + 2
+
+        class C(nn.Module):
+            def __init__(self):
+                super(C, self).__init__()
+                self.A0 = A()
+                self.B0 = B()
+
+            def call_b(self, x):
+                return self.B0.forward(x)
+
+            def forward(self, x, y):
+                return self.A0.forward(x, y) + self.call_b(x)
+
+        model = C()
+        model = torch.jit.script(model)
+        input_a = torch.rand(128, 128)
+        input_b = torch.rand(128, 128)
+        op_to_module_hierarchy = {}
+        op_to_module_hierarchy["aten::sub"] = ["TOP(C)::forward.A0(A)::forward."]
+        op_to_module_hierarchy["aten::mul"] = [
+            "TOP(C)::forward.A0(A)::forward.SELF(A)::forward_impl_.SELF(A)::my_new_method."]
+        op_to_module_hierarchy["aten::add"] = [
+            "TOP(C)::forward.A0(A)::forward.SELF(A)::forward_impl_.",
+            "TOP(C)::forward.SELF(C)::call_b.B0(B)::forward.", "TOP(C)::forward."]
+        with TemporaryFileName(mode="w+") as fname:
+            with profile(activities=[torch.profiler.ProfilerActivity.CPU], with_modules=True,) as prof:
+                model(input_a, input_b)
+            prof.export_chrome_trace(fname)
+            with io.open(fname, 'r') as f:
+                trace = json.load(f)
+                assert "traceEvents" in trace
+                events = trace["traceEvents"]
+                found_memory_events = False
+                for evt in events:
+                    assert "name" in evt
+                    if "args" in evt:
+                        op_name = evt["name"]
+                        if "Module Hierarchy" in evt["args"]:
+                            hierarchy = evt["args"]["Module Hierarchy"]
+                            if op_name in op_to_module_hierarchy:
+                                assert hierarchy in op_to_module_hierarchy[op_name]
+
      def test_high_level_trace(self):
          """Checks that python side high level events are recorded.
          """
diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi

index cd9b0da..6468eb5 100644 (file)
--- a/torch/_C/_autograd.pyi
+++ b/torch/_C/_autograd.pyi
@@ -40,7 +40,8 @@ class ProfilerConfig:
          report_input_shapes: bool,
          profile_memory: bool,
          with_stack: bool,
-        with_flops: bool
+        with_flops: bool,
+        with_modules: bool
      ) -> None: ...
      ...
  
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py

index ae5e090..ab95fdb 100644 (file)
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -71,6 +71,13 @@ class profile(object):
  
          with_stack (bool, optional): record source information (file and line number) for the ops.
  
+        with_modules (bool): record module hierarchy (including function names)
+            corresponding to the callstack of the op. e.g. If module A's forward calls
+            module B's forward, which contains an aten::add op,
+            then aten::add's module hierarchy is A.B.
+            Note that this support exists, at the moment, only for TorchScript models
+            and not eager mode models.
+
          use_kineto (bool, optional): experimental, enable profiling with Kineto profiler.
  
          use_cpu (bool, optional): profile CPU events; setting to ``False`` requires
@@ -118,6 +125,7 @@ class profile(object):
              with_flops=False,
              profile_memory=False,
              with_stack=False,
+            with_modules=False,
              use_kineto=False,
              use_cpu=True):
          self.enabled: bool = enabled
@@ -131,6 +139,7 @@ class profile(object):
          self.record_shapes |= self.with_flops
          self.profile_memory = profile_memory
          self.with_stack = with_stack
+        self.with_modules = with_modules
          self.use_cpu = use_cpu
          self.kineto_results: Optional[_ProfilerResult] = None
  
@@ -165,7 +174,8 @@ class profile(object):
              self.record_shapes,
              self.profile_memory,
              self.with_stack,
-            self.with_flops)
+            self.with_flops,
+            self.with_modules)
  
      def __enter__(self):
          if not self.enabled:
@@ -557,6 +567,7 @@ class emit_nvtx(object):
                  self.record_shapes,
                  False,
                  False,
+                False,
                  False),
              set()
          )
diff --git a/torch/autograd/profiler_legacy.py b/torch/autograd/profiler_legacy.py

index 623e13a..445decf 100644 (file)
--- a/torch/autograd/profiler_legacy.py
+++ b/torch/autograd/profiler_legacy.py
@@ -24,7 +24,8 @@ class profile(object):
              record_shapes=False,
              with_flops=False,
              profile_memory=False,
-            with_stack=False):
+            with_stack=False,
+            with_modules=False):
          self.enabled: bool = enabled
          if not self.enabled:
              return
@@ -36,6 +37,7 @@ class profile(object):
          self.record_shapes |= self.with_flops
          self.profile_memory = profile_memory
          self.with_stack = with_stack
+        self.with_modules = with_modules
  
          if self.use_cuda and not torch.cuda.is_available():
              warn("CUDA is not available, disabling CUDA profiling")
@@ -52,7 +54,8 @@ class profile(object):
              self.record_shapes,
              self.profile_memory,
              self.with_stack,
-            self.with_flops)
+            self.with_flops,
+            self.with_modules)
  
      def __enter__(self):
          if not self.enabled:
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp

index ffe7e83..dc51241 100644 (file)
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -73,7 +73,13 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
        .value("CUDA", ActivityType::CUDA);
  
    py::class_<ProfilerConfig>(m, "ProfilerConfig")
-      .def(py::init<ProfilerState, bool, bool, bool, bool>());
+      .def(py::init<ProfilerState,
+          bool, /* record_input_shapes */
+          bool, /* profile_memory */
+          bool, /* with_stack */
+          bool, /* with_flops */
+          bool  /* with_modules */
+          >());
  
    py::class_<LegacyEvent>(m, "ProfilerEvent")
        .def("kind", &LegacyEvent::kindStr)
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp

index b9af532..3b5b511 100644 (file)
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -43,7 +43,7 @@ inline int64_t getTimeUs() {
  }
  
  std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes);
-std::string stacksToStr(const std::vector<std::string>& stacks);
+std::string stacksToStr(const std::vector<std::string>& stacks, const char* delim);
  std::string dtypesToStr(const std::vector<std::string>& types);
  std::vector<std::string> inputTypes(const at::RecordFunction& fn);
  
@@ -110,6 +110,9 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState {
        if (ctx->stack && !ctx->stack->empty()) {
          kineto_events_.back().stack(*ctx->stack);
        }
+      if (ctx->module_hierarchy) {
+        kineto_events_.back().moduleHierarchy(*ctx->module_hierarchy);
+      }
        if (ctx->extraArgs && !ctx->extraArgs->empty()) {
          kineto_events_.back().flops(computeFlops(std::string(fn.name().str()), *ctx->extraArgs));
        }
@@ -228,7 +231,10 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState {
          activity.addMetadata("Input Dims", shapesToStr(kineto_event.shapes()));
        }
        if (kineto_event.hasStack()) {
-        activity.addMetadata("Call stack", stacksToStr(kineto_event.stack()));
+        activity.addMetadata("Call stack", stacksToStr(kineto_event.stack(), ";"));
+      }
+      if (kineto_event.hasModuleHierarchy()) {
+        activity.addMetadata("Module Hierarchy", stacksToStr(kineto_event.moduleHierarchy(), "."));
        }
        if (kineto_event.hasTypes()) {
          activity.addMetadata("Input type", dtypesToStr(kineto_event.dtypes()));
@@ -326,6 +332,10 @@ void pushProfilingCallbacks() {
              }
              ctx_ptr->stack = callstackStr(cs);
            }
+          if (config.with_modules &&
+              fn.scope() != at::RecordScope::BACKWARD_FUNCTION) {
+            ctx_ptr->module_hierarchy = jit::currentModuleHierarchy();
+          }
    #endif
            if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
              try {
@@ -416,12 +426,12 @@ std::string dtypesToStr(const std::vector<std::string>& types) {
    }
  }
  
-std::string stacksToStr(const std::vector<std::string>& stacks) {
+std::string stacksToStr(const std::vector<std::string>& stacks, const char* delim) {
    std::ostringstream oss;
    std::transform(
        stacks.begin(),
        stacks.end(),
-      std::ostream_iterator<std::string>(oss, ";"),
+      std::ostream_iterator<std::string>(oss, delim),
        [](std::string s) -> std::string {
  #ifdef _WIN32
          // replace the windows backslash with forward slash
@@ -430,7 +440,6 @@ std::string stacksToStr(const std::vector<std::string>& stacks) {
          return s;
        });
    auto rc = oss.str();
-  rc.pop_back();
    return "\"" + rc + "\"";
  }
  
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h

index a948ad3..8c14ae4 100644 (file)
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -1,6 +1,7 @@
  #pragma once
  
  #include <torch/csrc/autograd/profiler_legacy.h>
+#include <vector>
  
  #ifdef USE_KINETO
  // skip Kineto dependency on mobile
@@ -38,6 +39,7 @@ struct KinetoObserverContext : public at::ObserverContext {
    uint64_t fwdThreadId;
    uint8_t recFunScope;
    c10::optional<std::vector<std::string>> stack;
+  c10::optional<std::vector<std::string>> module_hierarchy;
    // Extra arguments for computing op flops
    c10::optional<std::unordered_map<std::string, c10::IValue>> extraArgs;
    CUDAEventStub cuda_event_start_ = nullptr;
@@ -147,6 +149,19 @@ struct TORCH_API KinetoEvent {
      return *this;
    }
  
+  bool hasModuleHierarchy() const {
+    return module_hierarchy_ != c10::nullopt;
+  }
+
+  const std::vector<std::string>& moduleHierarchy() const {
+    return *module_hierarchy_;
+  }
+
+  KinetoEvent& moduleHierarchy(const std::vector<std::string>& module_hierarchy) {
+    module_hierarchy_ = module_hierarchy;
+    return *this;
+  }
+
    std::string name() const {
      return name_;
    }
@@ -248,6 +263,7 @@ struct TORCH_API KinetoEvent {
    uint8_t activity_type_ = 0;
    c10::optional<std::vector<std::vector<int64_t>>> shapes_;
    c10::optional<std::vector<std::string>> stack_;
+  c10::optional<std::vector<std::string>> module_hierarchy_;
    c10::optional<std::vector<std::string>> dtypes_;
    uint64_t flops_ = 0;
  
diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h

index 4c751e8..363a42d 100644 (file)
--- a/torch/csrc/autograd/profiler_legacy.h
+++ b/torch/csrc/autograd/profiler_legacy.h
@@ -408,23 +408,26 @@ enum class C10_API_ENUM ProfilerState {
  };
  
  struct TORCH_API ProfilerConfig {
-  ProfilerConfig(
+  explicit ProfilerConfig(
        ProfilerState state,
        bool report_input_shapes = false,
        bool profile_memory = false,
        bool with_stack = false,
-      bool with_flops = false)
+      bool with_flops = false,
+      bool with_modules = false)
        : state(state),
          report_input_shapes(report_input_shapes),
          profile_memory(profile_memory),
          with_stack(with_stack),
-        with_flops(with_flops) {}
+        with_flops(with_flops),
+        with_modules(with_modules) {}
    ~ProfilerConfig() = default;
    ProfilerState state;
    bool report_input_shapes;
    bool profile_memory;
    bool with_stack;
    bool with_flops;
+  bool with_modules;
  
    // Returns IValues corresponding to ProfilerConfig struct, to be used for
    // serialization.
diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp

index 79e6267..a095e4a 100644 (file)
--- a/torch/csrc/jit/runtime/interpreter.cpp
+++ b/torch/csrc/jit/runtime/interpreter.cpp
@@ -25,6 +25,7 @@
  #include <torch/csrc/jit/runtime/profiling_record.h>
  #include <torch/csrc/jit/runtime/script_profile.h>
  #include <torch/csrc/jit/runtime/vararg_functions.h>
+#include <string>
  
  #ifdef USE_RPC
  #include <torch/csrc/distributed/autograd/context/container.h>
@@ -784,6 +785,105 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
    }
  
   public:
+  // One way to avoid overhead of forming string would be to return
+  // a vector of frame.function, i.e. CodeImpl*
+  // This is not exactly clean, as it will expose internal details of the
+  // interpreter. But this way we hold onto graph/node and Function and
+  // we can create module hierarchy string for each event in autograd
+  // profiler at the end, when consolidating events.
+  // At the moment the overhead does not seem exorbitantly large.
+  // Another option would be to return vector of (string, InlinedCallstackPtrs)
+  // where string would contain function name and typename of self.
+  // Format of the returned vector of strings:
+  // For each frame, the corresponding module name, type and function name
+  // are in following format:
+  // <module-instance-name>(module type)::<function-name>
+  // Special keys for module-instance-name:
+  //   - TOP: for top level module
+  //   - SELF: When method/function of the frame is associated with
+  //           previous frame's module instance
+  //   - INSTANCE_NAME_UNKNOWN: instance name cannot be figured out
+  //   - CALL_FUNCTION: call to free function
+  std::vector<std::string> moduleHierarchy() const {
+    std::vector<std::string> module_function_list;
+    std::string module_hierarchy("TOP");
+    for (size_t i = 0; i < frames.size(); ++i) {
+      const Frame& frame = frames[i];
+      std::string fn_name = frame.function->function_name_;
+      // For each frame, type of the class with which the function is
+      // associated, is queried here. And the type name is added to
+      // module hierarchy.
+      const auto& g = frame.function->graph_;
+      std::string g_self_type;
+      if (g && g->inputs().size() > 0) {
+        const auto& g_self_type_ptr =
+            g->inputs()[0]->type()->cast<c10::ClassType>();
+        if (g_self_type_ptr) {
+          g_self_type = g_self_type_ptr->name()->qualifiedName();
+          g_self_type = g_self_type.substr(g_self_type.find_last_of('.') + 1);
+        }
+      }
+      module_hierarchy.append("(")
+          .append(g_self_type)
+          .append(")::")
+          .append(fn_name);
+      module_function_list.emplace_back(std::move(module_hierarchy));
+
+      size_t pc = frame.pc;
+      // CALL nodes have already advanced the pc, so
+      // undo that to report the call node
+      if (i + 1 < frames.size()) {
+        --pc;
+      }
+
+      Node* node = frame.function->instructions_source_[pc];
+      if (node->callstack()) {
+        for (const auto& p : (*node->callstack())->vec()) {
+          fn_name = std::get<0>(p)->name();
+          const auto& opt_module_info = std::get<2>(p);
+          if (opt_module_info.has_value()) {
+            const auto& module_instance_info = opt_module_info.value();
+            module_hierarchy = utils::get_module_info(module_instance_info);
+            module_hierarchy.append("::").append(fn_name);
+          } else {
+            // This is likely a call to free function, not associated with
+            // any class
+            module_hierarchy = "::";
+            module_hierarchy.append(fn_name);
+          }
+          module_function_list.emplace_back(std::move(module_hierarchy));
+        }
+      }
+
+      module_hierarchy = std::string();
+      // If this node is of type callMethod then the following frame
+      // will contain the op being executed.
+      // For such callMethod node, we add the object instance name
+      // associated with it, since the following frame will not have it.
+      if (node->kind() == prim::CallMethod) {
+        std::string class_instance_name;
+        if (node->input(0)->node()->kind() == prim::GetAttr) {
+          class_instance_name = node->input(0)->node()->s(attr::name);
+        } else if (
+            node->owningGraph()->inputs().size() > 0 &&
+            node->input(0) == node->owningGraph()->inputs()[0]) {
+          class_instance_name = "SELF";
+        } else {
+          class_instance_name = "INSTANCE_NAME_UNKNOWN";
+        }
+        module_hierarchy = std::move(class_instance_name);
+      } else if (node->kind() == prim::CallFunction) {
+        auto function_constant = node->input(0)->node();
+        auto fun_type =
+            function_constant->output()->type()->expect<FunctionType>();
+        auto fun_name = fun_type->function()->name();
+        module_hierarchy = "CALL_FUNCTION::";
+        module_hierarchy.append(fun_name);
+      }
+    }
+    return module_function_list;
+  }
+
    std::vector<StackEntry> callstack() const {
      std::vector<StackEntry> entries;
      for (const auto i : c10::irange(frames.size())) {
@@ -848,6 +948,13 @@ std::vector<StackEntry> currentCallstack() {
    return std::vector<StackEntry>();
  }
  
+std::vector<std::string> currentModuleHierarchy() {
+  if (tls_int_state_ptr_) {
+    return tls_int_state_ptr_->moduleHierarchy();
+  }
+  return std::vector<std::string>();
+}
+
  std::ostream& operator<<(std::ostream& out, const Code& code) {
    out << *code.pImpl->graph_ << "\n";
    code.pImpl->dump(out);
diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h

index 4728177..80720ea 100644 (file)
--- a/torch/csrc/jit/runtime/interpreter.h
+++ b/torch/csrc/jit/runtime/interpreter.h
@@ -154,6 +154,7 @@ TORCH_API at::TensorTypePtr tensorTypeInCurrentExecutionContext(
  
  // current (TLS) TorchScript interpreter callstack
  TORCH_API std::vector<StackEntry> currentCallstack();
+TORCH_API std::vector<std::string> currentModuleHierarchy();
  
  } // namespace jit
  } // namespace torch
diff --git a/torch/distributed/rpc/server_process_global_profiler.py b/torch/distributed/rpc/server_process_global_profiler.py

index 02d633e..4634313 100644 (file)
--- a/torch/distributed/rpc/server_process_global_profiler.py
+++ b/torch/distributed/rpc/server_process_global_profiler.py
@@ -117,6 +117,7 @@ class _server_process_global_profile(profile):
              self.record_shapes,
              self.profile_memory,
              False,
+            False,
              False)
          _enable_server_process_global_profiler(profiler_config)
          return self
diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py

index f77405d..20bdaa3 100644 (file)
--- a/torch/profiler/profiler.py
+++ b/torch/profiler/profiler.py
@@ -116,6 +116,12 @@ class profile(object):
          with_stack (bool): record source information (file and line number) for the ops.
          with_flops (bool): use formula to estimate the FLOPS of specific operators
              (matrix multiplication and 2D convolution).
+        with_modules (bool): record module hierarchy (including function names)
+            corresponding to the callstack of the op. e.g. If module A's forward calls
+            module B's forward, which contains an aten::add op,
+            then aten::add's module hierarchy is A.B.
+            Note that this support exists, at the moment, only for TorchScript models
+            and not eager mode models.
          use_cuda (bool):
              .. deprecated:: 1.8.1
                  use ``activities`` instead.
@@ -210,6 +216,7 @@ class profile(object):
              profile_memory: bool = False,
              with_stack: bool = False,
              with_flops: bool = False,
+            with_modules: bool = False,
              # deprecated:
              use_cuda: Optional[bool] = None):
          if activities:
@@ -238,6 +245,7 @@ class profile(object):
          self.with_flops = with_flops
          self.profile_memory = profile_memory
          self.with_stack = with_stack
+        self.with_modules = with_modules
          self.step_num = 0
          self.current_action = self.schedule(self.step_num)
          self.profiler: Optional[prof.profile] = None
@@ -426,6 +434,7 @@ class profile(object):
              with_flops=self.with_flops,
              profile_memory=self.profile_memory,
              with_stack=self.with_stack,
+            with_modules=self.with_modules,
              use_kineto=True,
          )
          self.profiler._prepare_trace()
author	Kimish Patel <kimishpatel@fb.com>
	Sat, 14 Aug 2021 04:37:57 +0000 (21:37 -0700)
committer	Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
	Sat, 14 Aug 2021 04:39:10 +0000 (21:39 -0700)
test/cpp/c10d/ProcessGroupNCCLTest.cpp		patch \| blob \| history
test/test_profiler.py		patch \| blob \| history
torch/_C/_autograd.pyi		patch \| blob \| history
torch/autograd/profiler.py		patch \| blob \| history
torch/autograd/profiler_legacy.py		patch \| blob \| history
torch/csrc/autograd/init.cpp		patch \| blob \| history
torch/csrc/autograd/profiler_kineto.cpp		patch \| blob \| history
torch/csrc/autograd/profiler_kineto.h		patch \| blob \| history
torch/csrc/autograd/profiler_legacy.h		patch \| blob \| history
torch/csrc/jit/runtime/interpreter.cpp		patch \| blob \| history
torch/csrc/jit/runtime/interpreter.h		patch \| blob \| history
torch/distributed/rpc/server_process_global_profiler.py		patch \| blob \| history
torch/profiler/profiler.py		patch \| blob \| history