[Pytorch Edge] Enable kineto profiler on mobile via EdgeKinetoProfiler (#62419)
authorKimish Patel <kimishpatel@fb.com>
Sat, 14 Aug 2021 04:37:57 +0000 (21:37 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Sat, 14 Aug 2021 04:40:19 +0000 (21:40 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62419

This diff adds support for a CPU-only Kineto profiler on mobile, thus
enabling chrome trace generation on mobile. This brings the C++ API for
mobile profiling on par with TorchScript.
This is done via:
1. Utilizing debug handle annotations in KinetoEvent.
2. Adding post-processing capability, via callbacks, to
KinetoThreadLocalState.
3. Creating a new RAII-style profiler, KinetoEdgeCPUProfiler, which can be
used in the surrounding scope of model execution. This will write the
chrome trace to the location specified in the profiler constructor.
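
For reference, a minimal usage sketch of the resulting C++ API (adapted
from the new test_mobile_profiler.cpp below; model path and inputs are
illustrative):

  #include <torch/csrc/jit/mobile/import.h>
  #include <torch/csrc/jit/mobile/profiler_edge.h>

  using namespace torch::jit;

  mobile::Module m = _load_for_mobile("to_be_profiled_module.ptl");
  std::vector<c10::IValue> inputs{at::rand({64, 64}), at::rand({64, 64})};
  {
    // RAII: the chrome trace is written to the given file when the
    // profiler goes out of scope.
    mobile::KinetoEdgeCPUProfiler profiler(
        m,
        "/tmp/test_trace.trace",
        false, // record input shapes
        false, // profile memory
        true,  // record callstack
        false, // record flops
        true); // record module hierarchy
    m.forward(inputs);
  } // profiler destructor saves the trace here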

Test Plan:
MobileProfiler.ModuleHierarchy

Imported from OSS

Reviewed By: raziel

Differential Revision: D29993660

fbshipit-source-id: 0b44f52f9e9c5f5aff81ebbd9273c254c3c03299

24 files changed:
.jenkins/pytorch/build.sh
CMakeLists.txt
android/common.sh
android/pytorch_android/build.gradle
aten/src/ATen/record_function.h
caffe2/CMakeLists.txt
cmake/Dependencies.cmake
scripts/build_ios.sh
test/cpp/jit/test_backend.cpp
test/cpp/jit/test_lite_interpreter.cpp
test/cpp/jit/test_misc.cpp
test/cpp/lite_interpreter_runtime/CMakeLists.txt
test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp
test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp [new file with mode: 0644]
test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl [new file with mode: 0644]
tools/build_variables.bzl
torch/csrc/autograd/profiler_kineto.cpp
torch/csrc/autograd/profiler_kineto.h
torch/csrc/jit/mobile/debug_info.cpp
torch/csrc/jit/mobile/interpreter.cpp
torch/csrc/jit/mobile/module.cpp
torch/csrc/jit/mobile/module.h
torch/csrc/jit/mobile/profiler_edge.cpp [new file with mode: 0644]
torch/csrc/jit/mobile/profiler_edge.h [new file with mode: 0644]

index f2c279b..f6ac52a 100755 (executable)
@@ -130,6 +130,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
   if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
     build_args+=("-DUSE_VULKAN=ON")
   fi
+  build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
   exec ./scripts/build_android.sh "${build_args[@]}" "$@"
 fi
 
index 5ee0e75..717de6e 100644 (file)
@@ -266,6 +266,7 @@ if(NOT DEFINED USE_VULKAN)
 endif()
 
 option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON)
+option(USE_LITE_INTERPRETER_PROFILER "Enable " ON)
 option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
 option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
 option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation as opposed to build-time (needs libshaderc)" OFF)
@@ -687,6 +688,10 @@ if(USE_SOURCE_DEBUG_ON_MOBILE)
   string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
 endif()
 
+if(USE_LITE_INTERPRETER_PROFILER)
+  string(APPEND CMAKE_CXX_FLAGS " -DEDGE_PROFILER_USE_KINETO")
+endif()
+
 # ---[ Allowlist file if allowlist is specified
 include(cmake/Allowlist.cmake)
 
index 9f5768d..ab1cb5f 100644 (file)
@@ -29,7 +29,7 @@ check_gradle() {
 }
 
 parse_abis_list() {
-  ABIS_LIST="armeabi-v7a,arm64-v8a,x86,x86_64"
+  ABIS_LIST="x86"
   CUSTOM_ABIS_LIST=false
   if [ $# -gt 0 ]; then
     ABIS_LIST=$1
@@ -59,7 +59,8 @@ build_android() {
     ANDROID_ABI="$abi" \
       BUILD_ROOT="$ANDROID_BUILD_ROOT" \
       "$PYTORCH_DIR/scripts/build_android.sh" \
-      -DANDROID_CCACHE="$(which ccache)"
+      -DANDROID_CCACHE="$(which ccache)" \
+      -DUSE_LITE_INTERPRETER_PROFILER="OFF"
 
     echo "$abi build output lib,include at $ANDROID_BUILD_ROOT/install"
     ln -s "$ANDROID_BUILD_ROOT/install/lib" "$LIB_DIR/$abi"
index f9a7559..a65c0ff 100644 (file)
@@ -18,9 +18,9 @@ android {
         externalNativeBuild {
             cmake {
               if(System.env.BUILD_LITE_INTERPRETER == '0') {
-                arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=OFF"
+                arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=OFF", "-DUSE_LITE_INTERPRETER_PROFILER=OFF"
               } else {
-                arguments "-DANDROID_STL=c++_shared"
+                arguments "-DANDROID_STL=c++_shared", "-DUSE_LITE_INTERPRETER_PROFILER=OFF"
               }
             }
         }
index 80c3ca9..f73df9c 100644 (file)
@@ -27,6 +27,8 @@ enum class C10_API_ENUM RecordScope : uint8_t {
   TORCHSCRIPT_FUNCTION,
   // Kernel Function dtype Tag
   KERNEL_FUNCTION_DTYPE,
+  // Scope for ops executed by the lite interpreter
+  LITE_INTERPRETER,
   // User defined scope (e.g. with record_function())
   USER_SCOPE,
   NUM_SCOPES, // must be the last in the list
@@ -502,11 +504,11 @@ class TORCH_API RecordFunctionCallback {
       }                                             \
     }
 
-// Helper macros to record user_scope events with debug handles
-#define RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(         \
-    fn, debug_handle, inputs)                                   \
-    RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS(                  \
-        at::RecordScope::USER_SCOPE, fn, debug_handle, inputs)
+// Helper macros to record LITE_INTERPRETER scope events with debug handles
+#define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(             \
+    fn, debug_handle, inputs)                                       \
+    RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS(                      \
+        at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs)
 
 // Notes:
 //  - two types of callbacks are provided: thread local and global
index 4ab9ef2..83048ce 100644 (file)
@@ -485,10 +485,17 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     endif()
   endif()
 
+  list(APPEND LITE_PROFILER_SRCS "")
+  if(USE_LITE_INTERPRETER_PROFILER)
+    append_filelist("libtorch_edge_profiler_sources" LITE_PROFILER_SRCS)
+  endif()
+
   # Switch between the full jit interpreter and lite interpreter
   if(BUILD_LITE_INTERPRETER)
     append_filelist("libtorch_lite_cmake_sources" LIBTORCH_CMAKE_SRCS)
     list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS})
+    list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_PROFILER_SRCS})
+    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
   else()
     append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS)
 
index b0d3689..526e39a 100644 (file)
@@ -1568,6 +1568,11 @@ endif()
 # --[ ATen checks
 set(USE_LAPACK 0)
 
+# we need to build all targets to be linked with PIC
+if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER)
+  set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+endif()
+
 if(NOT INTERN_BUILD_MOBILE)
   set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
   set(TORCH_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
@@ -1876,11 +1881,17 @@ list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)
 set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE)
 
 # ---[ Kineto
-if(USE_KINETO AND INTERN_BUILD_MOBILE)
+# The edge profiler depends on the Kineto profiler but only does CPU
+# profiling, so we don't need USE_CUDA/USE_ROCM.
+if(USE_KINETO AND INTERN_BUILD_MOBILE AND NOT (BUILD_LITE_INTERPRETER AND USE_LITE_INTERPRETER_PROFILER))
   message(STATUS "Not using libkineto in a mobile build.")
   set(USE_KINETO OFF)
 endif()
 
+if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER AND (USE_CUDA OR USE_ROCM))
+  message(FATAL_ERROR "Mobile build with profiler does not support CUDA or ROCM")
+endif()
+
 if(USE_KINETO)
   if((NOT USE_CUDA) OR MSVC)
     set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE)
@@ -1956,6 +1967,7 @@ if(USE_KINETO)
 
   if(NOT TARGET kineto)
     add_subdirectory("${KINETO_SOURCE_DIR}")
+    set_property(TARGET kineto PROPERTY POSITION_INDEPENDENT_CODE ON)
   endif()
   list(APPEND Caffe2_DEPENDENCY_LIBS kineto)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_KINETO")
index 6da5b16..7e48815 100755 (executable)
@@ -83,6 +83,7 @@ if [ "${BUILD_LITE_INTERPRETER}" == 0 ]; then
 else
   CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=ON")
 fi
+CMAKE_ARGS+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
 
 # Don't build binaries or tests (only the library)
 CMAKE_ARGS+=("-DBUILD_TEST=OFF")
index ef0294d..11b47a8 100644 (file)
@@ -338,16 +338,16 @@ TEST(BackendTestDebugInfo, TestCompiler) {
   lm._save_for_mobile(ss, ExtraFilesMap(), true);
   auto mlm = _load_for_mobile(ss);
   std::string error_pattern = R"(
-  Module hierarchy:top(m).aten::add
+  Module hierarchy:top(m)::<unknown>.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 5, in FunctionName_UNKNOWN
+  File "<string>", line 5, in <unknown>
                 typed_inputs: List[Any] = [x, h, ]
                 if self.__backend.is_available() :
                   _0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, h):
         return x + h
@@ -392,16 +392,16 @@ TEST(BackendTestDebugInfo, TestExceptionStackForCompilerWithModuleHierarchy) {
   lm._save_for_mobile(ss, ExtraFilesMap(), true);
   auto mlm = _load_for_mobile(ss);
   std::string error_pattern = R"(
-  Module hierarchy:top(C).A0(A).aten::add
+  Module hierarchy:top(C)::<unknown>.A0(A)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 5, in FunctionName_UNKNOWN
+  File "<string>", line 5, in <unknown>
                 typed_inputs: List[Any] = [x, y, ]
                 if self.__backend.is_available() :
                   _0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.A0.forward(x, y) + self.B0.forward(x)
@@ -485,16 +485,16 @@ TEST(
    *
    */
   std::string error_pattern = R"(
-  Module hierarchy:top(C).B0(B).A0(A).aten::add
+  Module hierarchy:top(C)::<unknown>.B0(B)::forward.A0(A)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 5, in FunctionName_UNKNOWN
+  File "<string>", line 5, in <unknown>
                 typed_inputs: List[Any] = [x, y, ]
                 if self.__backend.is_available() :
                   _0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.B0.forward(x, y) + 3
@@ -572,9 +572,9 @@ TEST(BackendTestDebugInfo, TestExceptionStackForCompilerWithLoweredSubModule) {
   c._save_for_mobile(ss, ExtraFilesMap(), true);
   auto c_loaded = _load_for_mobile(ss);
   std::string error_pattern = R"(
-  Module hierarchy:top(C).A0(A).aten::add
+  Module hierarchy:top(C)::<unknown>.A0(A)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.A0.forward(x, y) + self.B0.forward(x)
@@ -587,7 +587,7 @@ Traceback of TorchScript (most recent call last):
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return x + y
@@ -693,9 +693,9 @@ TEST(
    *
    *  */
   std::string error_pattern = R"(
-  Module hierarchy:top(C).A0(A).AA0(AA).aten::add
+  Module hierarchy:top(C)::<unknown>.A0(A)::forward.AA0(AA)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.A0.forward(x, y) + self.B0.forward(x)
@@ -708,7 +708,7 @@ Traceback of TorchScript (most recent call last):
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.AA0.forward(x, y) + 3
index 93e6b40..3bd2bec 100644 (file)
@@ -482,7 +482,7 @@ TEST(LiteInterpreterTest, ModuleInfoBasic) {
     }
   }
 
-  AT_ASSERT(module_debug_info_set.count("top(M).aten::mul"));
+  AT_ASSERT(module_debug_info_set.count("top(M)::<unknown>.aten::mul"));
 }
 
 TEST(LiteInterpreterTest, NotSaveModuleInfo) {
@@ -542,9 +542,11 @@ TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) {
     }
   }
 
-  AT_ASSERT(module_debug_info_set.count("top(B).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::mul"));
+  AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A0(A)::forward.aten::mul"));
 }
 
 TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
@@ -585,9 +587,11 @@ TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
     }
   }
 
-  AT_ASSERT(module_debug_info_set.count("top(C).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).A0(A).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
+  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.B0(B)::forward.aten::add"));
 }
 
 TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) {
@@ -854,9 +858,11 @@ TEST(LiteInterpreterTest, SequentialModuleInfo) {
   //   def forward(self, x):
   //     return self.A0.forward(self.B0.forward(x))
 
-  AT_ASSERT(module_debug_info_set.count("top(C).prim::Return"));
-  AT_ASSERT(module_debug_info_set.count("top(C).A0(A).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
+  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.prim::Return"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.B0(B)::forward.aten::add"));
 }
 
 TEST(LiteInterpreterTest, HierarchyModuleInfo) {
@@ -901,9 +907,11 @@ TEST(LiteInterpreterTest, HierarchyModuleInfo) {
   // "top(C).forward": for the add operator in top.
   // "top(C).B0(B).forward": for the add operator in B0.
   // "top(C).B0(B).forward.A0(A).forward": for the add operator in A0.
-  AT_ASSERT(module_debug_info_set.count("top(C).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).B0(B).A0(A).aten::add"));
+  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.B0(B)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.B0(B)::forward.A0(A)::forward.aten::add"));
 }
 
 TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) {
@@ -960,9 +968,11 @@ TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) {
   // "top(B).A0(A).forward": for the add operator in A0.
   // "top(B).A1(A).forward": for the add operator in A1.
 
-  AT_ASSERT(module_debug_info_set.count("top(B).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(B).A1(A).aten::add"));
+  AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A1(A)::forward.aten::add"));
 }
 #endif // !defined(FB_XPLAT_BUILD)
 
@@ -1371,9 +1381,9 @@ TEST(LiteInterpreterTest, TestExceptionStackWithTwoLevelModuleHierarchy) {
   c._save_for_mobile(ss, ExtraFilesMap(), true);
   auto lite_m = _load_for_mobile(ss);
   std::string error_pattern = R"(
-  Module hierarchy:top(C).B0(B).A0(A).aten::add
+  Module hierarchy:top(C)::<unknown>.B0(B)::foo.A0(A)::bar.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.B0.foo(x, y) + 3
index 8ecedd3..82f70fe 100644 (file)
@@ -2481,7 +2481,7 @@ TEST(RecordDebugHandles, Basic) {
           torch::autograd::profiler::ProfilerState::KINETO, false, false),
       activities);
   {
-    RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
+    RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
     float x{5.9999}, y{2.1212};
     float z = x / y;
   }
@@ -2533,7 +2533,7 @@ TEST(RecordDebugHandles, ScopedCallbacks) {
       torch::autograd::profiler::ProfilerConfig(
           torch::autograd::profiler::ProfilerState::KINETO, false, false),
       {torch::autograd::profiler::ActivityType::CPU},
-      {at::RecordScope::USER_SCOPE});
+      {at::RecordScope::LITE_INTERPRETER});
   {
     auto a = torch::rand({128, 128});
     auto b = torch::rand({128, 128});
@@ -2550,9 +2550,9 @@ TEST(RecordDebugHandles, ScopedCallbacks) {
       torch::autograd::profiler::ProfilerConfig(
           torch::autograd::profiler::ProfilerState::KINETO, false, false),
       {torch::autograd::profiler::ActivityType::CPU},
-      {at::RecordScope::USER_SCOPE});
+      {at::RecordScope::LITE_INTERPRETER});
   {
-    RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
+    RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
     auto a = torch::rand({128, 128});
     auto b = torch::rand({128, 128});
     auto c = a + b;
@@ -2568,11 +2568,9 @@ TEST(RecordDebugHandles, ScopedCallbacks) {
   for (const auto& e : kineto_events) {
     if (e.name() == "my_function") {
       ASSERT_EQ(e.debugHandle(), 42);
-    } else if (e.name() == "not_my_function") {
-      ASSERT_EQ(e.debugHandle(), -1);
     }
   }
-  ASSERT_TRUE(profiler_results_ptr->events().size() == 2);
+  ASSERT_TRUE(profiler_results_ptr->events().size() == 1);
 }
 
 TEST(IValueKWargsTest, Basic) {
index c68ea88..503203d 100644 (file)
@@ -4,6 +4,7 @@ set(
 set(LITE_INTERPRETER_RUNTIME_TEST_DIR
   ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/main.cpp
   ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp
+  ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
 )
 
 add_library(backend_with_compiler_runtime SHARED
index 2ccf6ee..1648b1e 100644 (file)
@@ -142,9 +142,9 @@ TEST(RunTimeTest, DelegateException) {
   inputs.emplace_back(torch::rand({13, 9}));
 
   std::string error_pattern = R"(
-  Module hierarchy:top(C).A0(backend_with_compiler_demoLoweredModule).AA0(AA).aten::add
+  Module hierarchy:top(C)::<unknown>.A0(backend_with_compiler_demoLoweredModule)::forward.AA0(AA)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.A0.forward(x, y) + self.B0.forward(x)
@@ -157,7 +157,7 @@ Traceback of TorchScript (most recent call last):
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.AA0.forward(x, y) + 3
diff --git a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
new file mode 100644 (file)
index 0000000..ee59b39
--- /dev/null
@@ -0,0 +1,74 @@
+#include <fstream>
+#include <gtest/gtest.h>
+#include <test/cpp/jit/test_utils.h>
+#include <torch/csrc/jit/api/module.h>
+#include <torch/csrc/jit/frontend/resolver.h>
+#include <torch/csrc/jit/mobile/import.h>
+#include <torch/csrc/jit/mobile/module.h>
+#include <torch/csrc/jit/mobile/profiler_edge.h>
+
+#include <unordered_set>
+
+#ifdef EDGE_PROFILER_USE_KINETO
+namespace torch {
+namespace jit {
+namespace mobile {
+
+namespace {
+bool checkModuleHierarchyForOp(
+    const std::string& op_name,
+    const std::string& module_hier,
+    std::ifstream& trace_file) {
+  std::string line;
+  while (std::getline(trace_file, line)) {
+    if (line.find(op_name) != std::string::npos) {
+      while (std::getline(trace_file, line)) {
+        if (line.find("Module Hierarchy") != std::string::npos) {
+          return (line.find(module_hier) != std::string::npos);
+        }
+      }
+    }
+  }
+  return false;
+}
+} // namespace
+
+TEST(MobileProfiler, ModuleHierarchy) {
+  std::string filePath(__FILE__);
+  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
+  testModelFile.append("to_be_profiled_module.ptl");
+
+  std::vector<IValue> inputs;
+  inputs.emplace_back(at::rand({64, 64}));
+  inputs.emplace_back(at::rand({64, 64}));
+  std::string trace_file_name("/tmp/test_trace.trace");
+
+  mobile::Module bc = _load_for_mobile(testModelFile);
+  {
+    KinetoEdgeCPUProfiler profiler(
+        bc,
+        trace_file_name,
+        false, // record input_shapes
+        false, // profile memory
+        true, // record callstack
+        false, // record flops
+        true); // record module hierarchy
+    bc.forward(inputs);
+  } // End of profiler
+  std::ifstream trace_file(trace_file_name);
+  std::string line;
+  ASSERT_TRUE(trace_file.is_open());
+  trace_file.seekg(0, std::ios_base::beg);
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::sub", "top(C)::<unknown>.A0(A)::forward.aten::sub", trace_file));
+  trace_file.seekg(0, std::ios_base::beg);
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::mul", "top(C)::<unknown>.A0(A)::forward.SELF(A)::forward_impl_.SELF(A)::my_new_method.aten::mul", trace_file));
+  trace_file.seekg(0, std::ios_base::beg);
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::add", "top(C)::<unknown>.A0(A)::forward.SELF(A)::forward_impl_.aten::add", trace_file));
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::add", "top(C)::<unknown>.SELF(C)::call_b.B0(B)::forward.aten::add", trace_file));
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::add", "top(C)::<unknown>.aten::add", trace_file));
+}
+
+} // namespace mobile
+} // namespace jit
+} // namespace torch
+#endif
diff --git a/test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl b/test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl
new file mode 100644 (file)
index 0000000..243e41e
Binary files /dev/null and b/test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl differ
index e7958cc..bfcf55a 100644 (file)
@@ -119,6 +119,10 @@ libtorch_profiler_sources = [
     "torch/csrc/autograd/profiler_kineto.cpp",
 ]
 
+libtorch_edge_profiler_sources = libtorch_profiler_sources + [
+    "torch/csrc/jit/mobile/profiler_edge.cpp",
+]
+
 core_trainer_sources = [
     "torch/csrc/autograd/anomaly_mode.cpp",
     "torch/csrc/autograd/autograd.cpp",
index 526813d..da1ae6f 100644 (file)
@@ -172,6 +172,14 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState {
     }
   }
 
+  const std::function<void(std::vector<KinetoEvent>&)>& getEventPostProcessingCallback() const {
+    return event_post_process_cb_;
+  }
+
+  void setEventPostProcessingCallback(std::function<void(std::vector<KinetoEvent>&)>&& cb) {
+    event_post_process_cb_ = std::move(cb);
+  }
+
 #ifdef USE_KINETO
   c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
     // fallthrough
@@ -258,6 +266,8 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState {
 #endif // USE_KINETO
   uint64_t start_time_;
   std::vector<KinetoEvent> kineto_events_;
+  // Optional, if event post-processing is enabled.
+  std::function<void(std::vector<KinetoEvent>&)> event_post_process_cb_;
 };
 
 std::vector<std::string> inputTypes(const at::RecordFunction& fn) {
@@ -496,6 +506,16 @@ void prepareProfiler(
 #endif // USE_KINETO
 }
 
+void enableProfilerWithEventPostProcess(
+    const ProfilerConfig& config,
+    const std::set<ActivityType>& activities,
+    std::function<void(std::vector<KinetoEvent>&)>&& cb,
+    const std::unordered_set<at::RecordScope>& scopes) {
+  enableProfiler(config, activities, scopes);
+  auto state_ptr = getProfilerTLSState();
+  state_ptr->setEventPostProcessingCallback(std::move(cb));
+}
+
 void enableProfiler(
     const ProfilerConfig& config,
     const std::set<ActivityType>& activities,
@@ -548,6 +568,11 @@ std::unique_ptr<ProfilerResult> disableProfiler() {
 
 #ifdef USE_KINETO
   state_ptr->cpu_trace->span.endTime = getTimeUs();
+
+  // Call the event post-processing callback, if there is one, before
+  // finalizing the trace.
+  if (state_ptr->getEventPostProcessingCallback()) {
+    state_ptr->getEventPostProcessingCallback()(state_ptr->kineto_events_);
+  }
   state_ptr->finalizeCPUTrace();
   libkineto::api().activityProfiler().transferCpuTrace(std::move(state_ptr->cpu_trace));
 
index 310554a..21bc318 100644 (file)
@@ -5,7 +5,11 @@
 
 #ifdef USE_KINETO
 // skip Kineto dependency on mobile
-#ifdef C10_MOBILE
+// unless it is explicitly asked for:
+// KinetoEdgeCPUProfiler uses the Kineto profiler for CPU
+// event profiling, which depends on the CPU-only libkineto.
+#if defined(C10_MOBILE) && !defined(EDGE_PROFILER_USE_KINETO)
 #undef USE_KINETO
 #endif
 #endif
@@ -334,6 +338,28 @@ TORCH_API void enableProfiler(
     const std::set<ActivityType>& activities,
     const std::unordered_set<at::RecordScope>& scopes = {});
 
+/*
+ * Same as enableProfiler but with a callback to do post-processing of
+ * KinetoEvents.
+ * enableProfilerWithEventPostProcess enables the profiler to capture the
+ * specified activities, with the specified RecordFunction scopes, if any.
+ * Additionally, it takes a functor that does in-place post-processing of
+ * events, e.g. populating stack trace or module hierarchy information
+ * lazily using the debug_handle.
+ * An example usage is the lite interpreter, which has a recording scope of
+ * LITE_INTERPRETER. The lite interpreter runtime records debug handles in
+ * RecordFunction, along with other information. Debug handles are
+ * eventually passed down to KinetoEvent and recorded as part of the event.
+ * KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp,
+ * enables the profiler via enableProfilerWithEventPostProcess with a
+ * post-processing callback that takes these debug handles and generates
+ * stack trace and module hierarchy information once profiling is done.
+ */
+TORCH_API void enableProfilerWithEventPostProcess(
+    const ProfilerConfig& config,
+    const std::set<ActivityType>& activities,
+    std::function<void(std::vector<KinetoEvent>&)>&& cb,
+    const std::unordered_set<at::RecordScope>& scopes = {});
+
 TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();
 
 TORCH_API void prepareProfiler(
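
For illustration (not part of this diff), a sketch of driving the new API
directly; KinetoEdgeCPUProfiler in torch/csrc/jit/mobile/profiler_edge.cpp,
added below, wraps this same pattern. It assumes a mobile::Module m and its
inputs are in scope:

  namespace profiler = torch::autograd::profiler;

  profiler::ProfilerConfig config(
      profiler::ProfilerState::KINETO,
      /*report_input_shapes=*/false,
      /*profile_memory=*/false,
      /*with_stack=*/false,
      /*with_flops=*/false,
      /*with_modules=*/true);
  profiler::prepareProfiler(config, {profiler::ActivityType::CPU});
  profiler::enableProfilerWithEventPostProcess(
      config,
      {profiler::ActivityType::CPU},
      // Post-processing: resolve each event's debug handle into a module
      // hierarchy string once profiling is done.
      [&m](std::vector<profiler::KinetoEvent>& events) {
        for (auto& e : events) {
          e.moduleHierarchy(std::vector<std::string>(
              {m.getModuleHierarchy(e.debugHandle())}));
        }
      },
      {at::RecordScope::LITE_INTERPRETER});
  m.forward(inputs);
  profiler::disableProfiler()->save("/tmp/trace.trace");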
index d0d8ada..9c734f4 100644 (file)
@@ -44,11 +44,9 @@ std::pair<std::vector<StackEntry>, std::string> getStackTraceWithModuleHierarchy
           module_info.append(".").append(module_instance_info.instance_name());
         }
       } else {
-        module_info += ".UNKNOWN_INSTANCE(UNKNOWN_TYPE)";
+        module_info.append(".UNKNOWN_INSTANCE(UNKNOWN_TYPE)");
       }
       // Now add source range info to stack
-      // When we serialize function names, those can be added here.
-      // TODO: Add function name separately
       entries.emplace_back(
           StackEntry{prev_function_name, callstack_ptr->source_range()});
       if (callstack_ptr->function()) {
@@ -56,6 +54,11 @@ std::pair<std::vector<StackEntry>, std::string> getStackTraceWithModuleHierarchy
       } else {
         prev_function_name = callstack_ptr->function_name();
       }
+      // The function name is appended here. It is stored as
+      // prev_function_name because, for StackEntry, it is appended in the
+      // next iteration; this is the format in which format_stack_trace
+      // expects function names.
+      module_info.append("::").append(prev_function_name);
 
       if (callstack_ptr->callee()) {
         callstack_ptr = callstack_ptr->callee().value();
@@ -82,20 +85,21 @@ std::pair<std::string, std::string> getStackTraceWithModuleHierarchy(
   std::vector<StackEntry> stack_entries;
   std::string module_info =
       root_scope_string + "(" + top_module_type_name + ")";
-  std::string caller_fn_name = "FunctionName_UNKNOWN";
+  std::string caller_fn_name = "<unknown>";
+  module_info.append("::").append(caller_fn_name);
   for (const auto& debug_info : source_callstacks) {
     auto debug_info_pair =
         getStackTraceWithModuleHierarchy(debug_info, caller_fn_name);
     auto entries = std::move(debug_info_pair.first);
     stack_entries.insert(stack_entries.end(), entries.begin(), entries.end());
-    module_info += debug_info_pair.second;
+    module_info.append(debug_info_pair.second);
   }
   // Only last entry in the callstack will have a node name of interest.
   // Rest are likely CallMethod/CallFunction nodes
   auto last_entry = source_callstacks.back();
   const std::string& node_name =
       std::get<kDebugInfoTupleNodeNameIndex>(last_entry);
-  module_info += "." + node_name;
+  module_info.append(".").append(node_name);
   std::ostringstream ss;
   ss << "Module hierarchy:" << module_info << "\n";
   format_stack_trace(ss, stack_entries);
index d82d84e..275b84b 100644 (file)
@@ -54,7 +54,9 @@ bool InterpreterState::run(Stack& stack) {
   size_t pc = 0;
   while (true) {
     try {
-      Instruction inst = code_->instructions_with_handles_[pc].instruction;
+      auto inst_with_handle = code_->instructions_with_handles_.at(pc);
+      Instruction inst = inst_with_handle.instruction;
+      DebugHandle debug_handle = inst_with_handle.debug_handle;
 
       //    std::cout << "RUNNING " << pc << " " << code_->instructions_[pc];
       //    if (inst.op == OP) {
@@ -64,6 +66,17 @@ bool InterpreterState::run(Stack& stack) {
       //      }
       //    }
       //    std::cout << std::endl;
+
+      // TODO(iliacher): remove the workaround after RecordFunction is in
+      // the Dispatcher. Check with iliacher whether this has been done.
+      // Also, this is not exception-safe: if an exception is thrown, the
+      // record function will be left enabled. That is a TODO.
+      bool prev_value = isRecordFunctionEnabled();
+      if (!prev_value) {
+        // enable only for the RecordFunction
+        enableRecordFunction(true);
+      }
       switch (inst.op) {
         case OP: {
           if (at::hasGlobalCallbacks()) {
@@ -74,22 +87,15 @@ bool InterpreterState::run(Stack& stack) {
             }
           }
 
-          // TODO(iliacher): remove the workaround after RecordFunction is in
-          // Dispatcher
-          bool prev_value = isRecordFunctionEnabled();
-          if (!prev_value) {
-            // enable only for the RecordFunction
-            enableRecordFunction(true);
-          }
-          RECORD_USER_SCOPE_WITH_INPUTS(code_->op_names_[inst.X].name, stack);
-          if (!prev_value) {
-            enableRecordFunction(false);
-          }
+          RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+              code_->op_names_[inst.X].name, debug_handle, stack);
           code_->operators_[inst.X](stack);
           ++pc;
         } break;
         case OPN: {
           stack.push_back(inst.N);
+          RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+              code_->op_names_[inst.X].name, debug_handle, stack);
           code_->operators_[inst.X](stack);
           ++pc;
         } break;
@@ -99,6 +105,8 @@ bool InterpreterState::run(Stack& stack) {
                   .toObject()
                   ->type()
                   ->getMethod(code_->constants_[inst.X].toStringRef());
+          RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+              method.name(), debug_handle, stack);
           method.run(stack);
           ++pc;
         } break;
@@ -232,6 +240,10 @@ bool InterpreterState::run(Stack& stack) {
         default:
           AT_ERROR(toString(inst.op), " is invalid.");
       }
+
+      if (!prev_value) {
+        enableRecordFunction(false);
+      }
       // This exception must be caught first as it derived from c10::Error
     } catch (c10::BackendRuntimeException& e) {
       exception_pc_ = pc;
index fad6447..c04d9f7 100644 (file)
@@ -122,6 +122,24 @@ const std::map<std::string, at::Tensor> Module::named_parameters() const {
   return params;
 }
 
+std::string Module::getModuleHierarchy(const int64_t debug_handle) const {
+#if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
+  return getDebugTable().getModuleHierarchyInfo(
+      debug_handle, getTopModuleTypeName(*this));
+#else
+  return "";
+#endif
+}
+
+std::string Module::getCallStack(const int64_t debug_handle) const {
+#if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
+  return getDebugTable().getSourceDebugString(
+      debug_handle, getTopModuleTypeName(*this));
+#else
+  return "";
+#endif
+}
+
 // We will continue to support this API for now as this is being relied upon
 // for profiling.
 // We really need to change this part, so in the next step for profiling support
index 8a37b8b..73637aa 100644 (file)
@@ -79,6 +79,8 @@ class TORCH_API Module {
   const std::vector<at::Tensor> parameters() const;
   const std::map<std::string, at::Tensor> named_parameters() const;
   std::string get_forward_method_debug_info(size_t pc) const;
+  std::string getModuleHierarchy(const int64_t debug_handle) const;
+  std::string getCallStack(const int64_t debug_handle) const;
   /// Enables "training" mode.
   void train(bool on = true);
   /// Calls train(false) to enable "eval" mode.
diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp
new file mode 100644 (file)
index 0000000..bcd5a62
--- /dev/null
@@ -0,0 +1,63 @@
+#include <torch/csrc/jit/mobile/profiler_edge.h>
+#include <string>
+#include <vector>
+
+namespace profiler = torch::autograd::profiler;
+namespace torch {
+namespace jit {
+namespace mobile {
+
+KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
+    const torch::jit::mobile::Module& m,
+    const std::string& fname,
+    const bool report_input_shapes,
+    const bool profile_memory,
+    const bool with_stack,
+    const bool with_flops,
+    const bool with_modules)
+    : m_(m), trace_file_name_(fname) {
+  profiler::ProfilerConfig config(
+      profiler::ProfilerState::KINETO,
+      report_input_shapes,
+      profile_memory,
+      with_stack,
+      with_flops,
+      with_modules);
+  profiler::prepareProfiler(config, {profiler::ActivityType::CPU});
+  if (with_modules || with_stack) {
+    auto post_processing = [this, with_stack, with_modules](
+                               std::vector<profiler::KinetoEvent>& events) {
+      for (auto& e : events) {
+        if (with_modules) {
+          // Since KinetoEvent's module hierarchy takes a vector of strings,
+          // we construct a temporary vector with one string element.
+          e.moduleHierarchy(std::vector<std::string>(
+              {this->m_.getModuleHierarchy(e.debugHandle())}));
+        } else if (with_stack) {
+          // Since KinetoEvent's stack trace takes a vector of strings, we
+          // construct a temporary vector with one string element.
+          e.stack(std::vector<std::string>(
+              {this->m_.getCallStack(e.debugHandle())}));
+        }
+      }
+    };
+    profiler::enableProfilerWithEventPostProcess(
+        config,
+        {profiler::ActivityType::CPU},
+        post_processing,
+        {at::RecordScope::LITE_INTERPRETER});
+  } else {
+    profiler::enableProfiler(
+        config,
+        {profiler::ActivityType::CPU},
+        {at::RecordScope::LITE_INTERPRETER});
+  }
+  trace_file_name_ = fname;
+}
+
+KinetoEdgeCPUProfiler::~KinetoEdgeCPUProfiler() {
+  profiler::disableProfiler()->save(trace_file_name_);
+}
+} // namespace mobile
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h
new file mode 100644 (file)
index 0000000..a245034
--- /dev/null
@@ -0,0 +1,68 @@
+#pragma once
+#include <torch/csrc/autograd/profiler_kineto.h>
+#include <torch/csrc/jit/mobile/module.h>
+
+namespace torch {
+namespace jit {
+namespace mobile {
+class TORCH_API KinetoEdgeCPUProfiler {
+ public:
+  // This profiler only profiles KINETO events
+  // No GPU_FALLBACK or NVTX
+  /*
+   * @param m is the instance of mobile Module which is being profiled.
+   *        Note that this implies that KinetoEdgeCPUProfiler can be used
+   *        to profile a specific Module (see usage below), unlike
+   *        ProfilerKineto, which can profile the PyTorch runtime in an
+   *        arbitrary scope.
+   * @param fname is the name of the file to which the chrome trace is
+   *        written.
+   * @param report_input_shapes: whether to record shapes of the op's inputs.
+   * @param profile_memory: whether to record memory usage of the op.
+   * @param with_stack: whether to record the model's python stack trace
+   *        for the op.
+   * @param with_flops: whether to report flops corresponding to the op.
+   * @param with_modules: whether to report the original python module
+   *        hierarchy to which the op belongs.
+   *
+   * The usage pattern for this profiler must be as follows:
+   *
+   * {
+   *   KinetoEdgeCPUProfiler profiler(m, filename, args);
+   *   m.forward(...);
+   * }
+   *
+   * The reason being that KinetoEdgeCPUProfiler has a dependency on the
+   * Module and thus must not outlive it.
+   *
+   * Thus KinetoEdgeCPUProfiler must be used as an RAII guard to do
+   * profiling within a certain scope. In that scope, the captured reference
+   * to the Module will outlive KinetoEdgeCPUProfiler. This is guaranteed
+   * because KinetoEdgeCPUProfiler is constructed on the stack, after the
+   * Module.
+   *
+   * An example of the anti-pattern and wrong usage is:
+   *
+   * auto profiler =
+   *     std::make_shared<KinetoEdgeCPUProfiler>(m, filename, args);
+   * m.forward(...);
+   *
+   * since the KinetoEdgeCPUProfiler object would then be constructed on
+   * the heap with its lifetime managed manually or via smart pointers.
+   */
+  KinetoEdgeCPUProfiler(
+      const torch::jit::mobile::Module& m,
+      const std::string& fname,
+      const bool report_input_shapes = false,
+      const bool profile_memory = false,
+      const bool with_stack = false,
+      const bool with_flops = false,
+      const bool with_modules = false);
+
+  ~KinetoEdgeCPUProfiler();
+
+ private:
+  /*
+   * We store a reference to Module to make such dependency explicit, since
+   * a Module reference is already stored in a functor.
+   */
+  const mobile::Module& m_;
+  std::string trace_file_name_;
+};
+} // namespace mobile
+} // namespace jit
+} // namespace torch