[Pytorch Edge] Enable kineto profiler on mobile via EdgeKinetoProfiler (#62419)
authorKimish Patel <kimishpatel@fb.com>
Sat, 14 Aug 2021 04:37:57 +0000 (21:37 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Sat, 14 Aug 2021 04:40:19 +0000 (21:40 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62419

This diff adds support for a CPU-only Kineto profiler on mobile, thus
enabling Chrome trace generation on mobile. This brings the C++ API for
mobile profiling on par with TorchScript.
This is done via:
1. Utilizing debug handle annotations in KinetoEvent.
2. Adding a post-processing capability, via callbacks, to
KinetoThreadLocalState.
3. Creating a new RAII-style profiler, KinetoEdgeCPUProfiler, which can be
used in the surrounding scope of model execution. It writes the Chrome
trace to the location specified in the profiler constructor (a usage
sketch follows below).
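
For illustration, a minimal C++ usage sketch of the new profiler, mirroring
the new test in test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
(the helper function name and file paths here are hypothetical):

    #include <torch/csrc/jit/mobile/import.h>
    #include <torch/csrc/jit/mobile/module.h>
    #include <torch/csrc/jit/mobile/profiler_edge.h>

    void profile_model(const std::string& model_path,
                       const std::vector<c10::IValue>& inputs) {
      torch::jit::mobile::Module m =
          torch::jit::mobile::_load_for_mobile(model_path);
      {
        // RAII: the chrome trace is written when the profiler is destroyed.
        torch::jit::mobile::KinetoEdgeCPUProfiler profiler(
            m,
            "/tmp/model_trace.trace", // trace output location
            false, // report_input_shapes
            false, // profile_memory
            true,  // with_stack
            false, // with_flops
            true); // with_modules
        m.forward(inputs);
      } // profiler goes out of scope here and saves the trace
    }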

Test Plan:
MobileProfiler.ModuleHierarchy

Imported from OSS

Reviewed By: raziel

Differential Revision: D29993660

fbshipit-source-id: 0b44f52f9e9c5f5aff81ebbd9273c254c3c03299

24 files changed:
.jenkins/pytorch/build.sh
CMakeLists.txt
android/common.sh
android/pytorch_android/build.gradle
aten/src/ATen/record_function.h
caffe2/CMakeLists.txt
cmake/Dependencies.cmake
scripts/build_ios.sh
test/cpp/jit/test_backend.cpp
test/cpp/jit/test_lite_interpreter.cpp
test/cpp/jit/test_misc.cpp
test/cpp/lite_interpreter_runtime/CMakeLists.txt
test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp
test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp [new file with mode: 0644]
test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl [new file with mode: 0644]
tools/build_variables.bzl
torch/csrc/autograd/profiler_kineto.cpp
torch/csrc/autograd/profiler_kineto.h
torch/csrc/jit/mobile/debug_info.cpp
torch/csrc/jit/mobile/interpreter.cpp
torch/csrc/jit/mobile/module.cpp
torch/csrc/jit/mobile/module.h
torch/csrc/jit/mobile/profiler_edge.cpp [new file with mode: 0644]
torch/csrc/jit/mobile/profiler_edge.h [new file with mode: 0644]

index f2c279b02056e5b6732f1fbf1ef23338ec50c70b..f6ac52aed99c44d75a7da70cd5a649e763f8a3c5 100755 (executable)
@@ -130,6 +130,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
   if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
     build_args+=("-DUSE_VULKAN=ON")
   fi
+  build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
   exec ./scripts/build_android.sh "${build_args[@]}" "$@"
 fi
 
index 5ee0e75a669d7f3e1fe1b979ad3c0a0ca11bc5f9..717de6e61a44e31b983243cf36646129cc0cd560 100644 (file)
@@ -266,6 +266,7 @@ if(NOT DEFINED USE_VULKAN)
 endif()
 
 option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON)
+option(USE_LITE_INTERPRETER_PROFILER "Enable lite interpreter profiler" ON)
 option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
 option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
 option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation as opposed to build-time (needs libshaderc)" OFF)
@@ -687,6 +688,10 @@ if(USE_SOURCE_DEBUG_ON_MOBILE)
   string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
 endif()
 
+if(USE_LITE_INTERPRETER_PROFILER)
+  string(APPEND CMAKE_CXX_FLAGS " -DEDGE_PROFILER_USE_KINETO")
+endif()
+
 # ---[ Allowlist file if allowlist is specified
 include(cmake/Allowlist.cmake)
 
index 9f5768d0b47bd7ab42037bbabdfaf925d1b2e452..ab1cb5ff43c728b8010ebc82c4af736a6c2c3129 100644 (file)
@@ -29,7 +29,7 @@ check_gradle() {
 }
 
 parse_abis_list() {
-  ABIS_LIST="armeabi-v7a,arm64-v8a,x86,x86_64"
+  ABIS_LIST="x86"
   CUSTOM_ABIS_LIST=false
   if [ $# -gt 0 ]; then
     ABIS_LIST=$1
@@ -59,7 +59,8 @@ build_android() {
     ANDROID_ABI="$abi" \
       BUILD_ROOT="$ANDROID_BUILD_ROOT" \
       "$PYTORCH_DIR/scripts/build_android.sh" \
-      -DANDROID_CCACHE="$(which ccache)"
+      -DANDROID_CCACHE="$(which ccache)" \
+      -DUSE_LITE_INTERPRETER_PROFILER="OFF"
 
     echo "$abi build output lib,include at $ANDROID_BUILD_ROOT/install"
     ln -s "$ANDROID_BUILD_ROOT/install/lib" "$LIB_DIR/$abi"
index f9a7559e598f2d382bb27be71927501744bc1b1a..a65c0ffd436b216bf52750339858d1b5a0eda4a9 100644 (file)
@@ -18,9 +18,9 @@ android {
         externalNativeBuild {
             cmake {
               if(System.env.BUILD_LITE_INTERPRETER == '0') {
-                arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=OFF"
+                arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=OFF", "-DUSE_LITE_INTERPRETER_PROFILER=OFF"
               } else {
-                arguments "-DANDROID_STL=c++_shared"
+                arguments "-DANDROID_STL=c++_shared", "-DUSE_LITE_INTERPRETER_PROFILER=OFF"
               }
             }
         }
index 80c3ca90107415e4b73bea42ce5ee224e5ac1d5a..f73df9c27608408b54db71f331754b0b84bb8cc1 100644 (file)
@@ -27,6 +27,8 @@ enum class C10_API_ENUM RecordScope : uint8_t {
   TORCHSCRIPT_FUNCTION,
   // Kernel Function dtype Tag
   KERNEL_FUNCTION_DTYPE,
+  // Scope pertaining to the lite interpreter runtime
+  LITE_INTERPRETER,
   // User defined scope (e.g. with record_function())
   USER_SCOPE,
   NUM_SCOPES, // must be the last in the list
@@ -502,11 +504,11 @@ class TORCH_API RecordFunctionCallback {
       }                                             \
     }
 
-// Helper macros to record user_scope events with debug handles
-#define RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(         \
-    fn, debug_handle, inputs)                                   \
-    RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS(                  \
-        at::RecordScope::USER_SCOPE, fn, debug_handle, inputs)
+// Helper macros to record LITE_INTERPRETER scope events with debug handles
+#define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(             \
+    fn, debug_handle, inputs)                                       \
+    RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS(                      \
+        at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs)
 
 // Notes:
 //  - two types of callbacks are provided: thread local and global
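
As a quick sketch, the renamed macro is used the same way as the old
USER_SCOPE variant but tags the event with the LITE_INTERPRETER scope; the
literal call below mirrors the updated RecordDebugHandles tests later in
this diff:

    #include <ATen/record_function.h>

    void run_instrumented_block() {
      // Records a LITE_INTERPRETER-scoped event named "my_function" with
      // debug handle 42 and no inputs; the handle is propagated into the
      // resulting KinetoEvent.
      RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
      // ... work to be profiled ...
    }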
index 4ab9ef23833a414f9a1e10f2abf3f5af32ee1dea..83048ce723e2b7b2a0c2004696a1c17d33d7fb5a 100644 (file)
@@ -485,10 +485,17 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     endif()
   endif()
 
+  list(APPEND LITE_PROFILER_SRCS "")
+  if(USE_LITE_INTERPRETER_PROFILER)
+    append_filelist("libtorch_edge_profiler_sources " LITE_PROFILER_SRCS)
+  endif()
+
   # Switch between the full jit interpreter and lite interpreter
   if(BUILD_LITE_INTERPRETER)
     append_filelist("libtorch_lite_cmake_sources" LIBTORCH_CMAKE_SRCS)
     list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS})
+    list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_PROFILER_SRCS})
+    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
   else()
     append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS)
 
index b0d36893dab340bbe94bae1077b8de3f192d8447..526e39a9cda28d6c7663ff093725d57b01f63988 100644 (file)
@@ -1568,6 +1568,11 @@ endif()
 # --[ ATen checks
 set(USE_LAPACK 0)
 
+# we need to build all targets to be linked with PIC
+if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER)
+  set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+endif()
+
 if(NOT INTERN_BUILD_MOBILE)
   set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
   set(TORCH_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
@@ -1876,11 +1881,17 @@ list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)
 set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE)
 
 # ---[ Kineto
-if(USE_KINETO AND INTERN_BUILD_MOBILE)
+# The edge profiler depends on the Kineto profiler, but it only does CPU
+# profiling, so we don't need USE_CUDA/USE_ROCM.
+if(USE_KINETO AND INTERN_BUILD_MOBILE AND NOT (BUILD_LITE_INTERPRETER AND USE_LITE_INTERPRETER_PROFILER))
   message(STATUS "Not using libkineto in a mobile build.")
   set(USE_KINETO OFF)
 endif()
 
+if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER AND (USE_CUDA OR USE_ROCM))
+  message(FATAL_ERROR "Mobile build with profiler does not support CUDA or ROCM")
+endif()
+
 if(USE_KINETO)
   if((NOT USE_CUDA) OR MSVC)
     set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE)
@@ -1956,6 +1967,7 @@ if(USE_KINETO)
 
   if(NOT TARGET kineto)
     add_subdirectory("${KINETO_SOURCE_DIR}")
+    set_property(TARGET kineto PROPERTY POSITION_INDEPENDENT_CODE ON)
   endif()
   list(APPEND Caffe2_DEPENDENCY_LIBS kineto)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_KINETO")
index 6da5b16782a6a76744af2ef71b3438a8e0144e60..7e4881546dfa3ec8ad9e94dd32537171f6ee6d85 100755 (executable)
@@ -83,6 +83,7 @@ if [ "${BUILD_LITE_INTERPRETER}" == 0 ]; then
 else
   CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=ON")
 fi
+CMAKE_ARGS+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
 
 # Don't build binaries or tests (only the library)
 CMAKE_ARGS+=("-DBUILD_TEST=OFF")
index ef0294d2fbc4130d47ab1decff4ed06ceea17bc0..11b47a8f92122e5bbf790913317022f86a8c520c 100644 (file)
@@ -338,16 +338,16 @@ TEST(BackendTestDebugInfo, TestCompiler) {
   lm._save_for_mobile(ss, ExtraFilesMap(), true);
   auto mlm = _load_for_mobile(ss);
   std::string error_pattern = R"(
-  Module hierarchy:top(m).aten::add
+  Module hierarchy:top(m)::<unknown>.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 5, in FunctionName_UNKNOWN
+  File "<string>", line 5, in <unknown>
                 typed_inputs: List[Any] = [x, h, ]
                 if self.__backend.is_available() :
                   _0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, h):
         return x + h
@@ -392,16 +392,16 @@ TEST(BackendTestDebugInfo, TestExceptionStackForCompilerWithModuleHierarchy) {
   lm._save_for_mobile(ss, ExtraFilesMap(), true);
   auto mlm = _load_for_mobile(ss);
   std::string error_pattern = R"(
-  Module hierarchy:top(C).A0(A).aten::add
+  Module hierarchy:top(C)::<unknown>.A0(A)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 5, in FunctionName_UNKNOWN
+  File "<string>", line 5, in <unknown>
                 typed_inputs: List[Any] = [x, y, ]
                 if self.__backend.is_available() :
                   _0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.A0.forward(x, y) + self.B0.forward(x)
@@ -485,16 +485,16 @@ TEST(
    *
    */
   std::string error_pattern = R"(
-  Module hierarchy:top(C).B0(B).A0(A).aten::add
+  Module hierarchy:top(C)::<unknown>.B0(B)::forward.A0(A)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 5, in FunctionName_UNKNOWN
+  File "<string>", line 5, in <unknown>
                 typed_inputs: List[Any] = [x, y, ]
                 if self.__backend.is_available() :
                   _0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.B0.forward(x, y) + 3
@@ -572,9 +572,9 @@ TEST(BackendTestDebugInfo, TestExceptionStackForCompilerWithLoweredSubModule) {
   c._save_for_mobile(ss, ExtraFilesMap(), true);
   auto c_loaded = _load_for_mobile(ss);
   std::string error_pattern = R"(
-  Module hierarchy:top(C).A0(A).aten::add
+  Module hierarchy:top(C)::<unknown>.A0(A)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.A0.forward(x, y) + self.B0.forward(x)
@@ -587,7 +587,7 @@ Traceback of TorchScript (most recent call last):
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return x + y
@@ -693,9 +693,9 @@ TEST(
    *
    *  */
   std::string error_pattern = R"(
-  Module hierarchy:top(C).A0(A).AA0(AA).aten::add
+  Module hierarchy:top(C)::<unknown>.A0(A)::forward.AA0(AA)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.A0.forward(x, y) + self.B0.forward(x)
@@ -708,7 +708,7 @@ Traceback of TorchScript (most recent call last):
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.AA0.forward(x, y) + 3
index 93e6b40f1c34deb23bb2591067fa9a5025e84bc4..3bd2becd8779d863e2f0c97d6c6fe9290d283608 100644 (file)
@@ -482,7 +482,7 @@ TEST(LiteInterpreterTest, ModuleInfoBasic) {
     }
   }
 
-  AT_ASSERT(module_debug_info_set.count("top(M).aten::mul"));
+  AT_ASSERT(module_debug_info_set.count("top(M)::<unknown>.aten::mul"));
 }
 
 TEST(LiteInterpreterTest, NotSaveModuleInfo) {
@@ -542,9 +542,11 @@ TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) {
     }
   }
 
-  AT_ASSERT(module_debug_info_set.count("top(B).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::mul"));
+  AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A0(A)::forward.aten::mul"));
 }
 
 TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
@@ -585,9 +587,11 @@ TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
     }
   }
 
-  AT_ASSERT(module_debug_info_set.count("top(C).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).A0(A).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
+  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.B0(B)::forward.aten::add"));
 }
 
 TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) {
@@ -854,9 +858,11 @@ TEST(LiteInterpreterTest, SequentialModuleInfo) {
   //   def forward(self, x):
   //     return self.A0.forward(self.B0.forward(x))
 
-  AT_ASSERT(module_debug_info_set.count("top(C).prim::Return"));
-  AT_ASSERT(module_debug_info_set.count("top(C).A0(A).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
+  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.prim::Return"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.B0(B)::forward.aten::add"));
 }
 
 TEST(LiteInterpreterTest, HierarchyModuleInfo) {
@@ -901,9 +907,11 @@ TEST(LiteInterpreterTest, HierarchyModuleInfo) {
   // "top(C).forward": for the add operator in top.
   // "top(C).B0(B).forward": for the add operator in B0.
   // "top(C).B0(B).forward.A0(A).forward": for the add operator in A0.
-  AT_ASSERT(module_debug_info_set.count("top(C).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(C).B0(B).A0(A).aten::add"));
+  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.B0(B)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.B0(B)::forward.A0(A)::forward.aten::add"));
 }
 
 TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) {
@@ -960,9 +968,11 @@ TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) {
   // "top(B).A0(A).forward": for the add operator in A0.
   // "top(B).A1(A).forward": for the add operator in A1.
 
-  AT_ASSERT(module_debug_info_set.count("top(B).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::add"));
-  AT_ASSERT(module_debug_info_set.count("top(B).A1(A).aten::add"));
+  AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A1(A)::forward.aten::add"));
 }
 #endif // !defined(FB_XPLAT_BUILD)
 
@@ -1371,9 +1381,9 @@ TEST(LiteInterpreterTest, TestExceptionStackWithTwoLevelModuleHierarchy) {
   c._save_for_mobile(ss, ExtraFilesMap(), true);
   auto lite_m = _load_for_mobile(ss);
   std::string error_pattern = R"(
-  Module hierarchy:top(C).B0(B).A0(A).aten::add
+  Module hierarchy:top(C)::<unknown>.B0(B)::foo.A0(A)::bar.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.B0.foo(x, y) + 3
index 8ecedd363f401ecc130f5e50db9f23fa209a04b8..82f70fee1dd20d9e60fed829e49b41b1e722d7e6 100644 (file)
@@ -2481,7 +2481,7 @@ TEST(RecordDebugHandles, Basic) {
           torch::autograd::profiler::ProfilerState::KINETO, false, false),
       activities);
   {
-    RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
+    RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
     float x{5.9999}, y{2.1212};
     float z = x / y;
   }
@@ -2533,7 +2533,7 @@ TEST(RecordDebugHandles, ScopedCallbacks) {
       torch::autograd::profiler::ProfilerConfig(
           torch::autograd::profiler::ProfilerState::KINETO, false, false),
       {torch::autograd::profiler::ActivityType::CPU},
-      {at::RecordScope::USER_SCOPE});
+      {at::RecordScope::LITE_INTERPRETER});
   {
     auto a = torch::rand({128, 128});
     auto b = torch::rand({128, 128});
@@ -2550,9 +2550,9 @@ TEST(RecordDebugHandles, ScopedCallbacks) {
       torch::autograd::profiler::ProfilerConfig(
           torch::autograd::profiler::ProfilerState::KINETO, false, false),
       {torch::autograd::profiler::ActivityType::CPU},
-      {at::RecordScope::USER_SCOPE});
+      {at::RecordScope::LITE_INTERPRETER});
   {
-    RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
+    RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
     auto a = torch::rand({128, 128});
     auto b = torch::rand({128, 128});
     auto c = a + b;
@@ -2568,11 +2568,9 @@ TEST(RecordDebugHandles, ScopedCallbacks) {
   for (const auto& e : kineto_events) {
     if (e.name() == "my_function") {
       ASSERT_EQ(e.debugHandle(), 42);
-    } else if (e.name() == "not_my_function") {
-      ASSERT_EQ(e.debugHandle(), -1);
     }
   }
-  ASSERT_TRUE(profiler_results_ptr->events().size() == 2);
+  ASSERT_TRUE(profiler_results_ptr->events().size() == 1);
 }
 
 TEST(IValueKWargsTest, Basic) {
index c68ea8869b4b1a0c356d05363d55dad759f18586..503203d7be080ff2b5771498552d42aeb036466f 100644 (file)
@@ -4,6 +4,7 @@ set(
 set(LITE_INTERPRETER_RUNTIME_TEST_DIR
   ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/main.cpp
   ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp
+  ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
 )
 
 add_library(backend_with_compiler_runtime SHARED
index 2ccf6ee18d3a81439287258cc2a18ad715b98265..1648b1e3d8197f6794440fe3757099b490df72c2 100644 (file)
@@ -142,9 +142,9 @@ TEST(RunTimeTest, DelegateException) {
   inputs.emplace_back(torch::rand({13, 9}));
 
   std::string error_pattern = R"(
-  Module hierarchy:top(C).A0(backend_with_compiler_demoLoweredModule).AA0(AA).aten::add
+  Module hierarchy:top(C)::<unknown>.A0(backend_with_compiler_demoLoweredModule)::forward.AA0(AA)::forward.aten::add
 Traceback of TorchScript (most recent call last):
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.A0.forward(x, y) + self.B0.forward(x)
@@ -157,7 +157,7 @@ Traceback of TorchScript (most recent call last):
                         ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
                   assert isinstance(_0, Tensor)
                   return _0
-  File "<string>", line 3, in FunctionName_UNKNOWN
+  File "<string>", line 3, in <unknown>
 
     def forward(self, x, y):
       return self.AA0.forward(x, y) + 3
diff --git a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
new file mode 100644 (file)
index 0000000..ee59b39
--- /dev/null
@@ -0,0 +1,74 @@
+#include <fstream>
+#include <gtest/gtest.h>
+#include <test/cpp/jit/test_utils.h>
+#include <torch/csrc/jit/api/module.h>
+#include <torch/csrc/jit/frontend/resolver.h>
+#include <torch/csrc/jit/mobile/import.h>
+#include <torch/csrc/jit/mobile/module.h>
+#include <torch/csrc/jit/mobile/profiler_edge.h>
+
+#include <unordered_set>
+
+#ifdef EDGE_PROFILER_USE_KINETO
+namespace torch {
+namespace jit {
+namespace mobile {
+
+namespace {
+bool checkModuleHierarchyForOp(
+    const std::string& op_name,
+    const std::string& module_hier,
+    std::ifstream& trace_file) {
+  std::string line;
+  while (std::getline(trace_file, line)) {
+    if (line.find(op_name) != std::string::npos) {
+      while (std::getline(trace_file, line)) {
+        if (line.find("Module Hierarchy") != std::string::npos) {
+          return (line.find(module_hier) != std::string::npos);
+        }
+      }
+    }
+  }
+  return false;
+}
+} // namespace
+
+TEST(MobileProfiler, ModuleHierarchy) {
+  std::string filePath(__FILE__);
+  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
+  testModelFile.append("to_be_profiled_module.ptl");
+
+  std::vector<IValue> inputs;
+  inputs.emplace_back(at::rand({64, 64}));
+  inputs.emplace_back(at::rand({64, 64}));
+  std::string trace_file_name("/tmp/test_trace.trace");
+
+  mobile::Module bc = _load_for_mobile(testModelFile);
+  {
+    KinetoEdgeCPUProfiler profiler(
+        bc,
+        trace_file_name,
+        false, // record input_shapes
+        false, // profile memory
+        true, // record callstack
+        false, // record flops
+        true); // record module hierarchy
+    bc.forward(inputs);
+  } // End of profiler
+  std::ifstream trace_file(trace_file_name);
+  std::string line;
+  ASSERT_TRUE(trace_file.is_open());
+  trace_file.seekg(0, std::ios_base::beg);
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::sub", "top(C)::<unknown>.A0(A)::forward.aten::sub", trace_file));
+  trace_file.seekg(0, std::ios_base::beg);
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::mul", "top(C)::<unknown>.A0(A)::forward.SELF(A)::forward_impl_.SELF(A)::my_new_method.aten::mul", trace_file));
+  trace_file.seekg(0, std::ios_base::beg);
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::add", "top(C)::<unknown>.A0(A)::forward.SELF(A)::forward_impl_.aten::add", trace_file));
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::add", "top(C)::<unknown>.SELF(C)::call_b.B0(B)::forward.aten::add", trace_file));
+  ASSERT_TRUE(checkModuleHierarchyForOp("aten::add", "top(C)::<unknown>.aten::add", trace_file));
+}
+
+} // namespace mobile
+} // namespace jit
+} // namespace torch
+#endif
diff --git a/test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl b/test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl
new file mode 100644 (file)
index 0000000..243e41e
Binary files /dev/null and b/test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl differ
index e7958ccba3bf328ae68ed0e4ca585d118bac8d9d..bfcf55adc522bdba8944bcce1ff9ad390cd40338 100644 (file)
@@ -119,6 +119,10 @@ libtorch_profiler_sources = [
     "torch/csrc/autograd/profiler_kineto.cpp",
 ]
 
+libtorch_edge_profiler_sources = libtorch_profiler_sources + [
+    "torch/csrc/jit/mobile/profiler_edge.cpp",
+]
+
 core_trainer_sources = [
     "torch/csrc/autograd/anomaly_mode.cpp",
     "torch/csrc/autograd/autograd.cpp",
index 526813d4fbde24dea8a08160e9f7f7a9066f45c7..da1ae6f14823f5f7afc2d2a60f50944c87d3e180 100644 (file)
@@ -172,6 +172,14 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState {
     }
   }
 
+  const std::function<void(std::vector<KinetoEvent>&)>& getEventPostProcessingCallback() const {
+    return event_post_process_cb_;
+  }
+
+  void setEventPostProcessingCallback(std::function<void(std::vector<KinetoEvent>&)>&& cb) {
+    event_post_process_cb_ = std::move(cb);
+  }
+
 #ifdef USE_KINETO
   c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
     // fallthrough
@@ -258,6 +266,8 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState {
 #endif // USE_KINETO
   uint64_t start_time_;
   std::vector<KinetoEvent> kineto_events_;
+  // Optional, if event post-processing is enabled.
+  std::function<void(std::vector<KinetoEvent>&)> event_post_process_cb_;
 };
 
 std::vector<std::string> inputTypes(const at::RecordFunction& fn) {
@@ -496,6 +506,16 @@ void prepareProfiler(
 #endif // USE_KINETO
 }
 
+void enableProfilerWithEventPostProcess(
+    const ProfilerConfig& config,
+    const std::set<ActivityType>& activities,
+    std::function<void(std::vector<KinetoEvent>&)>&& cb,
+    const std::unordered_set<at::RecordScope>& scopes) {
+  enableProfiler(config, activities, scopes);
+  auto state_ptr = getProfilerTLSState();
+  state_ptr->setEventPostProcessingCallback(std::move(cb));
+}
+
 void enableProfiler(
     const ProfilerConfig& config,
     const std::set<ActivityType>& activities,
@@ -548,6 +568,11 @@ std::unique_ptr<ProfilerResult> disableProfiler() {
 
 #ifdef USE_KINETO
   state_ptr->cpu_trace->span.endTime = getTimeUs();
+
+  // Call the event post-processing callback, if any, before finalizing the trace.
+  if (state_ptr->getEventPostProcessingCallback()) {
+    state_ptr->getEventPostProcessingCallback()(state_ptr->kineto_events_);
+  }
   state_ptr->finalizeCPUTrace();
   libkineto::api().activityProfiler().transferCpuTrace(std::move(state_ptr->cpu_trace));
 
index 310554a7ab642b19f86fec8342dafc0c03bc6d9b..21bc318ed808a003732aa68a46b1f68b2ec99a12 100644 (file)
@@ -5,7 +5,11 @@
 
 #ifdef USE_KINETO
 // skip Kineto dependency on mobile
-#ifdef C10_MOBILE
+// unless explicitly requested.
+// When is it explicitly requested?
+// KinetoEdgeCPUProfiler uses the Kineto profiler for CPU
+// event profiling, which depends on a CPU-only build of libkineto.
+#if defined(C10_MOBILE) && !defined(EDGE_PROFILER_USE_KINETO)
 #undef USE_KINETO
 #endif
 #endif
@@ -334,6 +338,28 @@ TORCH_API void enableProfiler(
     const std::set<ActivityType>& activities,
     const std::unordered_set<at::RecordScope>& scopes = {});
 
+/*
+ * Same as enableProfiler but with a callback for post-processing of
+ * KinetoEvents.
+ * enableProfilerWithEventPostProcess enables the profiler to capture the
+ * specified activities, with the specified RecordFunction scopes, if any.
+ * Additionally, it takes a functor that does in-place post-processing of
+ * events, e.g. populating stack trace or module hierarchy information
+ * lazily using debug_handle.
+ * An example use is the lite interpreter, which has a recording scope of
+ * LITE_INTERPRETER. The lite interpreter runtime records debug handles in
+ * RecordFunction, along with other information. Debug handles are
+ * eventually passed down to KinetoEvent and recorded as part of the event.
+ * KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp,
+ * enables the profiler via enableProfilerWithEventPostProcess with a
+ * post-processing callback that takes these debug handles and generates
+ * stack trace and module hierarchy information once profiling is done.
+ */
+TORCH_API void enableProfilerWithEventPostProcess(
+    const ProfilerConfig& config,
+    const std::set<ActivityType>& activities,
+    std::function<void(std::vector<KinetoEvent>&)>&& cb,
+    const std::unordered_set<at::RecordScope>& scopes = {});
+
 TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();
 
 TORCH_API void prepareProfiler(
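
A minimal sketch of calling the new entry point directly, patterned on how
KinetoEdgeCPUProfiler uses it in torch/csrc/jit/mobile/profiler_edge.cpp
(the enclosing function is hypothetical):

    #include <torch/csrc/autograd/profiler_kineto.h>

    namespace profiler = torch::autograd::profiler;

    void enable_cpu_profiling_with_post_process() {
      profiler::ProfilerConfig config(
          profiler::ProfilerState::KINETO,
          /*report_input_shapes=*/false,
          /*profile_memory=*/false);
      profiler::prepareProfiler(config, {profiler::ActivityType::CPU});
      profiler::enableProfilerWithEventPostProcess(
          config,
          {profiler::ActivityType::CPU},
          // In-place post-processing of events, invoked before the trace
          // is finalized in disableProfiler().
          [](std::vector<profiler::KinetoEvent>& events) {
            for (auto& e : events) {
              // e.g. symbolicate e.debugHandle() into a stack trace or
              // module hierarchy string here.
              (void)e;
            }
          },
          {at::RecordScope::LITE_INTERPRETER});
    }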
index d0d8ada3fdb04b6a770eb89a4e1fb0d557e18b55..9c734f40a25a2b3f3c41ab2f336f58da00d07f22 100644 (file)
@@ -44,11 +44,9 @@ std::pair<std::vector<StackEntry>, std::string> getStackTraceWithModuleHierarchy
           module_info.append(".").append(module_instance_info.instance_name());
         }
       } else {
-        module_info += ".UNKNOWN_INSTANCE(UNKNOWN_TYPE)";
+        module_info.append(".UNKNOWN_INSTANCE(UNKNOWN_TYPE)");
       }
       // Now add source range info to stack
-      // When we serialize function names, those can be added here.
-      // TODO: Add function name separately
       entries.emplace_back(
           StackEntry{prev_function_name, callstack_ptr->source_range()});
       if (callstack_ptr->function()) {
@@ -56,6 +54,11 @@ std::pair<std::vector<StackEntry>, std::string> getStackTraceWithModuleHierarchy
       } else {
         prev_function_name = callstack_ptr->function_name();
       }
+      // The function name is appended here. It is stored in
+      // prev_function_name because, for StackEntry, it is appended in the
+      // next iteration; that is the format in which format_stack_trace
+      // expects function names.
+      module_info.append("::").append(prev_function_name);
 
       if (callstack_ptr->callee()) {
         callstack_ptr = callstack_ptr->callee().value();
@@ -82,20 +85,21 @@ std::pair<std::string, std::string> getStackTraceWithModuleHierarchy(
   std::vector<StackEntry> stack_entries;
   std::string module_info =
       root_scope_string + "(" + top_module_type_name + ")";
-  std::string caller_fn_name = "FunctionName_UNKNOWN";
+  std::string caller_fn_name = "<unknown>";
+  module_info.append("::").append(caller_fn_name);
   for (const auto& debug_info : source_callstacks) {
     auto debug_info_pair =
         getStackTraceWithModuleHierarchy(debug_info, caller_fn_name);
     auto entries = std::move(debug_info_pair.first);
     stack_entries.insert(stack_entries.end(), entries.begin(), entries.end());
-    module_info += debug_info_pair.second;
+    module_info.append(debug_info_pair.second);
   }
   // Only last entry in the callstack will have a node name of interest.
   // Rest are likely CallMethod/CallFunction nodes
   auto last_entry = source_callstacks.back();
   const std::string& node_name =
       std::get<kDebugInfoTupleNodeNameIndex>(last_entry);
-  module_info += "." + node_name;
+  module_info.append(".").append(node_name);
   std::ostringstream ss;
   ss << "Module hierarchy:" << module_info << "\n";
   format_stack_trace(ss, stack_entries);
index d82d84e7bfe2e46acb32f920f183db35d2ccfd6c..275b84beba97ba027a550b2fe5e7f0f95ce11ce8 100644 (file)
@@ -54,7 +54,9 @@ bool InterpreterState::run(Stack& stack) {
   size_t pc = 0;
   while (true) {
     try {
-      Instruction inst = code_->instructions_with_handles_[pc].instruction;
+      auto inst_with_handle = code_->instructions_with_handles_.at(pc);
+      Instruction inst = inst_with_handle.instruction;
+      DebugHandle debug_handle = inst_with_handle.debug_handle;
 
       //    std::cout << "RUNNING " << pc << " " << code_->instructions_[pc];
       //    if (inst.op == OP) {
@@ -64,6 +66,17 @@ bool InterpreterState::run(Stack& stack) {
       //      }
       //    }
       //    std::cout << std::endl;
+
+      // TODO(iliacher): remove the workaround after RecordFunction is in
+      // the Dispatcher. Check with iliacher whether this has been done.
+      // Note that this is not exception-safe: if an exception is thrown,
+      // RecordFunction is left enabled. Fixing that is also a TODO.
+      bool prev_value = isRecordFunctionEnabled();
+      if (!prev_value) {
+        // enable only for the RecordFunction
+        enableRecordFunction(true);
+      }
       switch (inst.op) {
         case OP: {
           if (at::hasGlobalCallbacks()) {
@@ -74,22 +87,15 @@ bool InterpreterState::run(Stack& stack) {
             }
           }
 
-          // TODO(iliacher): remove the workaround after RecordFunction is in
-          // Dispatcher
-          bool prev_value = isRecordFunctionEnabled();
-          if (!prev_value) {
-            // enable only for the RecordFunction
-            enableRecordFunction(true);
-          }
-          RECORD_USER_SCOPE_WITH_INPUTS(code_->op_names_[inst.X].name, stack);
-          if (!prev_value) {
-            enableRecordFunction(false);
-          }
+          RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+              code_->op_names_[inst.X].name, debug_handle, stack);
           code_->operators_[inst.X](stack);
           ++pc;
         } break;
         case OPN: {
           stack.push_back(inst.N);
+          RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+              code_->op_names_[inst.X].name, debug_handle, stack);
           code_->operators_[inst.X](stack);
           ++pc;
         } break;
@@ -99,6 +105,8 @@ bool InterpreterState::run(Stack& stack) {
                   .toObject()
                   ->type()
                   ->getMethod(code_->constants_[inst.X].toStringRef());
+          RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+              method.name(), debug_handle, stack);
           method.run(stack);
           ++pc;
         } break;
@@ -232,6 +240,10 @@ bool InterpreterState::run(Stack& stack) {
         default:
           AT_ERROR(toString(inst.op), " is invalid.");
       }
+
+      if (!prev_value) {
+        enableRecordFunction(false);
+      }
       // This exception must be caught first as it derived from c10::Error
     } catch (c10::BackendRuntimeException& e) {
       exception_pc_ = pc;
index fad6447679bcd7aa688dffc1e133a73ad621f5c9..c04d9f74b7378177a0864b0b3e46fc9306dd0571 100644 (file)
@@ -122,6 +122,24 @@ const std::map<std::string, at::Tensor> Module::named_parameters() const {
   return params;
 }
 
+std::string Module::getModuleHierarchy(const int64_t debug_handle) const {
+#if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
+  return getDebugTable().getModuleHierarchyInfo(
+      debug_handle, getTopModuleTypeName(*this));
+#else
+  return "";
+#endif
+}
+
+std::string Module::getCallStack(const int64_t debug_handle) const {
+#if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
+  return getDebugTable().getSourceDebugString(
+      debug_handle, getTopModuleTypeName(*this));
+#else
+  return "";
+#endif
+}
+
 // We will continue to support this API for now as this is being relied upon
 // for profiling.
 // We really need to change this part, so in the next step for profiling support
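
A small sketch of how the new accessors can be used (the caller is
hypothetical; both accessors return empty strings when
SYMBOLICATE_MOBILE_DEBUG_HANDLE is not defined):

    #include <torch/csrc/jit/mobile/module.h>

    void symbolicate_handle(const torch::jit::mobile::Module& m,
                            int64_t debug_handle) {
      // e.g. "top(C)::<unknown>.A0(A)::forward.aten::add"
      std::string hierarchy = m.getModuleHierarchy(debug_handle);
      // Source-level stack trace for the same handle.
      std::string stack = m.getCallStack(debug_handle);
    }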
index 8a37b8badf51c1e8577339faa00ed8ba8460c9c1..73637aa4584a0b843c17865ff6b3bc967dff21d7 100644 (file)
@@ -79,6 +79,8 @@ class TORCH_API Module {
   const std::vector<at::Tensor> parameters() const;
   const std::map<std::string, at::Tensor> named_parameters() const;
   std::string get_forward_method_debug_info(size_t pc) const;
+  std::string getModuleHierarchy(const int64_t debug_handle) const;
+  std::string getCallStack(const int64_t debug_handle) const;
   /// Enables "training" mode.
   void train(bool on = true);
   /// Calls train(false) to enable "eval" mode.
diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp
new file mode 100644 (file)
index 0000000..bcd5a62
--- /dev/null
@@ -0,0 +1,63 @@
+#include <torch/csrc/jit/mobile/profiler_edge.h>
+#include <string>
+#include <vector>
+
+namespace profiler = torch::autograd::profiler;
+namespace torch {
+namespace jit {
+namespace mobile {
+
+KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
+    const torch::jit::mobile::Module& m,
+    const std::string& fname,
+    const bool report_input_shapes,
+    const bool profile_memory,
+    const bool with_stack,
+    const bool with_flops,
+    const bool with_modules)
+    : m_(m), trace_file_name_(fname) {
+  profiler::ProfilerConfig config(
+      profiler::ProfilerState::KINETO,
+      report_input_shapes,
+      profile_memory,
+      with_stack,
+      with_flops,
+      with_modules);
+  profiler::prepareProfiler(config, {profiler::ActivityType::CPU});
+  if (with_modules || with_stack) {
+    auto post_processing = [this, with_stack, with_modules](
+                               std::vector<profiler::KinetoEvent>& events) {
+      for (auto& e : events) {
+        if (with_modules) {
+          // Since KinetoEvent's module hierarchy takes a vector of strings,
+          // we construct a temporary vector with a single string element.
+          e.moduleHierarchy(std::vector<std::string>(
+              {this->m_.getModuleHierarchy(e.debugHandle())}));
+        } else if (with_stack) {
+          // Since KinetoEvent's stack trace takes a vector of strings, we
+          // construct a temporary vector with a single string element.
+          e.stack(std::vector<std::string>(
+              {this->m_.getCallStack(e.debugHandle())}));
+        }
+      }
+    };
+    profiler::enableProfilerWithEventPostProcess(
+        config,
+        {profiler::ActivityType::CPU},
+        post_processing,
+        {at::RecordScope::LITE_INTERPRETER});
+  } else {
+    profiler::enableProfiler(
+        config,
+        {profiler::ActivityType::CPU},
+        {at::RecordScope::LITE_INTERPRETER});
+  }
+}
+
+KinetoEdgeCPUProfiler::~KinetoEdgeCPUProfiler() {
+  profiler::disableProfiler()->save(trace_file_name_);
+}
+} // namespace mobile
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h
new file mode 100644 (file)
index 0000000..a245034
--- /dev/null
@@ -0,0 +1,68 @@
+#pragma once
+#include <torch/csrc/autograd/profiler_kineto.h>
+#include <torch/csrc/jit/mobile/module.h>
+
+namespace torch {
+namespace jit {
+namespace mobile {
+class TORCH_API KinetoEdgeCPUProfiler {
+ public:
+  // This profiler only profiles KINETO events
+  // No GPU_FALLBACK or NVTX
+  /*
+   * @param m is the instance of mobile Module which is being profiled.
+   *        Note that this implies that KinetoEdgeCPUProfiler can be used
+   *        to profile a specific Module (see usage below), unlike the
+   *        Kineto profiler, which can profile the pytorch runtime in an
+   *        arbitrary scope.
+   * @param fname is the name of the file to which the chrome trace is
+   *        written.
+   * @param report_input_shapes: whether to record shapes of the op's inputs.
+   * @param profile_memory: whether to profile memory usage.
+   * @param with_stack: whether to record the model's python stacktrace
+   *        for the op.
+   * @param with_flops: whether to report flops corresponding to the op.
+   * @param with_modules: whether to report the original python module
+   *        hierarchy to which the op belongs.
+   *
+   * The usage pattern for this profiler must be as follows:
+   *
+   * {
+   *   KinetoEdgeCPUProfiler profiler(m, filename, args);
+   *   m.forward(...);
+   * }
+   *
+   * The reason is that KinetoEdgeCPUProfiler has a dependency on the
+   * Module and thus must not outlive it.
+   *
+   * KinetoEdgeCPUProfiler is therefore used as an RAII guard to do
+   * profiling within a certain scope. In that scope, the captured
+   * reference to the Module will outlive KinetoEdgeCPUProfiler. This is
+   * guaranteed because KinetoEdgeCPUProfiler must be constructed later
+   * than the Module, on the stack.
+   *
+   * An example of the anti-pattern and wrong usage is:
+   *
+   * auto profiler = std::make_shared<KinetoEdgeCPUProfiler>(m, filename, args);
+   * m.forward(...);
+   *
+   * since the KinetoEdgeCPUProfiler object would then be constructed on
+   * the heap, with its lifetime managed manually or via smart pointers.
+   */
+  KinetoEdgeCPUProfiler(
+      const torch::jit::mobile::Module& m,
+      const std::string& fname,
+      const bool report_input_shapes = false,
+      const bool profile_memory = false,
+      const bool with_stack = false,
+      const bool with_flops = false,
+      const bool with_modules = false);
+
+  ~KinetoEdgeCPUProfiler();
+
+ private:
+  /*
+   * We store a reference to the Module to make this dependency explicit,
+   * since a Module reference is already captured in the post-processing
+   * functor.
+   */
+  const mobile::Module& m_;
+  std::string trace_file_name_;
+};
+} // namespace mobile
+} // namespace jit
+} // namespace torch