if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
build_args+=("-DUSE_VULKAN=ON")
fi
+ build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
exec ./scripts/build_android.sh "${build_args[@]}" "$@"
fi
endif()
option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON)
+option(USE_LITE_INTERPRETER_PROFILER "Enable lite interpreter profiler." ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation as opposed to build-time (needs libshaderc)" OFF)
string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
endif()
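+# Define EDGE_PROFILER_USE_KINETO to compile in the lite interpreter profiler
+# code paths (see torch/csrc/jit/mobile/profiler_edge.cpp and the mobile
+# guard in torch/csrc/autograd/profiler_kineto.h).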
+if(USE_LITE_INTERPRETER_PROFILER)
+ string(APPEND CMAKE_CXX_FLAGS " -DEDGE_PROFILER_USE_KINETO")
+endif()
+
# ---[ Allowlist file if allowlist is specified
include(cmake/Allowlist.cmake)
}
parse_abis_list() {
- ABIS_LIST="armeabi-v7a,arm64-v8a,x86,x86_64"
+ ABIS_LIST="x86"
CUSTOM_ABIS_LIST=false
if [ $# -gt 0 ]; then
ABIS_LIST=$1
ANDROID_ABI="$abi" \
BUILD_ROOT="$ANDROID_BUILD_ROOT" \
"$PYTORCH_DIR/scripts/build_android.sh" \
- -DANDROID_CCACHE="$(which ccache)"
+ -DANDROID_CCACHE="$(which ccache)" \
+ -DUSE_LITE_INTERPRETER_PROFILER="OFF"
echo "$abi build output lib,include at $ANDROID_BUILD_ROOT/install"
ln -s "$ANDROID_BUILD_ROOT/install/lib" "$LIB_DIR/$abi"
externalNativeBuild {
cmake {
if(System.env.BUILD_LITE_INTERPRETER == '0') {
- arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=OFF"
+ arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=OFF", "-DUSE_LITE_INTERPRETER_PROFILER=OFF"
} else {
- arguments "-DANDROID_STL=c++_shared"
+ arguments "-DANDROID_STL=c++_shared", "-DUSE_LITE_INTERPRETER_PROFILER=OFF"
}
}
}
TORCHSCRIPT_FUNCTION,
// Kernel Function dtype Tag
KERNEL_FUNCTION_DTYPE,
+  // Lite interpreter runtime scope
+ LITE_INTERPRETER,
// User defined scope (e.g. with record_function())
USER_SCOPE,
NUM_SCOPES, // must be the last in the list
} \
}
-// Helper macros to record user_scope events with debug handles
-#define RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \
- fn, debug_handle, inputs) \
- RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \
- at::RecordScope::USER_SCOPE, fn, debug_handle, inputs)
+// Helper macros to record LITE INTERPRETER scope events with debug handles
+#define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \
+ fn, debug_handle, inputs) \
+ RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \
+ at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs)
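+// A minimal usage sketch, mirroring the lite interpreter's instruction loop
+// (see torch/csrc/jit/mobile/interpreter.cpp):
+//   RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+//       code_->op_names_[inst.X].name, debug_handle, stack);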
// Notes:
// - two types of callbacks are provided: thread local and global
endif()
endif()
+ list(APPEND LITE_PROFILER_SRCS "")
+ if(USE_LITE_INTERPRETER_PROFILER)
+  append_filelist("libtorch_edge_profiler_sources" LITE_PROFILER_SRCS)
+ endif()
+
# Switch between the full jit interpreter and lite interpreter
if(BUILD_LITE_INTERPRETER)
append_filelist("libtorch_lite_cmake_sources" LIBTORCH_CMAKE_SRCS)
list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS})
+ list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_PROFILER_SRCS})
+ set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
else()
append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS)
# --[ ATen checks
set(USE_LAPACK 0)
+# We need to build all targets with PIC so they can be linked together.
+if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER)
+ set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+endif()
+
if(NOT INTERN_BUILD_MOBILE)
set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
set(TORCH_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE)
# ---[ Kineto
-if(USE_KINETO AND INTERN_BUILD_MOBILE)
+# The edge profiler depends on the Kineto profiler, but it only does CPU
+# profiling, so we don't need USE_CUDA/USE_ROCM.
+if(USE_KINETO AND INTERN_BUILD_MOBILE AND NOT (BUILD_LITE_INTERPRETER AND USE_LITE_INTERPRETER_PROFILER))
message(STATUS "Not using libkineto in a mobile build.")
set(USE_KINETO OFF)
endif()
+if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER AND (USE_CUDA OR USE_ROCM))
+ message(FATAL_ERROR "Mobile build with profiler does not support CUDA or ROCM")
+endif()
+
if(USE_KINETO)
if((NOT USE_CUDA) OR MSVC)
set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE)
if(NOT TARGET kineto)
add_subdirectory("${KINETO_SOURCE_DIR}")
+ set_property(TARGET kineto PROPERTY POSITION_INDEPENDENT_CODE ON)
endif()
list(APPEND Caffe2_DEPENDENCY_LIBS kineto)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_KINETO")
else
CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=ON")
fi
+CMAKE_ARGS+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
# Don't build binaries or tests (only the library)
CMAKE_ARGS+=("-DBUILD_TEST=OFF")
lm._save_for_mobile(ss, ExtraFilesMap(), true);
auto mlm = _load_for_mobile(ss);
std::string error_pattern = R"(
- Module hierarchy:top(m).aten::add
+ Module hierarchy:top(m)::<unknown>.aten::add
Traceback of TorchScript (most recent call last):
- File "<string>", line 5, in FunctionName_UNKNOWN
+ File "<string>", line 5, in <unknown>
typed_inputs: List[Any] = [x, h, ]
if self.__backend.is_available() :
_0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, h):
return x + h
lm._save_for_mobile(ss, ExtraFilesMap(), true);
auto mlm = _load_for_mobile(ss);
std::string error_pattern = R"(
- Module hierarchy:top(C).A0(A).aten::add
+ Module hierarchy:top(C)::<unknown>.A0(A)::forward.aten::add
Traceback of TorchScript (most recent call last):
- File "<string>", line 5, in FunctionName_UNKNOWN
+ File "<string>", line 5, in <unknown>
typed_inputs: List[Any] = [x, y, ]
if self.__backend.is_available() :
_0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.A0.forward(x, y) + self.B0.forward(x)
*
*/
std::string error_pattern = R"(
- Module hierarchy:top(C).B0(B).A0(A).aten::add
+ Module hierarchy:top(C)::<unknown>.B0(B)::forward.A0(A)::forward.aten::add
Traceback of TorchScript (most recent call last):
- File "<string>", line 5, in FunctionName_UNKNOWN
+ File "<string>", line 5, in <unknown>
typed_inputs: List[Any] = [x, y, ]
if self.__backend.is_available() :
_0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.B0.forward(x, y) + 3
c._save_for_mobile(ss, ExtraFilesMap(), true);
auto c_loaded = _load_for_mobile(ss);
std::string error_pattern = R"(
- Module hierarchy:top(C).A0(A).aten::add
+ Module hierarchy:top(C)::<unknown>.A0(A)::forward.aten::add
Traceback of TorchScript (most recent call last):
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.A0.forward(x, y) + self.B0.forward(x)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, y):
return x + y
*
* */
std::string error_pattern = R"(
- Module hierarchy:top(C).A0(A).AA0(AA).aten::add
+ Module hierarchy:top(C)::<unknown>.A0(A)::forward.AA0(AA)::forward.aten::add
Traceback of TorchScript (most recent call last):
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.A0.forward(x, y) + self.B0.forward(x)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.AA0.forward(x, y) + 3
}
}
- AT_ASSERT(module_debug_info_set.count("top(M).aten::mul"));
+ AT_ASSERT(module_debug_info_set.count("top(M)::<unknown>.aten::mul"));
}
TEST(LiteInterpreterTest, NotSaveModuleInfo) {
}
}
- AT_ASSERT(module_debug_info_set.count("top(B).aten::add"));
- AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::add"));
- AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::mul"));
+ AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(B)::<unknown>.A0(A)::forward.aten::add"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(B)::<unknown>.A0(A)::forward.aten::mul"));
}
TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
}
}
- AT_ASSERT(module_debug_info_set.count("top(C).aten::add"));
- AT_ASSERT(module_debug_info_set.count("top(C).A0(A).aten::add"));
- AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
+ AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(C)::<unknown>.A0(A)::forward.aten::add"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(C)::<unknown>.B0(B)::forward.aten::add"));
}
TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) {
// def forward(self, x):
// return self.A0.forward(self.B0.forward(x))
- AT_ASSERT(module_debug_info_set.count("top(C).prim::Return"));
- AT_ASSERT(module_debug_info_set.count("top(C).A0(A).aten::add"));
- AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
+ AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.prim::Return"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(C)::<unknown>.A0(A)::forward.aten::add"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(C)::<unknown>.B0(B)::forward.aten::add"));
}
TEST(LiteInterpreterTest, HierarchyModuleInfo) {
// "top(C).forward": for the add operator in top.
// "top(C).B0(B).forward": for the add operator in B0.
// "top(C).B0(B).forward.A0(A).forward": for the add operator in A0.
- AT_ASSERT(module_debug_info_set.count("top(C).aten::add"));
- AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
- AT_ASSERT(module_debug_info_set.count("top(C).B0(B).A0(A).aten::add"));
+ AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(C)::<unknown>.B0(B)::forward.aten::add"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(C)::<unknown>.B0(B)::forward.A0(A)::forward.aten::add"));
}
TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) {
// "top(B).A0(A).forward": for the add operator in A0.
// "top(B).A1(A).forward": for the add operator in A1.
- AT_ASSERT(module_debug_info_set.count("top(B).aten::add"));
- AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::add"));
- AT_ASSERT(module_debug_info_set.count("top(B).A1(A).aten::add"));
+ AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(B)::<unknown>.A0(A)::forward.aten::add"));
+ AT_ASSERT(module_debug_info_set.count(
+ "top(B)::<unknown>.A1(A)::forward.aten::add"));
}
#endif // !defined(FB_XPLAT_BUILD)
c._save_for_mobile(ss, ExtraFilesMap(), true);
auto lite_m = _load_for_mobile(ss);
std::string error_pattern = R"(
- Module hierarchy:top(C).B0(B).A0(A).aten::add
+ Module hierarchy:top(C)::<unknown>.B0(B)::foo.A0(A)::bar.aten::add
Traceback of TorchScript (most recent call last):
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.B0.foo(x, y) + 3
torch::autograd::profiler::ProfilerState::KINETO, false, false),
activities);
{
- RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
+ RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
float x{5.9999}, y{2.1212};
float z = x / y;
}
torch::autograd::profiler::ProfilerConfig(
torch::autograd::profiler::ProfilerState::KINETO, false, false),
{torch::autograd::profiler::ActivityType::CPU},
- {at::RecordScope::USER_SCOPE});
+ {at::RecordScope::LITE_INTERPRETER});
{
auto a = torch::rand({128, 128});
auto b = torch::rand({128, 128});
torch::autograd::profiler::ProfilerConfig(
torch::autograd::profiler::ProfilerState::KINETO, false, false),
{torch::autograd::profiler::ActivityType::CPU},
- {at::RecordScope::USER_SCOPE});
+ {at::RecordScope::LITE_INTERPRETER});
{
- RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
+ RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
auto a = torch::rand({128, 128});
auto b = torch::rand({128, 128});
auto c = a + b;
for (const auto& e : kineto_events) {
if (e.name() == "my_function") {
ASSERT_EQ(e.debugHandle(), 42);
- } else if (e.name() == "not_my_function") {
- ASSERT_EQ(e.debugHandle(), -1);
}
}
- ASSERT_TRUE(profiler_results_ptr->events().size() == 2);
+ ASSERT_TRUE(profiler_results_ptr->events().size() == 1);
}
TEST(IValueKWargsTest, Basic) {
set(LITE_INTERPRETER_RUNTIME_TEST_DIR
${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/main.cpp
${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp
+ ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
)
add_library(backend_with_compiler_runtime SHARED
inputs.emplace_back(torch::rand({13, 9}));
std::string error_pattern = R"(
- Module hierarchy:top(C).A0(backend_with_compiler_demoLoweredModule).AA0(AA).aten::add
+ Module hierarchy:top(C)::<unknown>.A0(backend_with_compiler_demoLoweredModule)::forward.AA0(AA)::forward.aten::add
Traceback of TorchScript (most recent call last):
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.A0.forward(x, y) + self.B0.forward(x)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
- File "<string>", line 3, in FunctionName_UNKNOWN
+ File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.AA0.forward(x, y) + 3
--- /dev/null
+#include <fstream>
+#include <gtest/gtest.h>
+#include <test/cpp/jit/test_utils.h>
+#include <torch/csrc/jit/api/module.h>
+#include <torch/csrc/jit/frontend/resolver.h>
+#include <torch/csrc/jit/mobile/import.h>
+#include <torch/csrc/jit/mobile/module.h>
+#include <torch/csrc/jit/mobile/profiler_edge.h>
+
+#include <unordered_set>
+
+#ifdef EDGE_PROFILER_USE_KINETO
+namespace torch {
+namespace jit {
+namespace mobile {
+
+namespace {
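+// Scans the trace file for the line containing op_name, then checks that the
+// subsequent "Module Hierarchy" line contains module_hier.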
+bool checkModuleHierarchyForOp(
+ const std::string& op_name,
+ const std::string& module_hier,
+ std::ifstream& trace_file) {
+ std::string line;
+  while (std::getline(trace_file, line)) {
+    if (line.find(op_name) != std::string::npos) {
+      while (std::getline(trace_file, line)) {
+ if (line.find("Module Hierarchy") != std::string::npos) {
+ return (line.find(module_hier) != std::string::npos);
+ }
+ }
+ }
+ }
+ return false;
+}
+} // namespace
+
+TEST(MobileProfiler, ModuleHierarchy) {
+ std::string filePath(__FILE__);
+ auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
+ testModelFile.append("to_be_profiled_module.ptl");
+
+ std::vector<IValue> inputs;
+ inputs.emplace_back(at::rand({64, 64}));
+ inputs.emplace_back(at::rand({64, 64}));
+ std::string trace_file_name("/tmp/test_trace.trace");
+
+ mobile::Module bc = _load_for_mobile(testModelFile);
+ {
+ KinetoEdgeCPUProfiler profiler(
+ bc,
+ trace_file_name,
+ false, // record input_shapes
+ false, // profile memory
+ true, // record callstack
+ false, // record flops
+ true); // record module hierarchy
+ bc.forward(inputs);
+ } // End of profiler
+ std::ifstream trace_file(trace_file_name);
+ std::string line;
+ ASSERT_TRUE(trace_file.is_open());
+ trace_file.seekg(0, std::ios_base::beg);
+ ASSERT_TRUE(checkModuleHierarchyForOp("aten::sub", "top(C)::<unknown>.A0(A)::forward.aten::sub", trace_file));
+ trace_file.seekg(0, std::ios_base::beg);
+ ASSERT_TRUE(checkModuleHierarchyForOp("aten::mul", "top(C)::<unknown>.A0(A)::forward.SELF(A)::forward_impl_.SELF(A)::my_new_method.aten::mul", trace_file));
+ trace_file.seekg(0, std::ios_base::beg);
+ ASSERT_TRUE(checkModuleHierarchyForOp("aten::add", "top(C)::<unknown>.A0(A)::forward.SELF(A)::forward_impl_.aten::add", trace_file));
+ ASSERT_TRUE(checkModuleHierarchyForOp("aten::add", "top(C)::<unknown>.SELF(C)::call_b.B0(B)::forward.aten::add", trace_file));
+ ASSERT_TRUE(checkModuleHierarchyForOp("aten::add", "top(C)::<unknown>.aten::add", trace_file));
+}
+
+} // namespace mobile
+} // namespace jit
+} // namespace torch
+#endif
"torch/csrc/autograd/profiler_kineto.cpp",
]
+libtorch_edge_profiler_sources = libtorch_profiler_sources + [
+ "torch/csrc/jit/mobile/profiler_edge.cpp",
+]
+
core_trainer_sources = [
"torch/csrc/autograd/anomaly_mode.cpp",
"torch/csrc/autograd/autograd.cpp",
}
}
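+  // Optional callback for post-processing the collected KinetoEvents; when
+  // set, it is invoked on the events right before the CPU trace is finalized.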
+ const std::function<void(std::vector<KinetoEvent>&)>& getEventPostProcessingCallback() const {
+ return event_post_process_cb_;
+ }
+
+ void setEventPostProcessingCallback(std::function<void(std::vector<KinetoEvent>&)>&& cb) {
+ event_post_process_cb_ = std::move(cb);
+ }
+
#ifdef USE_KINETO
c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
// fallthrough
#endif // USE_KINETO
uint64_t start_time_;
std::vector<KinetoEvent> kineto_events_;
+  // Optional; only set when event post-processing is enabled.
+ std::function<void(std::vector<KinetoEvent>&)> event_post_process_cb_;
};
std::vector<std::string> inputTypes(const at::RecordFunction& fn) {
#endif // USE_KINETO
}
+void enableProfilerWithEventPostProcess(
+ const ProfilerConfig& config,
+ const std::set<ActivityType>& activities,
+ std::function<void(std::vector<KinetoEvent>&)>&& cb,
+ const std::unordered_set<at::RecordScope>& scopes) {
+ enableProfiler(config, activities, scopes);
+ auto state_ptr = getProfilerTLSState();
+ state_ptr->setEventPostProcessingCallback(std::move(cb));
+}
+
void enableProfiler(
const ProfilerConfig& config,
const std::set<ActivityType>& activities,
#ifdef USE_KINETO
state_ptr->cpu_trace->span.endTime = getTimeUs();
+
+  // Call the event post-processing callback, if any, before finalizing the trace.
+ if (state_ptr->getEventPostProcessingCallback()) {
+ state_ptr->getEventPostProcessingCallback()(state_ptr->kineto_events_);
+ }
state_ptr->finalizeCPUTrace();
libkineto::api().activityProfiler().transferCpuTrace(std::move(state_ptr->cpu_trace));
#ifdef USE_KINETO
// skip Kineto dependency on mobile
-#ifdef C10_MOBILE
+// unless it is explicitly asked for:
+// KinetoEdgeCPUProfiler uses the Kineto profiler for CPU event profiling,
+// which depends only on the CPU-only build of libkineto.
+#if defined(C10_MOBILE) && !defined(EDGE_PROFILER_USE_KINETO)
#undef USE_KINETO
#endif
#endif
const std::set<ActivityType>& activities,
const std::unordered_set<at::RecordScope>& scopes = {});
+/*
+ * Same as enableProfiler, but takes a callback for post-processing of
+ * KinetoEvents.
+ * enableProfilerWithEventPostProcess enables the profiler to capture the
+ * specified activities, restricted to the specified RecordFunction scopes,
+ * if any. In addition, it takes a functor that post-processes events in
+ * place, e.g. to lazily populate stack trace or module hierarchy information
+ * using each event's debug_handle.
+ * An example use is the lite interpreter, which records under the
+ * LITE_INTERPRETER scope. The lite interpreter runtime records debug handles
+ * in RecordFunction, along with other information; these handles are
+ * eventually passed down to KinetoEvent and recorded as part of the event.
+ * KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables
+ * the profiler via enableProfilerWithEventPostProcess with a post-processing
+ * callback that uses these debug handles to generate stack trace and module
+ * hierarchy information once profiling is done.
+ */
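+// A minimal call sketch (mirroring the use in KinetoEdgeCPUProfiler,
+// torch/csrc/jit/mobile/profiler_edge.cpp):
+//   enableProfilerWithEventPostProcess(
+//       config,
+//       {ActivityType::CPU},
+//       post_processing,  // edits KinetoEvents in place, e.g. via debugHandle()
+//       {at::RecordScope::LITE_INTERPRETER});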
+TORCH_API void enableProfilerWithEventPostProcess(
+ const ProfilerConfig& config,
+ const std::set<ActivityType>& activities,
+ std::function<void(std::vector<KinetoEvent>&)>&& cb,
+ const std::unordered_set<at::RecordScope>& scopes = {});
+
TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();
TORCH_API void prepareProfiler(
module_info.append(".").append(module_instance_info.instance_name());
}
} else {
- module_info += ".UNKNOWN_INSTANCE(UNKNOWN_TYPE)";
+ module_info.append(".UNKNOWN_INSTANCE(UNKNOWN_TYPE)");
}
// Now add source range info to stack
- // When we serialize function names, those can be added here.
- // TODO: Add function name separately
entries.emplace_back(
StackEntry{prev_function_name, callstack_ptr->source_range()});
if (callstack_ptr->function()) {
} else {
prev_function_name = callstack_ptr->function_name();
}
+    // The function name is appended here. It is stored as prev_function_name
+    // because, for StackEntry, it is appended in the next iteration; this is
+    // the format in which format_stack_trace expects function names.
+ module_info.append("::").append(prev_function_name);
if (callstack_ptr->callee()) {
callstack_ptr = callstack_ptr->callee().value();
std::vector<StackEntry> stack_entries;
std::string module_info =
root_scope_string + "(" + top_module_type_name + ")";
- std::string caller_fn_name = "FunctionName_UNKNOWN";
+ std::string caller_fn_name = "<unknown>";
+ module_info.append("::").append(caller_fn_name);
for (const auto& debug_info : source_callstacks) {
auto debug_info_pair =
getStackTraceWithModuleHierarchy(debug_info, caller_fn_name);
auto entries = std::move(debug_info_pair.first);
stack_entries.insert(stack_entries.end(), entries.begin(), entries.end());
- module_info += debug_info_pair.second;
+ module_info.append(debug_info_pair.second);
}
// Only last entry in the callstack will have a node name of interest.
// Rest are likely CallMethod/CallFunction nodes
auto last_entry = source_callstacks.back();
const std::string& node_name =
std::get<kDebugInfoTupleNodeNameIndex>(last_entry);
- module_info += "." + node_name;
+ module_info.append(".").append(node_name);
std::ostringstream ss;
ss << "Module hierarchy:" << module_info << "\n";
format_stack_trace(ss, stack_entries);
size_t pc = 0;
while (true) {
try {
- Instruction inst = code_->instructions_with_handles_[pc].instruction;
+ auto inst_with_handle = code_->instructions_with_handles_.at(pc);
+ Instruction inst = inst_with_handle.instruction;
+ DebugHandle debug_handle = inst_with_handle.debug_handle;
// std::cout << "RUNNING " << pc << " " << code_->instructions_[pc];
// if (inst.op == OP) {
// }
// }
// std::cout << std::endl;
+
+      // TODO(iliacher): remove the workaround after RecordFunction is in
+      // Dispatcher. Check with iliacher whether this has been done.
+      // Note: this is not exception-safe; if an exception is thrown, the
+      // RecordFunction will be left enabled. That is a TODO.
+ bool prev_value = isRecordFunctionEnabled();
+ if (!prev_value) {
+ // enable only for the RecordFunction
+ enableRecordFunction(true);
+ }
switch (inst.op) {
case OP: {
if (at::hasGlobalCallbacks()) {
}
}
- // TODO(iliacher): remove the workaround after RecordFunction is in
- // Dispatcher
- bool prev_value = isRecordFunctionEnabled();
- if (!prev_value) {
- // enable only for the RecordFunction
- enableRecordFunction(true);
- }
- RECORD_USER_SCOPE_WITH_INPUTS(code_->op_names_[inst.X].name, stack);
- if (!prev_value) {
- enableRecordFunction(false);
- }
+ RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+ code_->op_names_[inst.X].name, debug_handle, stack);
code_->operators_[inst.X](stack);
++pc;
} break;
case OPN: {
stack.push_back(inst.N);
+ RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+ code_->op_names_[inst.X].name, debug_handle, stack);
code_->operators_[inst.X](stack);
++pc;
} break;
.toObject()
->type()
->getMethod(code_->constants_[inst.X].toStringRef());
+ RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
+ method.name(), debug_handle, stack);
method.run(stack);
++pc;
} break;
default:
AT_ERROR(toString(inst.op), " is invalid.");
}
+
+ if (!prev_value) {
+ enableRecordFunction(false);
+ }
// This exception must be caught first as it derived from c10::Error
} catch (c10::BackendRuntimeException& e) {
exception_pc_ = pc;
return params;
}
+std::string Module::getModuleHierarchy(const int64_t debug_handle) const {
+#if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
+ return getDebugTable().getModuleHierarchyInfo(
+ debug_handle, getTopModuleTypeName(*this));
+#else
+ return "";
+#endif
+}
+
+std::string Module::getCallStack(const int64_t debug_handle) const {
+#if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
+ return getDebugTable().getSourceDebugString(
+ debug_handle, getTopModuleTypeName(*this));
+#else
+ return "";
+#endif
+}
+
// We will continue to support this API for now as this is being relied upon
// for profiling.
// We really need to change this part, so in the next step for profiling support
const std::vector<at::Tensor> parameters() const;
const std::map<std::string, at::Tensor> named_parameters() const;
std::string get_forward_method_debug_info(size_t pc) const;
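+  // Retrieve the module hierarchy / callstack string recorded for a given
+  // debug handle; returns an empty string unless
+  // SYMBOLICATE_MOBILE_DEBUG_HANDLE is defined.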
+ std::string getModuleHierarchy(const int64_t debug_handle) const;
+ std::string getCallStack(const int64_t debug_handle) const;
/// Enables "training" mode.
void train(bool on = true);
/// Calls train(false) to enable "eval" mode.
--- /dev/null
+#include <torch/csrc/jit/mobile/profiler_edge.h>
+#include <string>
+#include <vector>
+
+namespace profiler = torch::autograd::profiler;
+namespace torch {
+namespace jit {
+namespace mobile {
+
+KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
+ const torch::jit::mobile::Module& m,
+ const std::string& fname,
+ const bool report_input_shapes,
+ const bool profile_memory,
+ const bool with_stack,
+ const bool with_flops,
+ const bool with_modules)
+ : m_(m), trace_file_name_(fname) {
+ profiler::ProfilerConfig config(
+ profiler::ProfilerState::KINETO,
+ report_input_shapes,
+ profile_memory,
+ with_stack,
+ with_flops,
+ with_modules);
+ profiler::prepareProfiler(config, {profiler::ActivityType::CPU});
+ if (with_modules || with_stack) {
+ auto post_processing = [this, with_stack, with_modules](
+ std::vector<profiler::KinetoEvent>& events) {
+ for (auto& e : events) {
+ if (with_modules) {
+          // Since KinetoEvent's module hierarchy takes a vector of strings,
+          // we construct a temporary vector with a single string element.
+ e.moduleHierarchy(std::vector<std::string>(
+ {this->m_.getModuleHierarchy(e.debugHandle())}));
+ } else if (with_stack) {
+          // Since KinetoEvent's stack trace takes a vector of strings, we
+          // construct a temporary vector with a single string element.
+ e.stack(std::vector<std::string>(
+ {this->m_.getCallStack(e.debugHandle())}));
+ }
+ }
+ };
+ profiler::enableProfilerWithEventPostProcess(
+ config,
+ {profiler::ActivityType::CPU},
+ post_processing,
+ {at::RecordScope::LITE_INTERPRETER});
+ } else {
+ profiler::enableProfiler(
+ config,
+ {profiler::ActivityType::CPU},
+ {at::RecordScope::LITE_INTERPRETER});
+ }
+}
+
+KinetoEdgeCPUProfiler::~KinetoEdgeCPUProfiler() {
+ profiler::disableProfiler()->save(trace_file_name_);
+}
+} // namespace mobile
+} // namespace jit
+} // namespace torch
--- /dev/null
+#pragma once
+#include <torch/csrc/autograd/profiler_kineto.h>
+#include <torch/csrc/jit/mobile/module.h>
+
+namespace torch {
+namespace jit {
+namespace mobile {
+class TORCH_API KinetoEdgeCPUProfiler {
+ public:
+ // This profiler only profiles KINETO events
+ // No GPU_FALLBACK or NVTX
+ /*
+ * @param m is the instance of mobile Module which is being profiled.
+ * Note that this implies that KinetoEdgeCPUProfiler can be used
+ *        to profile a specific Module (see usage below), unlike ProfilerKineto,
+ *        which can profile the pytorch runtime in an arbitrary scope.
+ * @param fname is the name of the file to which the chrome trace is written.
+ * @param report_input_shapes: whether to record shapes of op's inputs.
+ * @param profile_memory: whether to profile memory usage.
+ * @param with_stack: whether to record model's python stacktrace for the op.
+ * @param with_flops: whether to report flops corresponding to the op.
+ * @param with_modules: whether to report original python module
+ * hierarchy to which the op belongs.
+ *
+ * Usage pattern for this profiler must be as follows:
+ *
+ * {
+ * KinetoEdgeCPUProfiler(m, filename, args);
+ * m.forward(...);
+ * }
+ *
+ * The reason is that KinetoEdgeCPUProfiler holds a reference to the Module
+ * and thus must not outlive it.
+ *
+ * Thus, KinetoEdgeCPUProfiler should be used as an RAII guard to do profiling
+ * within a certain scope. In that scope, the captured reference to the
+ * Module will outlive the KinetoEdgeCPUProfiler. This is guaranteed because
+ * KinetoEdgeCPUProfiler must be constructed later than the Module, on the stack.
+ *
+ * An example of the anti-pattern and wrong usage is:
+ *
+ * auto profiler = std::make_shared<KinetoEdgeCPUProfiler>(m, filename, args);
+ * m.forward(...);
+ *
+ * since the KinetoEdgeCPUProfiler object would then be constructed on the heap
+ * with its lifetime managed manually or via smart pointers.
+ */
+ KinetoEdgeCPUProfiler(
+ const torch::jit::mobile::Module& m,
+ const std::string& fname,
+ const bool report_input_shapes = false,
+ const bool profile_memory = false,
+ const bool with_stack = false,
+ const bool with_flops = false,
+ const bool with_modules = false);
+
+ ~KinetoEdgeCPUProfiler();
+
+ private:
+ /*
+   * We store a reference to the Module to make the dependency explicit,
+   * since a Module reference is also captured by the post-processing functor.
+ */
+ const mobile::Module& m_;
+ std::string trace_file_name_;
+};
+} // namespace mobile
+} // namespace jit
+} // namespace torch