From 600eeecbf4556e733b1ab22570bbb2700a420fdb Mon Sep 17 00:00:00 2001
From: Ilia Cherniavskii
Date: Thu, 28 Mar 2019 17:42:47 -0700
Subject: [PATCH] Add external callbacks into RecordFunction (#17844)

Summary:
Add a way to insert external callbacks into PT's RecordFunction
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17844

Differential Revision: D14399664

Pulled By: ilia-cher

fbshipit-source-id: 76654799811fefd3ffed4abfb46ed95b492cebab
---
 test/cpp/jit/test.cpp                   |   3 +-
 test/cpp/jit/test_misc.h                |  66 +++++++++++++++++++++
 tools/build_variables.py                |   1 +
 torch/CMakeLists.txt                    |   1 +
 torch/csrc/autograd/profiler.cpp        |  62 ++++++-------------
 torch/csrc/autograd/profiler.h          |  32 ++--------
 torch/csrc/autograd/record_function.cpp |  83 ++++++++++++++++++++++++++
 torch/csrc/autograd/record_function.h   | 102 ++++++++++++++++++++++++++++++++
 8 files changed, 278 insertions(+), 72 deletions(-)
 create mode 100644 torch/csrc/autograd/record_function.cpp
 create mode 100644 torch/csrc/autograd/record_function.h

diff --git a/test/cpp/jit/test.cpp b/test/cpp/jit/test.cpp
index 1c4823d..20cab91 100644
--- a/test/cpp/jit/test.cpp
+++ b/test/cpp/jit/test.cpp
@@ -63,7 +63,8 @@ namespace jit {
   _(ATenNativeBatchNorm)        \
   _(NoneSchemaMatch)            \
   _(ClassParser)                \
-  _(PeepholeOptimize)
+  _(PeepholeOptimize)           \
+  _(RecordFunction)
 
 #define TH_FORALL_TESTS_CUDA(_) \
   _(ArgumentSpec)               \
diff --git a/test/cpp/jit/test_misc.h b/test/cpp/jit/test_misc.h
index 10cd871..7a314ad 100644
--- a/test/cpp/jit/test_misc.h
+++ b/test/cpp/jit/test_misc.h
@@ -575,6 +575,72 @@ void testTopologicalIndex() {
   }
 }
 
+void invokeTestRecordFunction(at::Tensor& t) {
+  autograd::profiler::GetPackedInputsCallback inputs_cb = [t]() {
+    Stack st;
+    pack(st, t);
+    return st;
+  };
+  autograd::profiler::RecordFunction guard("test", inputs_cb);
+  t.add_(torch::ones_like(t));
+}
+
+std::string getFullName(const autograd::profiler::RecordFunction* fn_ptr) {
+  std::string full_name = "";
+  while (fn_ptr != nullptr) {
+    if (!full_name.empty()) {
+      full_name = std::string(fn_ptr->name().str()) + "::" + full_name;
+    } else {
+      full_name = fn_ptr->name().str();
+    }
+    fn_ptr = fn_ptr->parent();
+  }
+  return full_name;
+}
+
+void invokeTestRecordFunctionNested() {
+  autograd::profiler::RecordFunction guard("inner");
+}
+
+void testRecordFunction() {
+  std::vector<std::vector<int64_t>> input_sizes;
+  autograd::profiler::pushCallback([&input_sizes](
+      const autograd::profiler::RecordFunction& fn) {
+    for (const auto& input : fn.inputs()) {
+      if (input.isTensor()) {
+        std::vector<int64_t> t = input.toTensor().sizes().vec();
+        input_sizes.push_back(t);
+      }
+    }
+  });
+
+  auto t = torch::randn({1, 2, 3}, at::kCPU);
+  invokeTestRecordFunction(t);
+
+  autograd::profiler::popCallback();
+
+  AT_CHECK(input_sizes.size() == 1);
+  AT_CHECK(input_sizes[0] == at::IntArrayRef({1, 2, 3}));
+
+  // test nested RecordFunctions
+  std::vector<std::string> nested_names;
+  autograd::profiler::pushCallback([&nested_names](
+      const autograd::profiler::RecordFunction& fn) {
+    nested_names.push_back(getFullName(&fn));
+  });
+
+  {
+    autograd::profiler::RecordFunction guard("outer");
+    invokeTestRecordFunctionNested();
+  }
+
+  autograd::profiler::popCallback();
+  AT_CHECK(nested_names.size() == 2);
+  AT_CHECK(nested_names[0] == "outer");
+  AT_CHECK(nested_names[1] == "outer::inner");
+}
+
 void testAutogradProfiler() {
   constexpr int batch_size = 4;
   constexpr int input_size = 256;
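The tests above exercise the whole flow end to end. As a compact standalone
sketch, an external observer would use the new API roughly as follows
(installObserver/removeObserver are hypothetical helper names, not part of
this patch):

  #include <torch/csrc/autograd/record_function.h>
  #include <cstdio>

  using namespace torch::autograd::profiler;

  void installObserver() {
    pushCallback(
        // start callback: runs when a RecordFunction guard is constructed
        [](const RecordFunction& fn) {
          printf("enter %s, seq=%lld\n", fn.name().str(), (long long)fn.seqNr());
        },
        // end callback: runs when the guard is destructed
        [](const RecordFunction& fn) {
          printf("exit %s\n", fn.name().str());
        });
  }

  void removeObserver() {
    popCallback();  // pops both the start and end callbacks together
  }
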
diff --git a/tools/build_variables.py b/tools/build_variables.py
index 503eb6a..a293a56 100644
--- a/tools/build_variables.py
+++ b/tools/build_variables.py
@@ -45,6 +45,7 @@ libtorch_sources = [
     "torch/csrc/autograd/grad_mode.cpp",
     "torch/csrc/autograd/input_buffer.cpp",
     "torch/csrc/autograd/profiler.cpp",
+    "torch/csrc/autograd/record_function.cpp",
     "torch/csrc/autograd/saved_variable.cpp",
     "torch/csrc/autograd/variable.cpp",
     "torch/csrc/Exceptions.cpp",
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index c2d7783..5db5260 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -117,6 +117,7 @@ set(TORCH_SRCS
   ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp
+  ${TORCH_SRC_DIR}/csrc/autograd/record_function.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/variable.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp
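The profiler changes below replace Event's owned_name_/name_ptr_ pair with the
StringView type introduced by this patch (defined in record_function.h further
down). As a rough sketch of its semantics, under the definitions in that
header:

  // Borrows the pointer; no allocation, suitable for string literals.
  StringView borrowed("literal");
  // Owns a copy via shared_ptr<std::string>; safe for dynamically built names.
  StringView owned(std::string("dynamic"));
  // The pointer stays valid for as long as any copy of `owned` is alive.
  const char* p = owned.str();
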
diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp
index 555ca22..b68dd28 100644
--- a/torch/csrc/autograd/profiler.cpp
+++ b/torch/csrc/autograd/profiler.cpp
@@ -34,88 +34,51 @@ RangeEventList& getEventList() {
 }
 
 void mark(std::string name, bool include_cuda /* = true */) {
-  if (state == ProfilerState::Disabled) {
-    return;
-  }
   if (state == ProfilerState::NVTX) {
     cuda_stubs->nvtxMarkA(name.c_str());
   } else {
     getEventList().record(
         EventKind::Mark,
-        std::move(name),
+        StringView(std::move(name)),
         thread_id,
         include_cuda && state == ProfilerState::CUDA);
   }
 }
 
-const char* c_str(const char *str) { return str; }
-// NB: non-const to disallow temporaries (lifetime issues)
-const char* c_str(std::string& str) { return str.c_str(); }
-
-template<typename T>
-void pushRangeImpl(T name, const char* msg="", int64_t sequence_nr=-1) {
-  if (state == ProfilerState::Disabled) {
-    return;
-  }
+void pushRangeImpl(const StringView& name, const char* msg="", int64_t sequence_nr=-1) {
   if (state == ProfilerState::NVTX) {
     if(sequence_nr >= 0) {
       std::stringstream s;
-      s << name << msg << sequence_nr;
+      s << name.str() << msg << sequence_nr;
       cuda_stubs->nvtxRangePushA(s.str().c_str());
     } else {
-      cuda_stubs->nvtxRangePushA(c_str(name));
+      cuda_stubs->nvtxRangePushA(name.str());
     }
   } else {
     getEventList().record(
         EventKind::PushRange,
-        std::move(name),
+        name,
         thread_id,
         state == ProfilerState::CUDA);
   }
 }
 
 void pushRange(std::string name) {
-  pushRangeImpl(std::move(name));
+  pushRangeImpl(StringView(std::move(name)));
 }
 
 void popRange() {
-  if (state == ProfilerState::Disabled) {
-    return;
-  }
   if (state == ProfilerState::NVTX) {
     cuda_stubs->nvtxRangePop();
   } else {
     getEventList().record(
         EventKind::PopRange,
-        "",
+        StringView(""),
         thread_id,
         state == ProfilerState::CUDA);
   }
 }
 
-RecordFunction::RecordFunction(Function* fn) {
-  // typeid(*fn).name() would avoid an additional string allocation.
-  // However, typeid(*fn).name() would cause nvtx annotations for all user-defined
-  // (Python-side) custom autograd function backward() methods to have the same name,
-  // because they route through the same C++ side class.
-  // fn->name() ensures that nvtx annotations for custom function backward() methods
-  // receive a relevant, demangled name.
-  pushRangeImpl(fn->name(), ", stashed seq=", fn->sequence_nr());
-}
-
-RecordFunction::RecordFunction(std::string name) {
-  pushRangeImpl(std::move(name));
-}
-
-RecordFunction::RecordFunction(const char* name) {
-  pushRangeImpl(name);
-}
-
-RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
-{
-  pushRangeImpl(name, ", seq=", current_sequence_nr);
-}
-
 void enableProfiler(ProfilerState new_state) {
   AT_ASSERT(new_state != ProfilerState::Disabled);
   if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled())
@@ -123,6 +86,14 @@ void enableProfiler(ProfilerState new_state) {
   if (state != ProfilerState::Disabled && new_state != state) {
     throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
   }
+
+  pushCallback([](const RecordFunction& fn) {
+    auto* msg = (fn.seqNr() >= 0) ? ", seq = " : "";
+    pushRangeImpl(fn.name(), msg, fn.seqNr());
+  },
+  [](const RecordFunction& /* unused */) {
+    popRange();
+  });
   state = new_state;
 
   if(state == ProfilerState::CUDA) {
@@ -151,7 +122,10 @@ thread_event_lists disableProfiler() {
   }
   ProfilerState old_state = state;
   mark("__stop_profile");
+
+  popCallback();
   state = ProfilerState::Disabled;
+
   if (old_state == ProfilerState::NVTX) {
     return thread_event_lists();
   } else {
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 8b9e841..8d06c09 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -17,6 +17,7 @@
 #include <ctime>
 #endif
 
+#include <torch/csrc/autograd/record_function.h>
 #include <ATen/ATen.h>
 
 typedef struct CUevent_st* CUDAEventStub;
@@ -97,13 +98,8 @@ enum class EventKind : uint16_t {
 };
 
 struct TORCH_API Event final {
-  Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
-  : owned_name_(new std::string(std::move(name)))
-  , name_ptr_(owned_name_->c_str())
-  , kind_(kind)
-  , thread_id_(thread_id) { record(record_cuda); }
-  Event(EventKind kind, const char* name, uint16_t thread_id, bool record_cuda)
-  : name_ptr_(name)
+  Event(EventKind kind, StringView name, uint16_t thread_id, bool record_cuda)
+  : name_(std::move(name))
   , kind_(kind)
   , thread_id_(thread_id) { record(record_cuda); }
 
@@ -117,7 +113,7 @@ struct TORCH_API Event final {
     throw std::runtime_error("unknown EventKind");
   }
   const char* name() const {
-    return name_ptr_;
+    return name_.str();
   }
   uint16_t thread_id() const {
     return thread_id_;
@@ -134,11 +130,7 @@ struct TORCH_API Event final {
   }
 private:
   int64_t cpu_ns_ = 0; // signed to allow for negative intervals, initialized for safety.
-  // std::string is a very large object (usually around 32B),
-  // and this field is used only for user-created ranges, so
-  // it's better to save on size of Events.
-  std::unique_ptr<std::string> owned_name_;
-  const char * name_ptr_;
+  StringView name_;
   EventKind kind_;
   uint16_t thread_id_;
   int device_ = -1;
@@ -203,20 +195,6 @@ TORCH_API void mark(std::string name, bool include_cuda = true);
 TORCH_API void pushRange(std::string name);
 TORCH_API void popRange();
 
-struct TORCH_API RecordFunction {
-  explicit RecordFunction(Function* fn);
-
-  explicit RecordFunction(std::string name);
-
-  explicit RecordFunction(const char* name);
-
-  explicit RecordFunction(const char* name, int64_t current_sequence_nr);
-
-  ~RecordFunction() {
-    popRange();
-  }
-};
-
 using thread_event_lists = std::vector<std::vector<Event>>;
 // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure
 // that no autograd functions are being executed when these functions are used.
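One behavior worth noting before the new files: input packing is lazy. The
GetPackedInputsCallback handed to a RecordFunction only runs the first time a
registered callback asks for fn.inputs(), so operators pay nothing for input
capture unless an observer actually wants it. A sketch under that API (addOne
is a hypothetical caller, mirroring the new test):

  at::Tensor addOne(at::Tensor t) {
    torch::autograd::profiler::RecordFunction guard(
        "addOne",
        /*cb=*/[t]() {
          std::vector<c10::IValue> inputs;
          inputs.emplace_back(t);  // runs only if a callback calls fn.inputs()
          return inputs;
        });
    return t.add(torch::ones_like(t));
  }
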
diff --git a/torch/csrc/autograd/record_function.cpp b/torch/csrc/autograd/record_function.cpp
new file mode 100644
index 0000000..57f83e9
--- /dev/null
+++ b/torch/csrc/autograd/record_function.cpp
@@ -0,0 +1,83 @@
+#include <torch/csrc/autograd/record_function.h>
+#include <torch/csrc/autograd/function.h>
+
+namespace torch { namespace autograd { namespace profiler {
+
+namespace {
+bool has_callbacks = false;
+std::vector<RecordFunctionCallback> start_callbacks;
+std::vector<RecordFunctionCallback> end_callbacks;
+thread_local RecordFunction* thread_local_func_ = nullptr;
+}
+
+void pushCallback(RecordFunctionCallback start, RecordFunctionCallback end) {
+  start_callbacks.push_back(start);
+  end_callbacks.push_back(end);
+  has_callbacks = true;
+}
+
+void pushCallback(RecordFunctionCallback start) {
+  pushCallback(start, [](const RecordFunction&){});
+}
+
+void popCallback() {
+  if (start_callbacks.empty()) {
+    throw std::runtime_error("Empty callbacks stack");
+  }
+  start_callbacks.pop_back();
+  end_callbacks.pop_back();
+  has_callbacks = !start_callbacks.empty();
+}
+
+RecordFunction::RecordFunction(Function* fn, GetPackedInputsCallback cb) {
+  if (!has_callbacks) {
+    return;
+  }
+  fn_ = fn;
+  name_ = StringView(fn->name());
+  sequence_nr_ = fn->sequence_nr();
+  inputs_cb_ = cb;
+  processCallbacks();
+}
+
+RecordFunction::RecordFunction(
+    std::string name, int64_t sequence_nr, GetPackedInputsCallback cb) {
+  if (!has_callbacks) {
+    return;
+  }
+  name_ = StringView(std::move(name));
+  sequence_nr_ = sequence_nr;
+  inputs_cb_ = cb;
+  processCallbacks();
+}
+
+RecordFunction::RecordFunction(
+    const char* name, int64_t sequence_nr, GetPackedInputsCallback cb) {
+  if (!has_callbacks) {
+    return;
+  }
+  name_ = StringView(name);
+  sequence_nr_ = sequence_nr;
+  inputs_cb_ = cb;
+  processCallbacks();
+}
+
+void RecordFunction::processCallbacks() {
+  parent_ = thread_local_func_;
+  thread_local_func_ = this;
+
+  for (const auto& cb : start_callbacks) {
+    cb(*this);
+  }
+}
+
+RecordFunction::~RecordFunction() {
+  if (has_callbacks) {
+    for (const auto& cb : end_callbacks) {
+      cb(*this);
+    }
+    thread_local_func_ = parent_;
+  }
+}
+
+}}}
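record_function.cpp above keeps the innermost active RecordFunction in a
thread_local pointer, and record_function.h below exposes it through parent().
That is what lets a callback reconstruct the nesting path on the current
thread, as getFullName() in the new test does. A minimal sketch:

  {
    torch::autograd::profiler::RecordFunction outer("outer");
    {
      torch::autograd::profiler::RecordFunction inner("inner");
      // Here inner.parent() == &outer; a start callback firing for "inner"
      // can walk parent() to produce "outer::inner".
    }  // end callbacks fire for "inner"; thread-local restored to &outer
  }    // end callbacks fire for "outer"
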
diff --git a/torch/csrc/autograd/record_function.h b/torch/csrc/autograd/record_function.h
new file mode 100644
index 0000000..eef1a67
--- /dev/null
+++ b/torch/csrc/autograd/record_function.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <ATen/core/ivalue.h>
+#include <torch/csrc/WindowsTorchApiMacro.h>
+#include <functional>
+
+namespace torch { namespace autograd {
+
+struct Function;
+
+namespace profiler {
+
+struct TORCH_API StringView {
+  StringView() : StringView(nullptr) {}
+  explicit StringView(const char* str_ptr)
+    : owned_str_ptr_(nullptr), str_ptr_(str_ptr) {}
+  explicit StringView(std::string str)
+    : owned_str_ptr_(std::make_shared<std::string>(std::move(str))),
+      str_ptr_(owned_str_ptr_->c_str()) {}
+
+  inline const char* str() const {
+    return str_ptr_;
+  }
+ private:
+  std::shared_ptr<std::string> owned_str_ptr_;
+  const char* str_ptr_;
+};
+
+using GetPackedInputsCallback = std::function<std::vector<c10::IValue>()>;
+
+struct TORCH_API RecordFunction {
+  explicit RecordFunction(Function* fn, GetPackedInputsCallback cb = nullptr);
+
+  explicit RecordFunction(
+      std::string name,
+      int64_t current_sequence_nr = -1,
+      GetPackedInputsCallback cb = nullptr);
+
+  explicit RecordFunction(
+      const char* name,
+      int64_t current_sequence_nr = -1,
+      GetPackedInputsCallback cb = nullptr);
+
+  explicit RecordFunction(
+      std::string name,
+      GetPackedInputsCallback cb) : RecordFunction(name, -1, cb) {}
+
+  explicit RecordFunction(
+      const char* name,
+      GetPackedInputsCallback cb) : RecordFunction(name, -1, cb) {}
+
+  virtual ~RecordFunction();
+
+  inline Function* func() const {
+    return fn_;
+  }
+
+  inline const StringView& name() const {
+    return name_;
+  }
+
+  inline int64_t seqNr() const {
+    return sequence_nr_;
+  }
+
+  const std::vector<c10::IValue>& inputs() const {
+    if (inputs_cb_ && !inputs_initialized_) {
+      inputs_ = inputs_cb_();
+      inputs_initialized_ = true;
+    }
+    return inputs_;
+  }
+
+  inline const RecordFunction* parent() const {
+    return parent_;
+  }
+
+ private:
+  void processCallbacks();
+
+  Function* fn_ = nullptr;
+  StringView name_;
+  int64_t sequence_nr_ = -1;
+
+  RecordFunction* parent_ = nullptr;
+
+  GetPackedInputsCallback inputs_cb_ = nullptr;
+  mutable bool inputs_initialized_ = false;
+  // initialized lazily by inputs_cb_
+  mutable std::vector<c10::IValue> inputs_;
+};
+
+// WARNING: calls to pushCallback/popCallback are not thread safe
+// and must not overlap with other code execution
+using RecordFunctionCallback = std::function<void(const RecordFunction&)>;
+TORCH_API void pushCallback(RecordFunctionCallback, RecordFunctionCallback);
+TORCH_API void pushCallback(RecordFunctionCallback);
+TORCH_API void popCallback();
+
+} // namespace profiler
+}} // namespace torch::autograd
-- 
2.7.4