_(ATenNativeBatchNorm) \
_(NoneSchemaMatch) \
_(ClassParser) \
- _(PeepholeOptimize)
+ _(PeepholeOptimize) \
+ _(RecordFunction)
#define TH_FORALL_TESTS_CUDA(_) \
_(ArgumentSpec) \
}
}
+// Runs a tensor op inside a RecordFunction scope whose inputs are packed
+// lazily via a callback, so a registered observer can inspect them.
+void invokeTestRecordFunction(at::Tensor& t) {
+  autograd::profiler::GetPackedInputsCallback inputs_cb =
+      [t]() {
+        Stack st;
+        pack(st, t);
+        return st;
+      };
+  // Fires start callbacks now and end callbacks when the guard leaves scope.
+  autograd::profiler::RecordFunction guard("test", inputs_cb);
+  t.add_(torch::ones_like(t));
+}
+
+// Joins the names along the parent() chain, outermost first, separated by
+// "::" (e.g. "outer::inner"). Returns "" for a null pointer.
+std::string getFullName(const autograd::profiler::RecordFunction* fn_ptr) {
+  std::string full_name;
+  for (; fn_ptr != nullptr; fn_ptr = fn_ptr->parent()) {
+    full_name = full_name.empty()
+        ? std::string(fn_ptr->name().str())
+        : std::string(fn_ptr->name().str()) + "::" + full_name;
+  }
+  return full_name;
+}
+
+// Helper for the nesting test: opens an "inner" range inside the caller's
+// "outer" range.
+void invokeTestRecordFunctionNested() {
+  autograd::profiler::RecordFunction guard("inner");
+}
+
+// Checks that (1) a start callback can observe the lazily-packed tensor
+// inputs of a RecordFunction, and (2) nested RecordFunctions are linked
+// through parent() so full names can be reconstructed.
+void testRecordFunction() {
+  // Part 1: record the sizes of every tensor input seen by the callback.
+  std::vector<std::vector<int64_t>> input_sizes;
+  autograd::profiler::pushCallback([&input_sizes](
+      const autograd::profiler::RecordFunction& fn) {
+    for (const auto& input : fn.inputs()) {
+      if (input.isTensor()) {
+        std::vector<int64_t> t = input.toTensor().sizes().vec();
+        input_sizes.push_back(t);
+      }
+    }
+  });
+
+  auto t = torch::randn({1, 2, 3}, at::kCPU);
+  invokeTestRecordFunction(t);
+
+  autograd::profiler::popCallback();
+
+  AT_CHECK(input_sizes.size() == 1);
+  AT_CHECK(input_sizes[0] == at::IntArrayRef({1, 2, 3}));
+
+  // Part 2: test nested RecordFunctions.
+  std::vector<std::string> nested_names;
+  autograd::profiler::pushCallback([&nested_names](
+      const autograd::profiler::RecordFunction& fn) {
+    nested_names.push_back(getFullName(&fn));
+  });
+
+  {
+    autograd::profiler::RecordFunction guard("outer");
+    invokeTestRecordFunctionNested();
+  }
+
+  autograd::profiler::popCallback();
+  AT_CHECK(nested_names.size() == 2);
+  AT_CHECK(nested_names[0] == "outer");
+  AT_CHECK(nested_names[1] == "outer::inner");
+}
+
void testAutogradProfiler() {
constexpr int batch_size = 4;
constexpr int input_size = 256;
"torch/csrc/autograd/grad_mode.cpp",
"torch/csrc/autograd/input_buffer.cpp",
"torch/csrc/autograd/profiler.cpp",
+ "torch/csrc/autograd/record_function.cpp",
"torch/csrc/autograd/saved_variable.cpp",
"torch/csrc/autograd/variable.cpp",
"torch/csrc/Exceptions.cpp",
${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp
${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp
${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp
+ ${TORCH_SRC_DIR}/csrc/autograd/record_function.cpp
${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp
${TORCH_SRC_DIR}/csrc/autograd/variable.cpp
${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp
}
 void mark(std::string name, bool include_cuda /* = true */) {
-  if (state == ProfilerState::Disabled) {
-    return;
-  }
+  // NOTE(review): the Disabled early-return was removed -- this now assumes
+  // it is only reached while profiling is active (e.g. via the callbacks
+  // installed in enableProfiler). Confirm no direct callers relied on the
+  // old no-op behavior when disabled.
   if (state == ProfilerState::NVTX) {
     cuda_stubs->nvtxMarkA(name.c_str());
   } else {
     getEventList().record(
         EventKind::Mark,
-        std::move(name),
+        StringView(std::move(name)),
         thread_id,
         include_cuda && state == ProfilerState::CUDA);
   }
 }
-const char* c_str(const char *str) { return str; }
-// NB: non-const to disallow temporaries (lifetime issues)
-const char* c_str(std::string& str) { return str.c_str(); }
-
-template<typename T>
-void pushRangeImpl(T name, const char* msg="", int64_t sequence_nr=-1) {
-  if (state == ProfilerState::Disabled) {
-    return;
-  }
+// StringView replaces the old template + c_str() overload pair: it carries
+// either a borrowed const char* or an owned std::string, so one non-template
+// function covers both name kinds.
+void pushRangeImpl(const StringView& name, const char* msg="", int64_t sequence_nr=-1) {
   if (state == ProfilerState::NVTX) {
     if(sequence_nr >= 0) {
       std::stringstream s;
-      s << name << msg << sequence_nr;
+      s << name.str() << msg << sequence_nr;
       cuda_stubs->nvtxRangePushA(s.str().c_str());
     } else {
-      cuda_stubs->nvtxRangePushA(c_str(name));
+      cuda_stubs->nvtxRangePushA(name.str());
     }
   } else {
     getEventList().record(
         EventKind::PushRange,
-        std::move(name),
+        name,
         thread_id,
         state == ProfilerState::CUDA);
   }
 }
 void pushRange(std::string name) {
-  pushRangeImpl(std::move(name));
+  // StringView(std::string) takes ownership of the moved name.
+  pushRangeImpl(StringView(std::move(name)));
 }
 void popRange() {
-  if (state == ProfilerState::Disabled) {
-    return;
-  }
+  // NOTE(review): Disabled early-return removed, mirroring mark()/pushRange;
+  // reached only while profiling is active -- confirm against direct callers.
   if (state == ProfilerState::NVTX) {
     cuda_stubs->nvtxRangePop();
   } else {
     getEventList().record(
         EventKind::PopRange,
-        "",
+        StringView(""),
         thread_id,
         state == ProfilerState::CUDA);
   }
 }
-RecordFunction::RecordFunction(Function* fn) {
- // typeid(*fn).name() would avoid an additional string allocation.
- // However, typeid(*fn).name() would cause nvtx annotations for all user-defined
- // (Python-side) custom autograd function backward() methods to have the same name,
- // because they route through the same C++ side class.
- // fn->name() ensures that nvtx annotations for custom function backward() methods
- // receive a relevant, demangled name.
- pushRangeImpl(fn->name(), ", stashed seq=", fn->sequence_nr());
-}
-
-RecordFunction::RecordFunction(std::string name) {
- pushRangeImpl(std::move(name));
-}
-
-RecordFunction::RecordFunction(const char* name) {
- pushRangeImpl<const char*>(name);
-}
-
-RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
-{
- pushRangeImpl<const char*>(name, ", seq=", current_sequence_nr);
-}
-
void enableProfiler(ProfilerState new_state) {
AT_ASSERT(new_state != ProfilerState::Disabled);
if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled())
if (state != ProfilerState::Disabled && new_state != state) {
throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
}
+
+ pushCallback([](const RecordFunction& fn) {
+ auto* msg = (fn.seqNr() >= 0) ? ", seq = " : "";
+ pushRangeImpl(fn.name(), msg, fn.seqNr());
+ },
+ [](const RecordFunction& /* unused */) {
+ popRange();
+ });
state = new_state;
if(state == ProfilerState::CUDA) {
}
ProfilerState old_state = state;
mark("__stop_profile");
+
+ popCallback();
state = ProfilerState::Disabled;
+
if (old_state == ProfilerState::NVTX) {
return thread_event_lists();
} else {
#include <ctime>
#endif
+#include <torch/csrc/autograd/record_function.h>
#include <torch/csrc/jit/code_template.h>
typedef struct CUevent_st* CUDAEventStub;
};
struct TORCH_API Event final {
- Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
- : owned_name_(new std::string(std::move(name)))
- , name_ptr_(owned_name_->c_str())
- , kind_(kind)
- , thread_id_(thread_id) { record(record_cuda); }
- Event(EventKind kind, const char* name, uint16_t thread_id, bool record_cuda)
- : name_ptr_(name)
+ Event(EventKind kind, StringView name, uint16_t thread_id, bool record_cuda)
+ : name_(std::move(name))
, kind_(kind)
, thread_id_(thread_id) { record(record_cuda); }
throw std::runtime_error("unknown EventKind");
}
const char* name() const {
- return name_ptr_;
+ return name_.str();
}
uint16_t thread_id() const {
return thread_id_;
}
private:
int64_t cpu_ns_ = 0; // signed to allow for negative intervals, initialized for safety.
- // std::string is a very large object (usually around 32B),
- // and this field is used only for user-created ranges, so
- // it's better to save on size of Events.
- std::unique_ptr<std::string> owned_name_;
- const char * name_ptr_;
+ StringView name_;
EventKind kind_;
uint16_t thread_id_;
int device_ = -1;
TORCH_API void pushRange(std::string name);
TORCH_API void popRange();
-struct TORCH_API RecordFunction {
- explicit RecordFunction(Function* fn);
-
- explicit RecordFunction(std::string name);
-
- explicit RecordFunction(const char* name);
-
- explicit RecordFunction(const char* name, int64_t current_sequence_nr);
-
- ~RecordFunction() {
- popRange();
- }
-};
-
using thread_event_lists = std::vector<std::vector<Event>>;
// NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that
// there no autograd functions are being executed when these function are used.
--- /dev/null
+#include <torch/csrc/autograd/record_function.h>
+#include <torch/csrc/autograd/function.h>
+
+namespace torch { namespace autograd { namespace profiler {
+
+namespace {
+bool has_callbacks = false;
+std::vector<RecordFunctionCallback> start_callbacks;
+std::vector<RecordFunctionCallback> end_callbacks;
+thread_local RecordFunction* thread_local_func_ = nullptr;
+}
+
+// Registers a start/end callback pair on the (non-thread-safe) global stacks.
+// Parameters are taken by value and moved in to avoid copying std::function
+// state twice.
+void pushCallback(RecordFunctionCallback start, RecordFunctionCallback end) {
+  start_callbacks.push_back(std::move(start));
+  end_callbacks.push_back(std::move(end));
+  has_callbacks = true;
+}
+
+// Convenience overload: registers a no-op end callback so the two stacks
+// stay in lockstep. Moves the start callback instead of copying it.
+void pushCallback(RecordFunctionCallback start) {
+  pushCallback(std::move(start), [](const RecordFunction&){});
+}
+
+// Removes the most recently pushed start/end callback pair. Popping with an
+// empty stack is a caller bug and throws.
+void popCallback() {
+  if (start_callbacks.empty()) {
+    throw std::runtime_error("Empty callbacks stack");
+  }
+  start_callbacks.pop_back();
+  end_callbacks.pop_back();
+  has_callbacks = !start_callbacks.empty();
+}
+
+// Observes an autograd Function: name and sequence number are taken from fn.
+RecordFunction::RecordFunction(Function* fn, GetPackedInputsCallback cb) {
+  // Fast path: with no observers registered the guard is inert -- fields
+  // keep their defaults and no callbacks run (see ~RecordFunction).
+  if (!has_callbacks) {
+    return;
+  }
+  fn_ = fn;
+  name_ = StringView(fn->name());
+  sequence_nr_ = fn->sequence_nr();
+  inputs_cb_ = cb;
+  processCallbacks();
+}
+
+// Named range with an explicit sequence number; takes ownership of the name.
+RecordFunction::RecordFunction(
+    std::string name, int64_t sequence_nr, GetPackedInputsCallback cb) {
+  if (!has_callbacks) {
+    return;
+  }
+  name_ = StringView(std::move(name));
+  sequence_nr_ = sequence_nr;
+  inputs_cb_ = cb;
+  processCallbacks();
+}
+
+// Named range borrowing a C string: StringView(const char*) does not copy,
+// so the pointer must stay valid for the guard's lifetime.
+RecordFunction::RecordFunction(
+    const char* name, int64_t sequence_nr, GetPackedInputsCallback cb) {
+  if (!has_callbacks) {
+    return;
+  }
+  name_ = StringView(name);
+  sequence_nr_ = sequence_nr;
+  inputs_cb_ = cb;
+  processCallbacks();
+}
+
+// Links this guard into the per-thread stack of active RecordFunctions
+// (observers reconstruct nesting via parent()), then runs start callbacks.
+void RecordFunction::processCallbacks() {
+  parent_ = thread_local_func_;
+  thread_local_func_ = this;
+
+  for (const auto& cb : start_callbacks) {
+    cb(*this);
+  }
+}
+
+// Runs end callbacks and unlinks from the per-thread stack.
+// NOTE(review): if pushCallback/popCallback executes between this guard's
+// construction and destruction, has_callbacks can differ at the two points,
+// leaving start/end callbacks unpaired and thread_local_func_ restored
+// inconsistently. Relies on the header's rule that callback registration
+// never overlaps other execution -- confirm.
+RecordFunction::~RecordFunction() {
+  if (has_callbacks) {
+    for (const auto& cb : end_callbacks) {
+      cb(*this);
+    }
+    thread_local_func_ = parent_;
+  }
+}
+
+}}}
--- /dev/null
+#pragma once
+
+#include <ATen/core/ivalue.h>
+#include <c10/util/SmallVector.h>
+#include <torch/csrc/WindowsTorchApiMacro.h>
+
+namespace torch { namespace autograd {
+
+struct Function;
+
+namespace profiler {
+
+// Lightweight name holder: either borrows a const char* (no ownership) or
+// owns a std::string via shared_ptr, so copies of the view stay cheap.
+struct TORCH_API StringView {
+  StringView() : StringView(nullptr) {}
+  explicit StringView(const char* str_ptr)
+      : owned_str_ptr_(nullptr), str_ptr_(str_ptr) {}
+  explicit StringView(std::string str)
+      : owned_str_ptr_(std::make_shared<std::string>(std::move(str))),
+        str_ptr_(owned_str_ptr_->c_str()) {}
+
+  // NOTE(review): returns nullptr for a default-constructed StringView;
+  // callers must not feed it to std::string / printf-style sinks unchecked.
+  inline const char* str() const {
+    return str_ptr_;
+  }
+ private:
+  std::shared_ptr<std::string> owned_str_ptr_;
+  const char* str_ptr_;
+};
+
+using GetPackedInputsCallback = std::function<std::vector<c10::IValue>()>;
+
+// RAII scope observed by the pushCallback/popCallback machinery: start
+// callbacks fire at construction, end callbacks at destruction. When no
+// callbacks are registered the guard is inert.
+struct TORCH_API RecordFunction {
+  // Observes an autograd Function; name/sequence number come from fn.
+  explicit RecordFunction(Function* fn, GetPackedInputsCallback cb = nullptr);
+
+  explicit RecordFunction(
+      std::string name,
+      int64_t current_sequence_nr = -1,
+      GetPackedInputsCallback cb = nullptr);
+
+  explicit RecordFunction(
+      const char* name,
+      int64_t current_sequence_nr = -1,
+      GetPackedInputsCallback cb = nullptr);
+
+  // Convenience overloads (name + inputs callback, default sequence number).
+  // std::move avoids copying the string / std::function into the delegated
+  // constructor.
+  explicit RecordFunction(
+      std::string name,
+      GetPackedInputsCallback cb)
+      : RecordFunction(std::move(name), -1, std::move(cb)) {}
+
+  explicit RecordFunction(
+      const char* name,
+      GetPackedInputsCallback cb)
+      : RecordFunction(name, -1, std::move(cb)) {}
+
+  virtual ~RecordFunction();
+
+  // The observed autograd Function, or nullptr for plain named ranges.
+  inline Function* func() const {
+    return fn_;
+  }
+
+  inline const StringView& name() const {
+    return name_;
+  }
+
+  inline int64_t seqNr() const {
+    return sequence_nr_;
+  }
+
+  // Lazily materializes (and caches) the packed inputs via inputs_cb_.
+  const std::vector<c10::IValue>& inputs() const {
+    if (inputs_cb_ && !inputs_initialized_) {
+      inputs_ = inputs_cb_();
+      inputs_initialized_ = true;
+    }
+    return inputs_;
+  }
+
+  // Enclosing RecordFunction on this thread, nullptr at the top level.
+  inline const RecordFunction* parent() const {
+    return parent_;
+  }
+
+ private:
+  void processCallbacks();
+
+  Function* fn_ = nullptr;
+  StringView name_;
+  int64_t sequence_nr_ = -1;
+
+  RecordFunction* parent_ = nullptr;
+
+  GetPackedInputsCallback inputs_cb_ = nullptr;
+  mutable bool inputs_initialized_ = false;
+  // initialized lazily by inputs_cb_
+  mutable std::vector<c10::IValue> inputs_;
+};
+
+// WARNING: all calls to pushCallback/popCallback are not thread safe and
+// must not overlap with other code execution
+using RecordFunctionCallback = std::function<void(const RecordFunction&)>;
+TORCH_API void pushCallback(RecordFunctionCallback, RecordFunctionCallback);
+TORCH_API void pushCallback(RecordFunctionCallback);
+TORCH_API void popCallback();
+
+} // namespace profiler
+}} // namespace torch::autograd