From 4aad366111f88010c9e4027d054da2fe357e1bfe Mon Sep 17 00:00:00 2001
From: Mike Iovine
Date: Wed, 1 Sep 2021 14:19:21 -0700
Subject: [PATCH] [Static Runtime] Make per-op latency readable by FAI-PEP
 (#64315)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64315

Add a new flag `generate_ai_pep_output` to `StaticRuntime::benchmark`. If set,
produces per-op-kind average total latency in milliseconds in a JSON format
recognized by [Facebook AI performance evaluation platform (FAI-PEP)](https://github.com/facebook/FAI-PEP).

This is useful for observing the impact of changes that make a big difference
for a specific op, but do not affect the overall Static Runtime latency by
more than a few percent.

Reviewed By: hlu1

Differential Revision: D30679352

fbshipit-source-id: c847fa6ea20774aaf1e7949b11db4421d1f70b7e
---
 torch/csrc/jit/runtime/static/impl.cpp | 29 ++++++++++++++++++++++++++++-
 torch/csrc/jit/runtime/static/impl.h   |  3 ++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index e224478..7697613 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -19,6 +19,11 @@
 #include
 #include
 
+#ifdef FBCODE_CAFFE2
+#include <folly/dynamic.h>
+#include <folly/json.h>
+#endif
+
 namespace torch {
 namespace jit {
 
@@ -873,12 +878,30 @@ c10::IValue StaticRuntime::operator()(
   return std::move(*outputs_[0]);
 }
 
+namespace {
+
+std::string generate_node_time_json(const std::string& kind, float millis) {
+#ifdef FBCODE_CAFFE2
+  folly::dynamic json = folly::dynamic::object();
+  json["type"] = kind;
+  json["metric"] = "latency";
+  json["unit"] = "ms";
+  json["value"] = millis;
+  return folly::toJson(json);
+#else
+  return "";
+#endif
+}
+
+} // namespace
+
 void StaticRuntime::benchmark(
     const std::vector<c10::IValue>& args,
     const std::unordered_map<std::string, c10::IValue>& kwargs,
     const int warmup_runs,
     const int main_runs,
-    bool print_per_node_time) {
+    bool print_per_node_time,
+    bool generate_ai_pep_output) {
   float time_per_iter = benchmark_model(args, kwargs, warmup_runs, main_runs);
   std::cout << "Static runtime ms per iter: " << time_per_iter
             << ". Iters per second: " << 1000.0 / time_per_iter << std::endl;
@@ -916,6 +939,10 @@ void StaticRuntime::benchmark(
     } else {
       std::cout << ")" << std::endl;
     }
+
+    if (generate_ai_pep_output) {
+      LOG(INFO) << "PyTorchObserver " << generate_node_time_json(kind, ms);
+    }
   }
   std::cout << std::setw(15) << results.total_time << " ms. in Total"
             << std::endl;
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
index d8a99f7..0d23787 100644
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -234,7 +234,8 @@ class TORCH_API StaticRuntime {
       const std::unordered_map<std::string, c10::IValue>& kwargs,
       const int warmup_runs,
       const int main_runs,
-      bool print_per_node_time = false);
+      bool print_per_node_time = false,
+      bool generate_ai_pep_output = false);
 
   float benchmark_model(
       const std::vector<c10::IValue>& args,
-- 
2.7.4
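
Note: with `generate_ai_pep_output` set, each per-op-kind summary is mirrored to the log in the `PyTorchObserver <json>` form that FAI-PEP scans benchmark output for. For illustration only (the op kind and latency value below are made up, and `folly::toJson` does not guarantee key order), a logged line would look roughly like:

```
PyTorchObserver {"type":"aten::mul","metric":"latency","unit":"ms","value":0.248}
```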
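
For context, here is a minimal sketch of driving the new flag from C++. The model path, input shapes, iteration counts, and the `StaticModule`/`StaticRuntime` construction below are illustrative assumptions, not part of this diff:

```cpp
#include <torch/csrc/jit/runtime/static/impl.h>
#include <torch/script.h>

#include <vector>

int main() {
  // Load a TorchScript module; "model.pt" is a placeholder path.
  torch::jit::Module module = torch::jit::load("model.pt");

  // Assumed construction: wrap the module in a StaticModule, then bind
  // a StaticRuntime to it.
  torch::jit::StaticModule smod(module);
  torch::jit::StaticRuntime runtime(smod);

  // Placeholder inputs; real shapes depend on the model.
  std::vector<c10::IValue> args{torch::randn({8, 16})};

  // 10 warmup iterations, 100 measured iterations; print per-node times
  // and emit the FAI-PEP-readable "PyTorchObserver" JSON lines per op kind.
  runtime.benchmark(
      args,
      /*kwargs=*/{},
      /*warmup_runs=*/10,
      /*main_runs=*/100,
      /*print_per_node_time=*/true,
      /*generate_ai_pep_output=*/true);
  return 0;
}
```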