From 4aad366111f88010c9e4027d054da2fe357e1bfe Mon Sep 17 00:00:00 2001
From: Mike Iovine
Date: Wed, 1 Sep 2021 14:19:21 -0700
Subject: [PATCH] [Static Runtime] Make per-op latency readable by FAI-PEP
 (#64315)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64315

Add a new flag `generate_ai_pep_output` to `StaticRuntime::benchmark`. If set,
produces per-op-kind average total latency in milliseconds in a JSON format
recognized by [Facebook AI performance evaluation platform (FAI-PEP)](https://github.com/facebook/FAI-PEP).

This is useful for observing the impact of changes that make a big difference
for a specific op, but do not affect the overall Static Runtime latency by
more than a few percent.

Reviewed By: hlu1

Differential Revision: D30679352

fbshipit-source-id: c847fa6ea20774aaf1e7949b11db4421d1f70b7e
---
 torch/csrc/jit/runtime/static/impl.cpp | 29 ++++++++++++++++++++++++++++-
 torch/csrc/jit/runtime/static/impl.h   |  3 ++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index e224478..7697613 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -19,6 +19,11 @@
 #include
 #include
 
+#ifdef FBCODE_CAFFE2
+#include <folly/dynamic.h>
+#include <folly/json.h>
+#endif
+
 namespace torch {
 namespace jit {
 
@@ -873,12 +878,30 @@ c10::IValue StaticRuntime::operator()(
   return std::move(*outputs_[0]);
 }
 
+namespace {
+
+std::string generate_node_time_json(const std::string& kind, float millis) {
+#ifdef FBCODE_CAFFE2
+  folly::dynamic json = folly::dynamic::object();
+  json["type"] = kind;
+  json["metric"] = "latency";
+  json["unit"] = "ms";
+  json["value"] = millis;
+  return folly::toJson(json);
+#else
+  return "";
+#endif
+}
+
+} // namespace
+
 void StaticRuntime::benchmark(
     const std::vector<c10::IValue>& args,
     const std::unordered_map<std::string, c10::IValue>& kwargs,
     const int warmup_runs,
     const int main_runs,
-    bool print_per_node_time) {
+    bool print_per_node_time,
+    bool generate_ai_pep_output) {
   float time_per_iter = benchmark_model(args, kwargs, warmup_runs, main_runs);
   std::cout << "Static runtime ms per iter: " << time_per_iter
             << ". Iters per second: " << 1000.0 / time_per_iter << std::endl;
@@ -916,6 +939,10 @@ void StaticRuntime::benchmark(
     } else {
       std::cout << ")" << std::endl;
     }
+
+    if (generate_ai_pep_output) {
+      LOG(INFO) << "PyTorchObserver " << generate_node_time_json(kind, ms);
+    }
   }
   std::cout << std::setw(15) << results.total_time << " ms. in Total"
             << std::endl;
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
index d8a99f7..0d23787 100644
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -234,7 +234,8 @@ class TORCH_API StaticRuntime {
       const std::unordered_map<std::string, c10::IValue>& kwargs,
       const int warmup_runs,
       const int main_runs,
-      bool print_per_node_time = false);
+      bool print_per_node_time = false,
+      bool generate_ai_pep_output = false);
 
   float benchmark_model(
       const std::vector<c10::IValue>& args,
-- 
2.7.4
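
Note: with `generate_ai_pep_output` set, each per-op-kind summary is mirrored to the log in the `PyTorchObserver <json>` form that FAI-PEP scans benchmark output for. For illustration only (the op kind and latency value below are made up, and `folly::toJson` does not guarantee key order), a logged line would look roughly like:

```
PyTorchObserver {"type":"aten::mul","metric":"latency","unit":"ms","value":0.248}
```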
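
For context, here is a minimal sketch of driving the new flag from C++. The model path, input shapes, iteration counts, and the `StaticModule`/`StaticRuntime` construction below are illustrative assumptions, not part of this diff:

```cpp
#include <torch/csrc/jit/runtime/static/impl.h>
#include <torch/script.h>

#include <vector>

int main() {
  // Load a TorchScript module; "model.pt" is a placeholder path.
  torch::jit::Module module = torch::jit::load("model.pt");

  // Assumed construction: wrap the module in a StaticModule, then bind
  // a StaticRuntime to it.
  torch::jit::StaticModule smod(module);
  torch::jit::StaticRuntime runtime(smod);

  // Placeholder inputs; real shapes depend on the model.
  std::vector<c10::IValue> args{torch::randn({8, 16})};

  // 10 warmup iterations, 100 measured iterations; print per-node times
  // and emit the FAI-PEP-readable "PyTorchObserver" JSON lines per op kind.
  runtime.benchmark(
      args,
      /*kwargs=*/{},
      /*warmup_runs=*/10,
      /*main_runs=*/100,
      /*print_per_node_time=*/true,
      /*generate_ai_pep_output=*/true);
  return 0;
}
```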