From: Summer Deng <summerdeng@fb.com>
Date: Sat, 9 Mar 2019 03:00:43 +0000 (-0800)
Subject: Int8 FC performance debugging (#17700)
X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~908
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c10c73f0473baf80305bb800dbee113ff1067a95;p=platform%2Fupstream%2Fpytorch.git

Int8 FC performance debugging (#17700)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17700

Add performance debugging utilities in DNNLOWP FC operator and the python script

Reviewed By: amylittleyang

Differential Revision: D14321299

fbshipit-source-id: 50dbd7b352a1da5d2ecb659d8003e71e70750063
---

diff --git a/caffe2/operators/fully_connected_op.h b/caffe2/operators/fully_connected_op.h
index ea305d9..8940730 100644
--- a/caffe2/operators/fully_connected_op.h
+++ b/caffe2/operators/fully_connected_op.h
@@ -7,6 +7,10 @@
 #include "caffe2/utils/conversions.h"
 #include "caffe2/utils/math.h"
 
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+#include <chrono>
+#endif
+
 namespace caffe2 {
 
 // This is Caffe's InnerProductOp, with a name that fits its purpose better.
@@ -33,6 +37,16 @@ class FullyConnectedOp final : public Operator<Context> {
       typename T_Y,
       typename MATH>
   bool DoRunWithType() {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    std::chrono::time_point<std::chrono::system_clock> t_very_begin, t_begin,
+        t_end;
+    /* if (VLOG_IS_ON(3)) */
+    {
+      t_begin = std::chrono::system_clock::now();
+      t_very_begin = t_begin;
+    }
+#endif
+
     const auto& X = Input(0);
     const auto& W = Input(1);
     const auto& b = Input(2);
@@ -91,6 +105,16 @@ class FullyConnectedOp final : public Operator<Context> {
       math_type = TensorProto_DataType_FLOAT16;
     }
 
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    /* if (VLOG_IS_ON(3)) */
+    {
+      t_end = std::chrono::system_clock::now();
+      double dt = std::chrono::duration<double>(t_end - t_begin).count();
+      LOG(INFO) << "@PERF this=" << this << " before_gemm: " << dt * 1e3
+                << " ms";
+      t_begin = std::chrono::system_clock::now();
+    }
+#endif
     // W * x
     math::Gemm<T_X, Context, Engine>(
         CblasNoTrans,
@@ -105,9 +129,20 @@ class FullyConnectedOp final : public Operator<Context> {
         Y->template mutable_data<T_Y>(),
         &context_,
         math_type);
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    /* if (VLOG_IS_ON(3)) */
+    {
+      t_end = std::chrono::system_clock::now();
+      double dt = std::chrono::duration<double>(t_end - t_begin).count();
+      LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
+      t_begin = std::chrono::system_clock::now();
+    }
+#endif
     // Add bias term
     if (!bias_multiplier_.has_value()) {
-      bias_multiplier_ = caffe2::empty({M}, at::dtype<T_B>().device(Context::GetDeviceType()));
+      bias_multiplier_ =
+          caffe2::empty({M}, at::dtype<T_B>().device(Context::GetDeviceType()));
       math::Set<T_B, Context>(
           M,
           convert::To<float, T_B>(1),
@@ -135,6 +170,16 @@ class FullyConnectedOp final : public Operator<Context> {
         Y->template mutable_data<T_Y>(),
         &context_,
         math_type);
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    /* if (VLOG_IS_ON(3)) */
+    {
+      t_end = std::chrono::system_clock::now();
+      double dt = std::chrono::duration<double>(t_end - t_begin).count();
+      LOG(INFO) << "@PERF this=" << this << " add_bias : " << dt * 1e3 << " ms";
+      t_begin = std::chrono::system_clock::now();
+    }
+#endif
     return true;
   }
 
diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc
index c19f43e..5eee0a9 100644
--- a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc
+++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc
@@ -1,6 +1,8 @@
 #include "fully_connected_dnnlowp_op.h"
 
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
 #include <chrono>
+#endif
 
 #include "caffe2/core/flags.h"
 #include "caffe2/core/tensor_int8.h"
@@ -91,25 +93,30 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
     return true;
   }
 
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
   chrono::time_point<chrono::system_clock> t_very_begin, t_begin, t_end;
-
-  if (VLOG_IS_ON(3)) {
+  /* if (VLOG_IS_ON(3)) */
+  {
     t_begin = chrono::system_clock::now();
     t_very_begin = t_begin;
   }
+#endif
 
   // Get quantization parameters
   if (!GetQuantizationParameters_()) {
     return false;
   }
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(3)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3
-            << " ms";
+    LOG(INFO) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3
+              << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   const auto& X = InputTensorCPU_(0);
   const auto& W = InputTensorCPU_(1);
@@ -130,13 +137,17 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
   const T* Xdata = nullptr;
   vector<T> X_temp;
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3
-            << " ms";
+    LOG(INFO) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3
+              << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
+
   if (Wq_packed_) {
     // fast path to use fbgemm
     using namespace fbgemm;
@@ -144,22 +155,31 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
     if (X.template IsType<T>() || !dequantize_output_) {
       // Only when input and output are float, we don't need input to be
       // quantized.
-      if (VLOG_IS_ON(3)) {
-        t_begin = chrono::system_clock::now();
-      }
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+      /* if (VLOG_IS_ON(1)) */
+      { t_begin = chrono::system_clock::now(); }
+#endif
+
       Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);
-      if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+      /* if (VLOG_IS_ON(1)) */
+      {
         t_end = chrono::system_clock::now();
         double dt = chrono::duration<double>(t_end - t_begin).count();
-        VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3
-                << " ms";
+        LOG(INFO) << "@PERF this=" << this
+                  << " input quantization: " << dt * 1e3 << " ms";
         t_begin = chrono::system_clock::now();
       }
+#endif
     }
 
-    if (VLOG_IS_ON(3)) {
-      t_begin = chrono::system_clock::now();
-    }
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    /* if (VLOG_IS_ON(1)) */
+    { t_begin = chrono::system_clock::now(); }
+#endif
+
     if (!dequantize_output_) {
       Y_int32_.resize(Y->size());
       DoNothing<> doNothingObj{};
@@ -396,17 +416,24 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
     } // dequantize_output
   } else {
     // Quantize X
-    if (VLOG_IS_ON(3)) {
-      t_begin = chrono::system_clock::now();
-    }
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    /* if (VLOG_IS_ON(1)) */
+    { t_begin = chrono::system_clock::now(); }
+#endif
+
     Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);
-    if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    /* if (VLOG_IS_ON(1)) */
+    {
       t_end = chrono::system_clock::now();
       double dt = chrono::duration<double>(t_end - t_begin).count();
-      VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3
-              << " ms";
+      LOG(INFO) << "@PERF this=" << this << " input quantization: " << dt * 1e3
+                << " ms";
       t_begin = chrono::system_clock::now();
     }
+#endif
 
     Y_int32_.resize(Y->size());
     for (int i = 0; i < M; ++i) {
@@ -429,13 +456,16 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
     StoreMatrixInMatrixMarketFormat(N, K, Wdata, this->debug_def().input(1));
   }
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
 
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   // Adjust with bias and zero_point and then requantize
   // See batch_matmul_dnnlowp_op.cc to why we compute column_offsets,
@@ -495,20 +525,23 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
 
   MeasureQuantizationError_();
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this
-            << " bias-offset-requantization: " << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this
+              << " bias-offset-requantization: " << dt * 1e3 << " ms";
 
     t_end = chrono::system_clock::now();
     double ops = 2. * M * N * K;
     dt = chrono::duration<double>(t_end - t_very_begin).count();
     double gops = ops / dt / 1e9;
-    VLOG(3) << "@PERF this=" << this
-            << " output=" << this->debug_def().output(0) << " " << M << "x" << N
-            << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
+    LOG(INFO) << "@PERF this=" << this
+              << " output=" << this->debug_def().output(0) << " " << M << "x"
+              << N << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
   }
+#endif
 
   return true;
 }
@@ -517,20 +550,25 @@ template <typename T>
 bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
   using namespace dnnlowp;
 
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
   chrono::time_point<chrono::system_clock> t_begin, t_end;
-  if (VLOG_IS_ON(3)) {
-    t_begin = chrono::system_clock::now();
-  }
+  /* if (VLOG_IS_ON(1)) */
+  { t_begin = chrono::system_clock::now(); }
+#endif
+
   // Choose quantization for X
   in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf "
-            << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf "
+              << dt * 1e3 << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   // Quantize W
   const auto& X = InputTensorCPU_(0);
@@ -618,12 +656,16 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
         filter_qparams_[0]);
   }
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
+
   // Pre-compute column_offset
   // If input tensor doesn't use dynamic quantization, we fold column_offsets_
   // into bias.
@@ -641,13 +683,17 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
           K, N, W_quantized_.data(), filter_qparams_, *column_offsets_);
     }
   }
-  if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " Calculate column offset " << dt * 1e3
-            << " ms";
+    LOG(INFO) << "@PERF this=" << this << " Calculate column offset "
+              << dt * 1e3 << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   // Quantize bias
   if (!is_weight_constant_ || (!b_quantized_data_ && !b_dequantized_data_) ||
@@ -736,12 +782,16 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
     vector<T_signed>().swap(W_quantized_);
   }
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " Quantize bias " << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " Quantize bias " << dt * 1e3
+              << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   if (!dequantize_output_ && !requantization_param_selected_) {
     GetOutputQuantizationParams_();
@@ -762,13 +812,18 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
       Fp32Op_()->Get()->RunOnDevice();
     }
   }
-  if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " GetOutputQuantizationParams "
-            << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " GetOutputQuantizationParams "
+              << dt * 1e3 << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
+
   return true;
 }