From: Summer Deng Date: Sat, 9 Mar 2019 03:00:43 +0000 (-0800) Subject: Int8 FC performance debugging (#17700) X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~908 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c10c73f0473baf80305bb800dbee113ff1067a95;p=platform%2Fupstream%2Fpytorch.git Int8 FC performance debugging (#17700) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/17700 Add performance debugging utilities in DNNLOWP FC operator and the python script Reviewed By: amylittleyang Differential Revision: D14321299 fbshipit-source-id: 50dbd7b352a1da5d2ecb659d8003e71e70750063 --- diff --git a/caffe2/operators/fully_connected_op.h b/caffe2/operators/fully_connected_op.h index ea305d9..8940730 100644 --- a/caffe2/operators/fully_connected_op.h +++ b/caffe2/operators/fully_connected_op.h @@ -7,6 +7,10 @@ #include "caffe2/utils/conversions.h" #include "caffe2/utils/math.h" +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN +#include +#endif + namespace caffe2 { // This is Caffe's InnerProductOp, with a name that fits its purpose better. @@ -33,6 +37,16 @@ class FullyConnectedOp final : public Operator { typename T_Y, typename MATH> bool DoRunWithType() { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + std::chrono::time_point t_very_begin, t_begin, + t_end; + /* if (VLOG_IS_ON(3)) */ + { + t_begin = std::chrono::system_clock::now(); + t_very_begin = t_begin; + } +#endif + const auto& X = Input(0); const auto& W = Input(1); const auto& b = Input(2); @@ -91,6 +105,16 @@ class FullyConnectedOp final : public Operator { math_type = TensorProto_DataType_FLOAT16; } +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(3)) */ + { + t_end = std::chrono::system_clock::now(); + double dt = std::chrono::duration(t_end - t_begin).count(); + LOG(INFO) << "@PERF this=" << this << " before_gemm: " << dt * 1e3 + << " ms"; + t_begin = std::chrono::system_clock::now(); + } +#endif // W * x math::Gemm( CblasNoTrans, @@ -105,9 +129,20 @@ class FullyConnectedOp final : public Operator { Y->template mutable_data(), &context_, math_type); + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(3)) */ + { + t_end = std::chrono::system_clock::now(); + double dt = std::chrono::duration(t_end - t_begin).count(); + LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms"; + t_begin = std::chrono::system_clock::now(); + } +#endif // Add bias term if (!bias_multiplier_.has_value()) { - bias_multiplier_ = caffe2::empty({M}, at::dtype().device(Context::GetDeviceType())); + bias_multiplier_ = + caffe2::empty({M}, at::dtype().device(Context::GetDeviceType())); math::Set( M, convert::To(1), @@ -135,6 +170,16 @@ class FullyConnectedOp final : public Operator { Y->template mutable_data(), &context_, math_type); + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(3)) */ + { + t_end = std::chrono::system_clock::now(); + double dt = std::chrono::duration(t_end - t_begin).count(); + LOG(INFO) << "@PERF this=" << this << " add_bias : " << dt * 1e3 << " ms"; + t_begin = std::chrono::system_clock::now(); + } +#endif return true; } diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc index c19f43e..5eee0a9 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc @@ -1,6 +1,8 @@ #include "fully_connected_dnnlowp_op.h" +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN #include +#endif #include "caffe2/core/flags.h" #include "caffe2/core/tensor_int8.h" @@ -91,25 +93,30 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { return true; } +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN chrono::time_point t_very_begin, t_begin, t_end; - - if (VLOG_IS_ON(3)) { + /* if (VLOG_IS_ON(3)) */ + { t_begin = chrono::system_clock::now(); t_very_begin = t_begin; } +#endif // Get quantization parameters if (!GetQuantizationParameters_()) { return false; } - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(3)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3 + << " ms"; t_begin = chrono::system_clock::now(); } +#endif const auto& X = InputTensorCPU_(0); const auto& W = InputTensorCPU_(1); @@ -130,13 +137,17 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { const T* Xdata = nullptr; vector X_temp; - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3 + << " ms"; t_begin = chrono::system_clock::now(); } +#endif + if (Wq_packed_) { // fast path to use fbgemm using namespace fbgemm; @@ -144,22 +155,31 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { if (X.template IsType() || !dequantize_output_) { // Only when input and output are float, we don't need input to be // quantized. - if (VLOG_IS_ON(3)) { - t_begin = chrono::system_clock::now(); - } + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_begin = chrono::system_clock::now(); } +#endif + Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp); - if (VLOG_IS_ON(3)) { + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this + << " input quantization: " << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif } - if (VLOG_IS_ON(3)) { - t_begin = chrono::system_clock::now(); - } +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_begin = chrono::system_clock::now(); } +#endif + if (!dequantize_output_) { Y_int32_.resize(Y->size()); DoNothing<> doNothingObj{}; @@ -396,17 +416,24 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { } // dequantize_output } else { // Quantize X - if (VLOG_IS_ON(3)) { - t_begin = chrono::system_clock::now(); - } + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_begin = chrono::system_clock::now(); } +#endif + Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp); - if (VLOG_IS_ON(3)) { + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this << " input quantization: " << dt * 1e3 + << " ms"; t_begin = chrono::system_clock::now(); } +#endif Y_int32_.resize(Y->size()); for (int i = 0; i < M; ++i) { @@ -429,13 +456,16 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { StoreMatrixInMatrixMarketFormat(N, K, Wdata, this->debug_def().input(1)); } - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif // Adjust with bias and zero_point and then requantize // See batch_matmul_dnnlowp_op.cc to why we compute column_offsets, @@ -495,20 +525,23 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { MeasureQuantizationError_(); - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this - << " bias-offset-requantization: " << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this + << " bias-offset-requantization: " << dt * 1e3 << " ms"; t_end = chrono::system_clock::now(); double ops = 2. * M * N * K; dt = chrono::duration(t_end - t_very_begin).count(); double gops = ops / dt / 1e9; - VLOG(3) << "@PERF this=" << this - << " output=" << this->debug_def().output(0) << " " << M << "x" << N - << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops"; + LOG(INFO) << "@PERF this=" << this + << " output=" << this->debug_def().output(0) << " " << M << "x" + << N << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops"; } +#endif return true; } @@ -517,20 +550,25 @@ template bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { using namespace dnnlowp; +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN chrono::time_point t_begin, t_end; - if (VLOG_IS_ON(3)) { - t_begin = chrono::system_clock::now(); - } + /* if (VLOG_IS_ON(1)) */ + { t_begin = chrono::system_clock::now(); } +#endif + // Choose quantization for X in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get()); - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf " - << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf " + << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif // Quantize W const auto& X = InputTensorCPU_(0); @@ -618,12 +656,16 @@ bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { filter_qparams_[0]); } - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif + // Pre-compute column_offset // If input tensor doesn't use dynamic quantization, we fold column_offsets_ // into bias. @@ -641,13 +683,17 @@ bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { K, N, W_quantized_.data(), filter_qparams_, *column_offsets_); } } - if (VLOG_IS_ON(3)) { + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " Calculate column offset " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this << " Calculate column offset " + << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif // Quantize bias if (!is_weight_constant_ || (!b_quantized_data_ && !b_dequantized_data_) || @@ -736,12 +782,16 @@ bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { vector().swap(W_quantized_); } - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " Quantize bias " << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " Quantize bias " << dt * 1e3 + << " ms"; t_begin = chrono::system_clock::now(); } +#endif if (!dequantize_output_ && !requantization_param_selected_) { GetOutputQuantizationParams_(); @@ -762,13 +812,18 @@ bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { Fp32Op_()->Get()->RunOnDevice(); } } - if (VLOG_IS_ON(3)) { + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " GetOutputQuantizationParams " - << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " GetOutputQuantizationParams " + << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif + return true; }