#include "caffe2/utils/conversions.h"
#include "caffe2/utils/math.h"
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+#include <chrono>
+#endif
+
namespace caffe2 {
// This is Caffe's InnerProductOp, with a name that fits its purpose better.
typename T_Y,
typename MATH>
bool DoRunWithType() {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ std::chrono::time_point<std::chrono::system_clock> t_very_begin, t_begin,
+ t_end;
+ /* if (VLOG_IS_ON(3)) */
+ {
+ t_begin = std::chrono::system_clock::now();
+ t_very_begin = t_begin;
+ }
+#endif
+
const auto& X = Input(0);
const auto& W = Input(1);
const auto& b = Input(2);
math_type = TensorProto_DataType_FLOAT16;
}
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(3)) */
+ {
+ t_end = std::chrono::system_clock::now();
+ double dt = std::chrono::duration<double>(t_end - t_begin).count();
+ LOG(INFO) << "@PERF this=" << this << " before_gemm: " << dt * 1e3
+ << " ms";
+ t_begin = std::chrono::system_clock::now();
+ }
+#endif
// W * x
math::Gemm<T_X, Context, Engine>(
CblasNoTrans,
Y->template mutable_data<T_Y>(),
&context_,
math_type);
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(3)) */
+ {
+ t_end = std::chrono::system_clock::now();
+ double dt = std::chrono::duration<double>(t_end - t_begin).count();
+ LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
+ t_begin = std::chrono::system_clock::now();
+ }
+#endif
// Add bias term
if (!bias_multiplier_.has_value()) {
- bias_multiplier_ = caffe2::empty({M}, at::dtype<T_B>().device(Context::GetDeviceType()));
+ bias_multiplier_ =
+ caffe2::empty({M}, at::dtype<T_B>().device(Context::GetDeviceType()));
math::Set<T_B, Context>(
M,
convert::To<float, T_B>(1),
Y->template mutable_data<T_Y>(),
&context_,
math_type);
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(3)) */
+ {
+ t_end = std::chrono::system_clock::now();
+ double dt = std::chrono::duration<double>(t_end - t_begin).count();
+ LOG(INFO) << "@PERF this=" << this << " add_bias : " << dt * 1e3 << " ms";
+ t_begin = std::chrono::system_clock::now();
+ }
+#endif
return true;
}
#include "fully_connected_dnnlowp_op.h"
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
#include <chrono>
+#endif
#include "caffe2/core/flags.h"
#include "caffe2/core/tensor_int8.h"
return true;
}
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
chrono::time_point<chrono::system_clock> t_very_begin, t_begin, t_end;
-
- if (VLOG_IS_ON(3)) {
+ /* if (VLOG_IS_ON(3)) */
+ {
t_begin = chrono::system_clock::now();
t_very_begin = t_begin;
}
+#endif
// Get quantization parameters
if (!GetQuantizationParameters_()) {
return false;
}
- if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(3)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3
- << " ms";
+ LOG(INFO) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3
+ << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
const auto& X = InputTensorCPU_(0);
const auto& W = InputTensorCPU_(1);
const T* Xdata = nullptr;
vector<T> X_temp;
- if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3
- << " ms";
+ LOG(INFO) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3
+ << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
+
if (Wq_packed_) {
// fast path to use fbgemm
using namespace fbgemm;
if (X.template IsType<T>() || !dequantize_output_) {
// Only when input and output are float, we don't need input to be
// quantized.
- if (VLOG_IS_ON(3)) {
- t_begin = chrono::system_clock::now();
- }
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ { t_begin = chrono::system_clock::now(); }
+#endif
+
Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);
- if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3
- << " ms";
+ LOG(INFO) << "@PERF this=" << this
+ << " input quantization: " << dt * 1e3 << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
}
- if (VLOG_IS_ON(3)) {
- t_begin = chrono::system_clock::now();
- }
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ { t_begin = chrono::system_clock::now(); }
+#endif
+
if (!dequantize_output_) {
Y_int32_.resize(Y->size());
DoNothing<> doNothingObj{};
} // dequantize_output
} else {
// Quantize X
- if (VLOG_IS_ON(3)) {
- t_begin = chrono::system_clock::now();
- }
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ { t_begin = chrono::system_clock::now(); }
+#endif
+
Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);
- if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3
- << " ms";
+ LOG(INFO) << "@PERF this=" << this << " input quantization: " << dt * 1e3
+ << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
Y_int32_.resize(Y->size());
for (int i = 0; i < M; ++i) {
StoreMatrixInMatrixMarketFormat(N, K, Wdata, this->debug_def().input(1));
}
- if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
+ LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
// Adjust with bias and zero_point and then requantize
// See batch_matmul_dnnlowp_op.cc to why we compute column_offsets,
MeasureQuantizationError_();
- if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this
- << " bias-offset-requantization: " << dt * 1e3 << " ms";
+ LOG(INFO) << "@PERF this=" << this
+ << " bias-offset-requantization: " << dt * 1e3 << " ms";
t_end = chrono::system_clock::now();
double ops = 2. * M * N * K;
dt = chrono::duration<double>(t_end - t_very_begin).count();
double gops = ops / dt / 1e9;
- VLOG(3) << "@PERF this=" << this
- << " output=" << this->debug_def().output(0) << " " << M << "x" << N
- << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
+ LOG(INFO) << "@PERF this=" << this
+ << " output=" << this->debug_def().output(0) << " " << M << "x"
+ << N << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
}
+#endif
return true;
}
bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
using namespace dnnlowp;
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
chrono::time_point<chrono::system_clock> t_begin, t_end;
- if (VLOG_IS_ON(3)) {
- t_begin = chrono::system_clock::now();
- }
+ /* if (VLOG_IS_ON(1)) */
+ { t_begin = chrono::system_clock::now(); }
+#endif
+
// Choose quantization for X
in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
- if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf "
- << dt * 1e3 << " ms";
+ LOG(INFO) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf "
+ << dt * 1e3 << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
// Quantize W
const auto& X = InputTensorCPU_(0);
filter_qparams_[0]);
}
- if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms";
+ LOG(INFO) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
+
// Pre-compute column_offset
// If input tensor doesn't use dynamic quantization, we fold column_offsets_
// into bias.
K, N, W_quantized_.data(), filter_qparams_, *column_offsets_);
}
}
- if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " Calculate column offset " << dt * 1e3
- << " ms";
+ LOG(INFO) << "@PERF this=" << this << " Calculate column offset "
+ << dt * 1e3 << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
// Quantize bias
if (!is_weight_constant_ || (!b_quantized_data_ && !b_dequantized_data_) ||
vector<T_signed>().swap(W_quantized_);
}
- if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " Quantize bias " << dt * 1e3 << " ms";
+ LOG(INFO) << "@PERF this=" << this << " Quantize bias " << dt * 1e3
+ << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
if (!dequantize_output_ && !requantization_param_selected_) {
GetOutputQuantizationParams_();
Fp32Op_()->Get()->RunOnDevice();
}
}
- if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+ /* if (VLOG_IS_ON(1)) */
+ {
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
- VLOG(3) << "@PERF this=" << this << " GetOutputQuantizationParams "
- << dt * 1e3 << " ms";
+ LOG(INFO) << "@PERF this=" << this << " GetOutputQuantizationParams "
+ << dt * 1e3 << " ms";
t_begin = chrono::system_clock::now();
}
+#endif
+
return true;
}