From dad0dbd3b926b2729a5893758dc6f085b170d691 Mon Sep 17 00:00:00 2001
From: Jongsoo Park
Date: Fri, 15 Feb 2019 09:44:32 -0800
Subject: [PATCH] merge fully_connected_rowwise_dnnlowp_op into fully_connected_dnnlowp_op (#17105)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17105

This makes FC with rowwise quantization faster, reduces code duplication,
and makes the code consistent with Convolution.

Reviewed By: csummersea

Differential Revision: D14080461

fbshipit-source-id: 2b0e67b86e7e3029c90751a8824bf80ae1223680
---
 caffe2/quantization/server/CMakeLists.txt          |   1 -
 caffe2/quantization/server/conv_dnnlowp_op.cc      |   2 -
 caffe2/quantization/server/conv_dnnlowp_op.h       |   1 -
 .../server/fbgemm_pack_matrix_cache.cc             |   9 +-
 .../quantization/server/fbgemm_pack_matrix_cache.h |   3 +-
 caffe2/quantization/server/fbgemm_pack_op.cc       |  12 +-
 caffe2/quantization/server/fbgemm_pack_op.h        |   1 +
 .../server/fully_connected_dnnlowp_acc16_op.cc     |  26 +-
 .../server/fully_connected_dnnlowp_op.cc           | 298 ++++++++++-----
 .../server/fully_connected_dnnlowp_op.h            |   8 +-
 .../server/fully_connected_rowwise_dnnlowp_op.cc   | 401 ---------------------
 .../server/fully_connected_rowwise_dnnlowp_op.h    |  49 ---
 .../fully_connected_rowwise_dnnlowp_op_test.py     |   4 +-
 13 files changed, 246 insertions(+), 569 deletions(-)
 delete mode 100644 caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op.cc
 delete mode 100644 caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op.h

diff --git a/caffe2/quantization/server/CMakeLists.txt b/caffe2/quantization/server/CMakeLists.txt
index b21eab5..2304012 100644
--- a/caffe2/quantization/server/CMakeLists.txt
+++ b/caffe2/quantization/server/CMakeLists.txt
@@ -30,7 +30,6 @@ list(APPEND Caffe2_CPU_SRCS
   "${CMAKE_CURRENT_SOURCE_DIR}/fully_connected_dnnlowp_acc16_op.cc"
   "${CMAKE_CURRENT_SOURCE_DIR}/fully_connected_dnnlowp_op.cc"
   "${CMAKE_CURRENT_SOURCE_DIR}/fully_connected_fake_lowp_op.cc"
-  "${CMAKE_CURRENT_SOURCE_DIR}/fully_connected_rowwise_dnnlowp_op.cc"
   "${CMAKE_CURRENT_SOURCE_DIR}/group_norm_dnnlowp_op.cc"
   "${CMAKE_CURRENT_SOURCE_DIR}/lstm_unit_dnnlowp_op.cc"
   "${CMAKE_CURRENT_SOURCE_DIR}/pool_dnnlowp_op.cc"
diff --git a/caffe2/quantization/server/conv_dnnlowp_op.cc b/caffe2/quantization/server/conv_dnnlowp_op.cc
index cdc6260..e75789f 100644
--- a/caffe2/quantization/server/conv_dnnlowp_op.cc
+++ b/caffe2/quantization/server/conv_dnnlowp_op.cc
@@ -341,12 +341,10 @@ void ConvDNNLowPOp<T, ReluFused>::QuantizeWeight_() {
     }
   }

-  filter_scales_.resize(filter_qparams_.size());
   filter_zero_points_.resize(filter_qparams_.size());
   requantization_params_.resize(filter_qparams_.size());
   requantization_multipliers_.resize(filter_qparams_.size());
   for (int i = 0; i < filter_qparams_.size(); ++i) {
-    filter_scales_[i] = filter_qparams_[i].scale;
     filter_zero_points_[i] = filter_qparams_[i].zero_point;
   }

diff --git a/caffe2/quantization/server/conv_dnnlowp_op.h b/caffe2/quantization/server/conv_dnnlowp_op.h
index c8b6574..6f6c8ea 100644
--- a/caffe2/quantization/server/conv_dnnlowp_op.h
+++ b/caffe2/quantization/server/conv_dnnlowp_op.h
@@ -89,7 +89,6 @@ class ConvDNNLowPOp : public ConvPoolDNNLowPOpBase<T, ConvFp32Op> {
   std::vector<std::int32_t> Y_int32_;

   std::vector<dnnlowp::TensorQuantizationParams> filter_qparams_;
-  std::vector<float> filter_scales_;
   std::vector<std::int32_t> filter_zero_points_;

   std::vector<float> requantization_multipliers_;
diff --git a/caffe2/quantization/server/fbgemm_pack_matrix_cache.cc b/caffe2/quantization/server/fbgemm_pack_matrix_cache.cc
index 6ee2913..463e0c4 100644
--- a/caffe2/quantization/server/fbgemm_pack_matrix_cache.cc
+++ b/caffe2/quantization/server/fbgemm_pack_matrix_cache.cc
@@ -15,8 +15,7 @@ shared_ptr<PackBMatrix<int8_t, ACC_T>> GetOrCreateFbgemmPackBMatrix(
     int32_t n,
     const void* orig_data,
     const int8_t* quantized_data,
-    int32_t ld,
-    int32_t zero_point) {
+    int32_t ld) {
   static std::map<
       std::tuple<int, int, const void*>,
       weak_ptr<PackBMatrix<int8_t, ACC_T>>>
@@ -65,8 +64,7 @@ GetOrCreateFbgemmPackBMatrix(
     int32_t n,
     const void* orig_data,
     const int8_t* quantized_data,
-    int32_t ld,
-    int32_t zero_point);
+    int32_t ld);

 template shared_ptr<PackBMatrix<int8_t, int16_t>>
 GetOrCreateFbgemmPackBMatrix<int16_t>(
@@ -75,7 +73,6 @@ GetOrCreateFbgemmPackBMatrix(
     int32_t n,
     const void* orig_data,
     const int8_t* quantized_data,
-    int32_t ld,
-    int32_t zero_point);
+    int32_t ld);

 } // namespace caffe2
diff --git a/caffe2/quantization/server/fbgemm_pack_matrix_cache.h b/caffe2/quantization/server/fbgemm_pack_matrix_cache.h
index 7a7b3c5..fb5ff59 100644
--- a/caffe2/quantization/server/fbgemm_pack_matrix_cache.h
+++ b/caffe2/quantization/server/fbgemm_pack_matrix_cache.h
@@ -17,7 +17,6 @@ GetOrCreateFbgemmPackBMatrix(
     std::int32_t n,
     const void* orig_data,
     const std::int8_t* quantized_data,
-    std::int32_t ld,
-    std::int32_t zero_point);
+    std::int32_t ld);

 } // namespace caffe2
diff --git a/caffe2/quantization/server/fbgemm_pack_op.cc b/caffe2/quantization/server/fbgemm_pack_op.cc
index e0852ba..704d4e1 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.cc
+++ b/caffe2/quantization/server/fbgemm_pack_op.cc
@@ -211,7 +211,12 @@ FullyConnectedDNNLowPPackWeightOp::FullyConnectedDNNLowPPackWeightOp(
     const OperatorDef& operator_def,
     Workspace* ws)
     : DNNLowPOp<uint8_t, FCFp32Op>(operator_def, ws),
-      axis_w_(this->GetSingleArgument<int32_t>("axis_w", 1)) {
+      axis_w_(this->GetSingleArgument<int32_t>("axis_w", 1)),
+      quantize_channelwise_(
+          this->GetSingleArgument<bool>("quantize_channelwise", false)) {
+  if (this->debug_def().engine() == "DNNLOWP_ROWWISE") {
+    quantize_channelwise_ = true;
+  }
   if (this->debug_def().engine() == "DNNLOWP_ACC16") {
     nbits_in_non_outlier_ = this->GetSingleArgument<int>(
         "nbits_in_non_outlier", FLAGS_caffe2_dnnlowp_nbits_in_non_outlier);
@@ -231,14 +236,13 @@ bool FullyConnectedDNNLowPPackWeightOp::RunOnDevice() {
   // This is just a convenient way to pass tensor shape information
   Y->original_tensor.ResizeLike(filter);

-  Y->qparams.resize((this->debug_def().engine() == "DNNLOWP_ROWWISE") ? N : 1);
+  Y->qparams.resize(quantize_channelwise_ ? N : 1);

   vector<int8_t> W_quantized;
   QuantizeWeight<uint8_t>(
       InputBlob(0), K, N, Y->qparams, W_quantized, qfactory_.get());

-  if (this->InputIsType<int8::Int8TensorCPU>(0) &&
-      this->debug_def().engine() == "DNNLOWP_ROWWISE") {
+  if (this->InputIsType<int8::Int8TensorCPU>(0) && quantize_channelwise_) {
     static int log_occurences = 0;
     if (log_occurences < 32) {
       ++log_occurences;
diff --git a/caffe2/quantization/server/fbgemm_pack_op.h b/caffe2/quantization/server/fbgemm_pack_op.h
index 8d28711..a059484 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.h
+++ b/caffe2/quantization/server/fbgemm_pack_op.h
@@ -22,6 +22,7 @@ class FullyConnectedDNNLowPPackWeightOp final

  private:
   int axis_w_;
+  bool quantize_channelwise_;
   int nbits_in_non_outlier_; // only for DNNLOWP_ACC16

   INPUT_TAGS(FILTER, BIAS);
diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.cc b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.cc
index ecee4bd..343c398 100644
--- a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.cc
+++ b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.cc
@@ -45,6 +45,11 @@ bool FullyConnectedDNNLowPAcc16Op::RunOnDevice() {
   const uint8_t* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);

+  if (this->quantize_channelwise_) {
+    LOG(WARNING) << "FC with 16-bit accumulation doesn't work with per-channel "
+                    "quantization yet.";
+  }
+
   // Pack W if needed
   if (!Wq_acc16_packed_ || !is_weight_constant_) {
     if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
@@ -114,7 +119,7 @@ bool FullyConnectedDNNLowPAcc16Op::RunOnDevice() {
     this->row_offsets_.resize(row_offset_size_per_thread);
     this->X_pack_buf_.resize(x_pack_buf_size_per_thread);

-    // TODO: use PackAMatrix if in_qparams_[1].zero_point == 0
+    // TODO: use PackAMatrix if filter_qparams_[0].zero_point == 0
     PackAWithRowOffset<uint8_t, int16_t> packA(
         matrix_op_t::NoTranspose,
         M,
@@ -129,10 +134,10 @@ bool FullyConnectedDNNLowPAcc16Op::RunOnDevice() {
     DoNothing<> doNothingObj{};
     ReQuantizeOutput<false /* FUSE_RELU */> reqObj(
         doNothingObj,
-        &requantization_params_.real_multiplier,
+        this->requantization_multipliers_.data(),
         out_qparams_.zero_point,
         column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
-        &in_qparams_[1].zero_point,
+        this->filter_zero_points_.data(),
         packA.getRowOffsetBuffer(),
         column_offsets_->empty() ? nullptr : column_offsets_->data(),
         this->b_quantized_data_,
@@ -170,9 +175,9 @@ bool FullyConnectedDNNLowPAcc16Op::RunOnDevice() {
     ReQuantizeForFloat<false /* FUSE_RELU*/> reqObj(
         doNothingObj,
         in_qparams_[0].scale,
-        &in_qparams_[1].scale,
+        this->filter_scales_.data(),
         column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
-        &in_qparams_[1].zero_point,
+        this->filter_zero_points_.data(),
         packA.getRowOffsetBuffer(),
         column_offsets_->empty() ? nullptr : column_offsets_->data(),
         this->b_dequantized_data_,
@@ -220,16 +225,17 @@ bool FullyConnectedDNNLowPAcc16Op::RunOnDevice() {
       for (int k = 0; k < K; ++k) {
         row_offset += Xdata[i * K + k];
       }
-      row_offset *= in_qparams_[1].zero_point;

       for (int j = 0; j < N; ++j) {
-        Y_int32_[i * N + j] -= row_offset;
+        int quant_group = this->quantize_channelwise_ ? j : 0;
+        Y_int32_[i * N + j] -=
+            row_offset * this->filter_qparams_[quant_group].zero_point;
         if (!column_offsets_->empty()) {
           Y_int32_[i * N + j] -=
               in_qparams_[0].zero_point * (*column_offsets_)[j];
         }
         Ydata_float[i * N + j] = Y_int32_[i * N + j] * in_qparams_[0].scale *
-                in_qparams_[1].scale +
+                this->filter_qparams_[quant_group].scale +
             b_dequantized_data_[j];
       }
     }
@@ -248,10 +254,10 @@ bool FullyConnectedDNNLowPAcc16Op::RunOnDevice() {
           N,
           Y_int32_.data() + i * N,
           Ydata + i * N,
-          &requantization_params_.real_multiplier,
+          this->requantization_multipliers_.data(),
           out_qparams_.zero_point,
           column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
-          &in_qparams_[1].zero_point,
+          this->filter_zero_points_.data(),
           &row_offset,
           column_offsets_->empty() ? nullptr : column_offsets_->data(),
           b_quantized_->data(),
diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc
index 8d0579b..c19f43e 100644
--- a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc
+++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc
@@ -29,6 +29,9 @@ FullyConnectedDNNLowPOp<T>::FullyConnectedDNNLowPOp(
     : BaseType(operator_def, ws),
       axis_(this->template GetSingleArgument<int32_t>("axis", 1)),
       axis_w_(this->template GetSingleArgument<int32_t>("axis_w", 1)),
+      quantize_channelwise_(this->template GetSingleArgument<bool>(
+          "quantize_channelwise",
+          false)),
       b_quantized_(make_shared<vector<int32_t>>()),
       column_offsets_(make_shared<vector<int32_t>>()),
       is_weight_constant_(
@@ -37,6 +40,10 @@ FullyConnectedDNNLowPOp<T>::FullyConnectedDNNLowPOp(
     LOG(INFO) << operator_def.output(0) << " is_weight_constant "
               << is_weight_constant_;
   }
+  if (this->debug_def().engine() == "DNNLOWP_ROWWISE" ||
+      this->debug_def().engine() == "DNNLOWP_ROWWISE_16") {
+    quantize_channelwise_ = true;
+  }
   VLOG(2) << "DNNLOWP FC with output " << operator_def.output(0);
 }

@@ -157,7 +164,7 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
       Y_int32_.resize(Y->size());

       DoNothing<> doNothingObj{};
-      if (in_qparams_[1].zero_point) {
+      if (quantize_channelwise_ || filter_qparams_[0].zero_point) {
         row_offsets_.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
         X_pack_buf_.resize(PackAWithRowOffset<uint8_t>::packedBufferSize());
@@ -171,27 +178,54 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
             1, // group
             row_offsets_.data());

-        ReQuantizeOutput<false /* FUSE_RELU */> outputProcObj(
-            doNothingObj,
-            &requantization_params_.real_multiplier,
-            out_qparams_.zero_point,
-            column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
-            &in_qparams_[1].zero_point,
-            packA.getRowOffsetBuffer(),
-            column_offsets_->empty() ? nullptr : column_offsets_->data(),
-            b_quantized_data_,
-            N); // ncols per quant group
-
-        fbgemmPacked(
-            packA,
-            *Wq_packed_,
-            reinterpret_cast<uint8_t*>(
-                OutputTensorCPU_(0)->template mutable_data<T>()),
-            Y_int32_.data(),
-            N,
-            outputProcObj,
-            0, // thread_id
-            1); // num_threads
+        if (quantize_channelwise_) {
+          ReQuantizeOutput<
+              false /* FUSE_RELU */,
+              QuantizationGranularity::OUT_CHANNEL>
+              outputProcObj(
+                  doNothingObj,
+                  requantization_multipliers_.data(),
+                  out_qparams_.zero_point,
+                  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
+                  filter_zero_points_.data(),
+                  packA.getRowOffsetBuffer(),
+                  column_offsets_->empty() ? nullptr : column_offsets_->data(),
+                  b_quantized_data_,
+                  N);
+
+          fbgemmPacked(
+              packA,
+              *Wq_packed_,
+              reinterpret_cast<uint8_t*>(
+                  OutputTensorCPU_(0)->template mutable_data<T>()),
+              Y_int32_.data(),
+              N,
+              outputProcObj,
+              0, // thread_id
+              1); // num_threads
+        } else {
+          ReQuantizeOutput<false /* FUSE_RELU */> outputProcObj(
+              doNothingObj,
+              requantization_multipliers_.data(),
+              out_qparams_.zero_point,
+              column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
+              filter_zero_points_.data(),
+              packA.getRowOffsetBuffer(),
+              column_offsets_->empty() ? nullptr : column_offsets_->data(),
+              b_quantized_data_,
+              N);
+
+          fbgemmPacked(
+              packA,
+              *Wq_packed_,
+              reinterpret_cast<uint8_t*>(
+                  OutputTensorCPU_(0)->template mutable_data<T>()),
+              Y_int32_.data(),
+              N,
+              outputProcObj,
+              0, // thread_id
+              1); // num_threads
+        }
       } else {
         X_pack_buf_.resize(PackAMatrix<uint8_t>::packedBufferSize());
@@ -206,14 +240,14 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {

         ReQuantizeOutput<false /* FUSE_RELU */> outputProcObj(
             doNothingObj,
-            &requantization_params_.real_multiplier,
+            requantization_multipliers_.data(),
             out_qparams_.zero_point,
             column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
-            &in_qparams_[1].zero_point,
+            filter_zero_points_.data(),
             nullptr,
             column_offsets_->empty() ? nullptr : column_offsets_->data(),
             b_quantized_data_,
-            N); // ncols per quant group
+            N);

         fbgemmPacked(
             packA,
@@ -249,26 +283,53 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
             row_offsets_.data());

         DoNothing<float, float> doNothingObj{};
-        ReQuantizeForFloat<false /* FUSE_RELU*/> outputProcObj(
-            doNothingObj,
-            in_qparams_[0].scale,
-            &in_qparams_[1].scale,
-            column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
-            &in_qparams_[1].zero_point,
-            packA.getRowOffsetBuffer(),
-            column_offsets_->empty() ? nullptr : column_offsets_->data(),
-            b_dequantized_data_, // bias
-            N); // ncols per quant group
-        fbgemmPacked(
-            packA,
-            *Wq_packed_,
-            Y_data,
-            reinterpret_cast<int32_t*>(Y_data),
-            N,
-            outputProcObj,
-            0, // thread_id
-            1); // num_threads
+        if (quantize_channelwise_) {
+          ReQuantizeForFloat<
+              false /* FUSE_RELU*/,
+              QuantizationGranularity::OUT_CHANNEL>
+              outputProcObj(
+                  doNothingObj,
+                  in_qparams_[0].scale,
+                  filter_scales_.data(),
+                  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
+                  filter_zero_points_.data(),
+                  packA.getRowOffsetBuffer(),
+                  column_offsets_->empty() ? nullptr : column_offsets_->data(),
+                  b_dequantized_data_, // bias
+                  N);
+
+          fbgemmPacked(
+              packA,
+              *Wq_packed_,
+              Y_data,
+              reinterpret_cast<int32_t*>(Y_data),
+              N,
+              outputProcObj,
+              0, // thread_id
+              1); // num_threads
+        } else {
+          ReQuantizeForFloat<false /* FUSE_RELU*/> outputProcObj(
+              doNothingObj,
+              in_qparams_[0].scale,
+              filter_scales_.data(),
+              column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
+              filter_zero_points_.data(),
+              packA.getRowOffsetBuffer(),
+              column_offsets_->empty() ? nullptr : column_offsets_->data(),
+              b_dequantized_data_, // bias
+              N);
+
+          fbgemmPacked(
+              packA,
+              *Wq_packed_,
+              Y_data,
+              reinterpret_cast<int32_t*>(Y_data),
+              N,
+              outputProcObj,
+              0, // thread_id
+              1); // num_threads
+        }
       } else {
         // Input quantized and output float
         row_offsets_.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
@@ -284,26 +345,53 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
             row_offsets_.data());

         DoNothing<float, float> doNothingObj{};
-        ReQuantizeForFloat<false /* FUSE_RELU*/> outputProcObj(
-            doNothingObj,
-            in_qparams_[0].scale,
-            &in_qparams_[1].scale,
-            column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
-            &in_qparams_[1].zero_point,
-            packA.getRowOffsetBuffer(),
-            column_offsets_->empty() ? nullptr : column_offsets_->data(),
-            b_dequantized_data_, // bias
-            N); // ncols per quant group
-        fbgemmPacked(
-            packA,
-            *Wq_packed_,
-            Y_data,
-            reinterpret_cast<int32_t*>(Y_data),
-            N,
-            outputProcObj,
-            0, // thread_id
-            1); // num_threads
+        if (quantize_channelwise_) {
+          ReQuantizeForFloat<
+              false /* FUSE_RELU*/,
+              QuantizationGranularity::OUT_CHANNEL>
+              outputProcObj(
+                  doNothingObj,
+                  in_qparams_[0].scale,
+                  filter_scales_.data(),
+                  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
+                  filter_zero_points_.data(),
+                  packA.getRowOffsetBuffer(),
+                  column_offsets_->empty() ? nullptr : column_offsets_->data(),
+                  b_dequantized_data_, // bias
+                  N);
+
+          fbgemmPacked(
+              packA,
+              *Wq_packed_,
+              Y_data,
+              reinterpret_cast<int32_t*>(Y_data),
+              N,
+              outputProcObj,
+              0, // thread_id
+              1); // num_threads
+        } else {
+          ReQuantizeForFloat<false /* FUSE_RELU*/> outputProcObj(
+              doNothingObj,
+              in_qparams_[0].scale,
+              filter_scales_.data(),
+              column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
+              filter_zero_points_.data(),
+              packA.getRowOffsetBuffer(),
+              column_offsets_->empty() ? nullptr : column_offsets_->data(),
+              b_dequantized_data_, // bias
+              N);
+
+          fbgemmPacked(
+              packA,
+              *Wq_packed_,
+              Y_data,
+              reinterpret_cast<int32_t*>(Y_data),
+              N,
+              outputProcObj,
+              0, // thread_id
+              1); // num_threads
+        }
       }
     } // dequantize_output
   } else {
@@ -361,16 +449,17 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
       for (int k = 0; k < K; ++k) {
         row_offset += Xdata[i * K + k];
       }
-      row_offset *= in_qparams_[1].zero_point;

       for (int j = 0; j < N; ++j) {
         if (!column_offsets_->empty()) {
           Y_int32_[i * N + j] -=
               in_qparams_[0].zero_point * (*column_offsets_)[j];
         }
-        Y_int32_[i * N + j] -= row_offset;
+        int quant_group = quantize_channelwise_ ? j : 0;
+        Y_int32_[i * N + j] -=
+            row_offset * filter_qparams_[quant_group].zero_point;
         Ydata[i * N + j] = Y_int32_[i * N + j] * in_qparams_[0].scale *
-                in_qparams_[1].scale +
+                filter_qparams_[quant_group].scale +
             b_dequantized_data_[j];
       }
     }
@@ -383,7 +472,6 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
       for (int k = 0; k < K; ++k) {
         row_offset += Xdata[i * K + k];
       }
-      row_offset *= in_qparams_[1].zero_point;

       for (int j = 0; j < N; ++j) {
         if (!column_offsets_->empty()) {
@@ -391,11 +479,13 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
           Y_int32_[i * N + j] -=
               in_qparams_[0].zero_point * (*column_offsets_)[j];
         }
-        Y_int32_[i * N + j] -= row_offset;
+        int quant_group = quantize_channelwise_ ? j : 0;
+        Y_int32_[i * N + j] -=
+            row_offset * filter_qparams_[quant_group].zero_point;
         Y_int32_[i * N + j] += b_quantized_data_[j];

         Ydata[i * N + j] = fbgemm::Requantize<T>(
-            Y_int32_[i * N + j], requantization_params_);
+            Y_int32_[i * N + j], requantization_params_[quant_group]);
       }
     }
   }
@@ -459,13 +549,25 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
     if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
       const auto& packed_filter =
           this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
-      CAFFE_ENFORCE_EQ(packed_filter.qparams.size(), 1);
-      in_qparams_[1] = packed_filter.qparams[0];
+      filter_qparams_ = packed_filter.qparams;
+      if (quantize_channelwise_) {
+        CAFFE_ENFORCE_EQ(filter_qparams_.size(), N);
+      } else {
+        CAFFE_ENFORCE_EQ(filter_qparams_.size(), 1);
+      }
     } else {
-      vector<TensorQuantizationParams> temp_qparams(1);
+      filter_qparams_.resize(quantize_channelwise_ ? N : 1);
       QuantizeWeight<T>(
-          InputBlob(1), K, N, temp_qparams, W_quantized_, qfactory_.get());
-      in_qparams_[1] = temp_qparams[0];
+          InputBlob(1), K, N, filter_qparams_, W_quantized_, qfactory_.get());
+    }
+
+    filter_scales_.resize(filter_qparams_.size());
+    filter_zero_points_.resize(filter_qparams_.size());
+    requantization_params_.resize(filter_qparams_.size());
+    requantization_multipliers_.resize(filter_qparams_.size());
+    for (int i = 0; i < filter_qparams_.size(); ++i) {
+      filter_scales_[i] = filter_qparams_[i].scale;
+      filter_zero_points_[i] = filter_qparams_[i].zero_point;
     }

     if (fast_path) {
@@ -481,8 +583,7 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
             N,
             W.raw_data(),
             reinterpret_cast<const int8_t*>(W_quantized_.data()),
-            K, // ld
-            in_qparams_[1].zero_point);
+            K); // ld
       }
     } else {
       string reason;
@@ -504,16 +605,17 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
   } // is_weight_constant_
   else {
     // !is_weight_constant_
-    in_qparams_[1] = GetInputTensorQuantizationParamsOf(
+    filter_qparams_.resize(1);
+    filter_qparams_[0] = GetInputTensorQuantizationParamsOf(
         this, 1, qfactory_.get(), true /*weight*/);
-    in_qparams_[1].zero_point += signed_min;
+    filter_qparams_[0].zero_point += signed_min;

     W_quantized_.resize(W.size());
     fbgemm::Quantize<T_signed>(
         W.template data<float>(),
         W_quantized_.data(),
         W_quantized_.size(),
-        in_qparams_[1]);
+        filter_qparams_[0]);
   }

   if (VLOG_IS_ON(3)) {
@@ -535,10 +637,8 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
           this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
       column_offsets_ = packed_filter.column_offsets;
     } else {
-      vector<TensorQuantizationParams> temp_qparams;
-      temp_qparams.push_back(in_qparams_[1]);
       ComputeColumnOffsets(
-          K, N, W_quantized_.data(), temp_qparams, *column_offsets_);
+          K, N, W_quantized_.data(), filter_qparams_, *column_offsets_);
     }
   }
   if (VLOG_IS_ON(3)) {
@@ -570,7 +670,7 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
       CAFFE_ENFORCE_LE(
           std::abs(
               bias_qparams.scale -
-              in_qparams_[0].scale * in_qparams_[1].scale),
+              in_qparams_[0].scale * filter_qparams_[0].scale),
           1e-4);
       CAFFE_ENFORCE_EQ(bias_qparams.zero_point, 0);
       b_quantized_data_ = bias.template data<int32_t>();
@@ -590,7 +690,7 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
         (*b_quantized_)[j] = fbgemm::Quantize<int32_t>(
             b_dequantized_data_[j],
             0,
-            in_qparams_[0].scale * in_qparams_[1].scale,
+            in_qparams_[0].scale * filter_qparams_[0].scale,
             32);
       }
       b_quantized_data_ = b_quantized_->data();
@@ -615,11 +715,9 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
           this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
       column_offset_ptr = packed_filter.column_offsets.get();
     } else {
-      vector<TensorQuantizationParams> temp_qparams;
-      temp_qparams.push_back(in_qparams_[1]);
       column_offset_temp.resize(N);
       ComputeColumnOffsets(
-          K, N, W_quantized_.data(), temp_qparams, column_offset_temp);
+          K, N, W_quantized_.data(), filter_qparams_, column_offset_temp);
       column_offset_ptr = &column_offset_temp;
     }
     for (int i = 0; i < N; ++i) {
@@ -648,10 +746,14 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
   if (!dequantize_output_ && !requantization_param_selected_) {
     GetOutputQuantizationParams_();

-    float real_multiplier =
-        in_qparams_[0].scale * in_qparams_[1].scale / out_qparams_.scale;
-    requantization_params_ = qfactory_->ChooseRequantizationMultiplier(
-        real_multiplier, out_qparams_);
+    for (int i = 0; i < filter_qparams_.size(); ++i) {
+      float real_multiplier =
+          in_qparams_[0].scale * filter_qparams_[i].scale / out_qparams_.scale;
+      requantization_params_[i] = qfactory_->ChooseRequantizationMultiplier(
+          real_multiplier, out_qparams_);
+      requantization_multipliers_[i] =
+          requantization_params_[i].real_multiplier;
+    }
     requantization_param_selected_ = true;
   } else {
     if (measure_quantization_error_) {
@@ -684,4 +786,18 @@ REGISTER_CPU_OPERATOR_WITH_ENGINE(
     DNNLOWP,
     FullyConnectedDNNLowPOp<uint8_t>);

+REGISTER_CPU_OPERATOR_WITH_ENGINE(
+    FC,
+    DNNLOWP_ROWWISE,
+    FullyConnectedDNNLowPOp<uint8_t>);
+REGISTER_CPU_OPERATOR_WITH_ENGINE(
+    FC,
+    DNNLOWP_ROWWISE_16,
+    FullyConnectedDNNLowPOp<uint16_t>);
+
+REGISTER_CPU_OPERATOR_WITH_ENGINE(
+    Int8FC,
+    DNNLOWP_ROWWISE,
+    FullyConnectedDNNLowPOp<uint8_t>);
+
 } // namespace caffe2
diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.h b/caffe2/quantization/server/fully_connected_dnnlowp_op.h
index a0d2d50..39c6db6 100644
--- a/caffe2/quantization/server/fully_connected_dnnlowp_op.h
+++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.h
@@ -23,7 +23,7 @@ class FullyConnectedDNNLowPOp
   std::size_t axis_w_{1};
   vector<int64_t> Y_shape_cache_;

-  dnnlowp::RequantizationParams requantization_params_;
+  std::vector<dnnlowp::RequantizationParams> requantization_params_;
   bool requantization_param_selected_{false};

   // x86 only provides SIMD instructions that multiply a signed integer with an
@@ -35,6 +35,12 @@ class FullyConnectedDNNLowPOp
   std::vector<std::uint8_t> X_pack_buf_;
   std::vector<std::int32_t> Y_int32_;
+  std::vector<dnnlowp::TensorQuantizationParams> filter_qparams_;
+  std::vector<float> filter_scales_;
+  std::vector<std::int32_t> filter_zero_points_;
+
+  std::vector<float> requantization_multipliers_;
+  bool quantize_channelwise_;

   // used in slow path for T != uint8_t
   std::vector<T_signed> W_quantized_;
diff --git a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op.cc b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op.cc
deleted file mode 100644
index 92c2ea8..0000000
--- a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op.cc
+++ /dev/null
@@ -1,401 +0,0 @@
-#include "fully_connected_rowwise_dnnlowp_op.h"
-
-#include <chrono>
-#include <vector>
-
-#include "fbgemm_pack_op.h"
-
-namespace caffe2 {
-
-using namespace std;
-
-template <typename T>
-FullyConnectedRowWiseDNNLowPOp<T>::FullyConnectedRowWiseDNNLowPOp(
-    const OperatorDef& operator_def,
-    Workspace* ws)
-    : BaseType(operator_def, ws),
-      axis_(this->template GetSingleArgument<int32_t>("axis", 1)),
-      axis_w_(this->template GetSingleArgument<int32_t>("axis_w", 1)),
-      b_quantized_(make_shared<vector<int32_t>>()),
-      column_offsets_(make_shared<vector<int32_t>>()),
-      is_weight_constant_(
-          this->template GetSingleArgument<bool>("constant_weight", true)) {
-  using namespace dnnlowp;
-  LOG(INFO) << "Using Rowwise Quantization!";
-  if (!is_weight_constant_) {
-    LOG(INFO) << operator_def.output(0) << " is_weight_constant "
-              << is_weight_constant_;
-    LOG(FATAL) << "rowwise quantization doesn't support nonconstant weights";
-  }
-}
-
-template <typename T>
-bool FullyConnectedRowWiseDNNLowPOp<T>::RunOnDevice() {
-  using namespace std;
-  using namespace dnnlowp;
-
-  this->ParseDNNLowPOperatorArguments_();
-
-  chrono::time_point<chrono::system_clock> t_very_begin, t_begin, t_end;
-
-  if (VLOG_IS_ON(3)) {
-    t_begin = chrono::system_clock::now();
-    t_very_begin = t_begin;
-  }
-
-  // Get quantization parameters
-  if (!GetQuantizationParameters_()) {
-    return false;
-  }
-
-  if (VLOG_IS_ON(3)) {
-    t_end = chrono::system_clock::now();
-    double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3
-            << " ms";
-  }
-  const auto& X = InputTensorCPU_(0);
-  const auto& W = InputTensorCPU_(1);
-  auto* Y = OutputTensorCPU_(0);
-  const auto canonical_axis = X.canonical_axis_index(axis_);
-  const auto M = X.size_to_dim(canonical_axis);
-  const auto K = X.size_from_dim(canonical_axis);
-  const auto canonical_axis_w = W.canonical_axis_index(axis_w_);
-  const int N = W.size_to_dim(canonical_axis_w);
-  const auto& b = InputTensorCPU_(2);
-
-  Y_shape_cache_ = X.sizes().vec();
-  Y_shape_cache_.resize(canonical_axis + 1);
-  Y_shape_cache_[canonical_axis] = N;
-  Y->Resize(Y_shape_cache_);
-  Y_int32_.resize(Y->size());
-  // Quantize X
-  vector<T> X_temp;
-  if (VLOG_IS_ON(3)) {
-    t_begin = chrono::system_clock::now();
-  }
-  const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);
-  if (VLOG_IS_ON(3)) {
-    t_end = chrono::system_clock::now();
-    double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3
-            << " ms";
-  }
-
-  const T_signed* Wdata = W_quantized_.data();
-
-  if (VLOG_IS_ON(3)) {
-    t_begin = chrono::system_clock::now();
-  }
-  if (Wq_packed_) {
-    // fast path using fbgemm
-    using namespace fbgemm;
-    int row_offset_size_per_thread = M;
-    int x_pack_buf_size_per_thread = PackAMatrix<uint8_t>::packedBufferSize();
-    row_offsets_.resize(row_offset_size_per_thread);
-    X_pack_buf_.resize(x_pack_buf_size_per_thread);
-
-    DoNothing<int32_t, int32_t> doNothingObj{};
-    memCopy<> memCopyObj(doNothingObj);
-
-    PackAMatrix<uint8_t> packA(
-        matrix_op_t::NoTranspose,
-        M,
-        K,
-        reinterpret_cast<const uint8_t*>(Xdata),
-        K,
-        X_pack_buf_.data(),
-        1); // group
-
-    fbgemmPacked(
-        packA,
-        *Wq_packed_,
-        Y_int32_.data(),
-        Y_int32_.data(),
-        N,
-        memCopyObj,
-        0, // thread_id
-        1); // num_threads
-
-    if (VLOG_IS_ON(3)) {
-      t_end = chrono::system_clock::now();
-      double dt = chrono::duration<double>(t_end - t_begin).count();
-      VLOG(3) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
-
-      t_begin = chrono::system_clock::now();
-    }
-
-    row_offsets_u8acc32_ref(
-        M, K, K, reinterpret_cast<const uint8_t*>(Xdata), row_offsets_.data());
-
-    // Requantization
-    // TODO: implement row-wise requantization output pipeline
-    if (dequantize_output_) {
-      const float* b_data = b.template data<float>();
-      float* Ydata = OutputTensorCPU_(0)->template mutable_data<float>();
-      for (int i = 0; i < M; ++i) {
-        for (int j = 0; j < N; ++j) {
-          Y_int32_[i * N + j] -=
-              in_qparams_[0].zero_point * (*column_offsets_)[j] +
-              rowwise_qparams_[j].zero_point * row_offsets_[i];
-          Y_int32_[i * N + j] += (*b_quantized_)[j];
-          Ydata[i * N + j] = Y_int32_[i * N + j] * rowwise_qparams_[j].scale *
-                  in_qparams_[0].scale +
-              b_data[j];
-        }
-      }
-    } else {
-      T* Ydata = GetQuantizedOutputData_();
-      for (int i = 0; i < M; ++i) {
-        for (int j = 0; j < N; ++j) {
-          Y_int32_[i * N + j] -=
-              in_qparams_[0].zero_point * (*column_offsets_)[j] +
-              rowwise_qparams_[j].zero_point * row_offsets_[i];
-          Y_int32_[i * N + j] += (*b_quantized_)[j];
-          Ydata[i * N + j] = Requantize<T>(
-              Y_int32_[i * N + j], rowwise_requantization_params_[j]);
-        }
-      }
-    }
-  } else {
-    // slow path
-    for (int i = 0; i < M; ++i) {
-      for (int j = 0; j < N; ++j) {
-        int32_t sum = 0;
-        for (int k = 0; k < K; ++k) {
-          int w = (int)Wdata[j * K + k];
-          sum += Xdata[i * K + k] * w;
-        }
-        Y_int32_[i * N + j] = sum;
-      } // for each column
-    } // for each row
-
-    if (VLOG_IS_ON(3)) {
-      t_end = chrono::system_clock::now();
-      double dt = chrono::duration<double>(t_end - t_begin).count();
-      VLOG(3) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
-
-      t_begin = chrono::system_clock::now();
-    }
-
-    // Requantization
-    if (dequantize_output_) {
-      const float* b_data = b.template data<float>();
-      float* Ydata = OutputTensorCPU_(0)->template mutable_data<float>();
-      for (int i = 0; i < M; ++i) {
-        int32_t row_offset = 0;
-        for (int k = 0; k < K; ++k) {
-          row_offset += (int)Xdata[i * K + k];
-        }
-        for (int j = 0; j < N; ++j) {
-          Y_int32_[i * N + j] -=
-              in_qparams_[0].zero_point * (*column_offsets_)[j] +
-              rowwise_qparams_[j].zero_point * row_offset;
-          Ydata[i * N + j] = Y_int32_[i * N + j] * rowwise_qparams_[j].scale *
-                  in_qparams_[0].scale +
-              b_data[j];
-        }
-      }
-    } else {
-      T* Ydata = GetQuantizedOutputData_();
-      for (int i = 0; i < M; ++i) {
-        int32_t row_offset = 0;
-        for (int k = 0; k < K; ++k) {
-          row_offset += (int)Xdata[i * K + k];
-        }
-        for (int j = 0; j < N; ++j) {
-          Y_int32_[i * N + j] -=
-              in_qparams_[0].zero_point * (*column_offsets_)[j] +
-              rowwise_qparams_[j].zero_point * row_offset;
-          Y_int32_[i * N + j] += (*b_quantized_)[j];
-          Ydata[i * N + j] = fbgemm::Requantize<T>(
-              Y_int32_[i * N + j], rowwise_requantization_params_[j]);
-        }
-      }
-    }
-  }
-
-  if (!dequantize_output_) {
-    RunOnDeviceEpilogue_();
-  } else {
-    this->MeasureQuantizationError_();
-  }
-
-  if (VLOG_IS_ON(3)) {
-    t_end = chrono::system_clock::now();
-    double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this
-            << " bias-offset-requantization: " << dt * 1e3 << " ms";
-
-    t_end = chrono::system_clock::now();
-    double ops = 2. * M * N * K;
-    dt = chrono::duration<double>(t_end - t_very_begin).count();
-    double gops = ops / dt / 1e9;
-    VLOG(3) << "@PERF this=" << this
-            << " output=" << this->debug_def().output(0) << " " << M << "x" << N
-            << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
-  }
-
-  return true;
-}
-
-template <typename T>
-bool FullyConnectedRowWiseDNNLowPOp<T>::GetQuantizationParameters_() {
-  using namespace dnnlowp;
-
-  in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
-
-  const auto& W = InputTensorCPU_(1);
-  const auto canonical_axis_w = W.canonical_axis_index(axis_w_);
-  const auto N = W.size_to_dim(canonical_axis_w);
-  const auto K = W.size_from_dim(canonical_axis_w);
-  bool fast_path = is_same<T, uint8_t>::value && GetCpuId().avx2();
-  if (is_weight_constant_) {
-    if ((fast_path && !Wq_packed_) || (!fast_path && W_quantized_.empty())) {
-      LOG(INFO) << "Choose rowwise quantization params";
-      if (rowwise_qparams_.empty()) {
-        if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
-          const auto& packed_filter =
-              this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
-          CAFFE_ENFORCE_EQ(packed_filter.qparams.size(), N);
-          // TODO: optimize the overhead of copy
-          rowwise_qparams_ = packed_filter.qparams;
-        } else {
-          // choose rowwise quantization params
-          if (this->template InputIsType<int8::Int8TensorCPU>(1)) {
-            static int log_occurences = 0;
-            if (log_occurences < 32) {
-              ++log_occurences;
-              LOG(WARNING) << "Cannot do row-wise quantization for "
-                              "pre-quantized weight "
-                           << this->debug_def().input(1);
-            }
-          }
-          rowwise_qparams_.resize(N);
-          QuantizeWeight<T>(
-              InputBlob(1),
-              K,
-              N,
-              rowwise_qparams_,
-              W_quantized_,
-              qfactory_.get());
-        }
-      }
-      if (fast_path) {
-        // fast path using fbgemm
-        LOG(INFO)
-            << "Using fast path with int8 fbgemm and generating Wq_packed_";
-        if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
-          const auto& packed_filter =
-              this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
-          Wq_packed_ = packed_filter.W;
-        } else {
-          Wq_packed_.reset(new fbgemm::PackBMatrix<int8_t>(
-              fbgemm::matrix_op_t::Transpose,
-              K,
-              N,
-              reinterpret_cast<const int8_t*>(W_quantized_.data()),
-              K, // ld
-              nullptr, // pmat
-              1)); // groups
-        }
-      } else {
-        LOG(WARNING)
-            << "Falling back to slow path because fbgemm doesn't support "
-               "this type or shape";
-      }
-    }
-  } else {
-    // !is_weigtht_constant
-    LOG(WARNING) << "Not supporting nonconstant weights";
-    in_qparams_[1] =
-        GetInputTensorQuantizationParamsOf(this, 1, qfactory_.get());
-    fbgemm::Quantize<T_signed>(
-        W.template data<float>(),
-        W_quantized_.data(),
-        W_quantized_.size(),
-        in_qparams_[1]);
-    if (rowwise_qparams_.empty()) {
-      rowwise_qparams_.resize(N);
-      for (int i = 0; i < N; ++i) {
-        rowwise_qparams_[i] = in_qparams_[1];
-      }
-    }
-  }
-
-  if (!is_weight_constant_ || column_offsets_->empty()) {
-    if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
-      const auto& packed_filter =
-          this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
-      column_offsets_ = packed_filter.column_offsets;
-    } else {
-      ComputeColumnOffsets(
-          K, N, W_quantized_.data(), rowwise_qparams_, *column_offsets_);
-    }
-  }
-
-  if (Wq_packed_) {
-    vector<T_signed>().swap(W_quantized_);
-  }
-  if (!is_weight_constant_ || b_quantized_->empty()) {
-    // Quantize bias
-    if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(2) &&
-        this->template Input<Int8FCDNNLowPPackedWeightBlob>(2).bias.get()) {
-      const auto& packed_filter =
-          this->template Input<Int8FCDNNLowPPackedWeightBlob>(2);
-      CAFFE_ENFORCE(!dequantize_output_);
-      b_quantized_ = packed_filter.bias;
-    } else {
-      b_quantized_->resize(N);
-      const auto& b = InputTensorCPU_(2);
-      const float* b_data = b.template data<float>();
-      for (int j = 0; j < N; ++j) {
-        (*b_quantized_)[j] = fbgemm::Quantize<int32_t>(
-            b_data[j], 0, in_qparams_[0].scale * rowwise_qparams_[j].scale, 32);
-      }
-    }
-  }
-  if (!dequantize_output_) {
-    GetOutputQuantizationParams_();
-
-    if (rowwise_requantization_params_.empty()) {
-      // Choose requantization params
-      rowwise_requantization_params_.resize(N);
-      for (int i = 0; i < N; ++i) {
-        float real_multiplier = in_qparams_[0].scale *
-            rowwise_qparams_[i].scale / out_qparams_.scale;
-        rowwise_requantization_params_[i] =
-            qfactory_->ChooseRequantizationMultiplier(
-                real_multiplier, out_qparams_);
-      }
-    }
-  } else {
-    if (measure_quantization_error_) {
-      // to measure quantization error, run ref impl.
-      Fp32Op_()->DequantizeInput();
-      Fp32Op_()->Get()->RunOnDevice();
-    }
-  }
-
-  return true;
-}
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(
-    FC,
-    DNNLOWP_ROWWISE,
-    FullyConnectedRowWiseDNNLowPOp<uint8_t>);
-REGISTER_CPU_OPERATOR_WITH_ENGINE(
-    FC,
-    DNNLOWP_ROWWISE_16,
-    FullyConnectedRowWiseDNNLowPOp<uint16_t>);
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(
-    Int8FC,
-    DNNLOWP_ROWWISE,
-    FullyConnectedRowWiseDNNLowPOp<uint8_t>);
-REGISTER_CPU_OPERATOR_WITH_ENGINE(
-    Int8FCRowWise,
-    DNNLOWP,
-    FullyConnectedRowWiseDNNLowPOp<uint8_t>);
-
-} // namespace caffe2
diff --git a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op.h b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op.h
deleted file mode 100644
index c756f98..0000000
--- a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#pragma once
-
-#include "caffe2/operators/fully_connected_op.h"
-#include "caffe2/quantization/server/dnnlowp_op.h"
-#include "fbgemm/Fbgemm.h"
-
-namespace caffe2 {
-
-template <typename T>
-class FullyConnectedRowWiseDNNLowPOp final
-    : public DNNLowPOp<T, FullyConnectedOp<CPUContext>> {
- public:
-  FullyConnectedRowWiseDNNLowPOp(
-      const OperatorDef& operator_def,
-      Workspace* ws);
-  bool RunOnDevice() override;
-
-  USE_OPERATOR_FUNCTIONS(CPUContext);
-  USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, FullyConnectedOp);
-
- private:
-  bool GetQuantizationParameters_();
-
-  std::size_t axis_{1};
-  std::size_t axis_w_{1};
-  vector<int64_t> Y_shape_cache_;
-
-  std::vector<dnnlowp::RequantizationParams> rowwise_requantization_params_;
-  std::vector<dnnlowp::TensorQuantizationParams> rowwise_qparams_;
-
-  using T_signed = typename std::make_signed<T>::type;
-
-  // used in fast path for T == uint8_t
-  std::shared_ptr<fbgemm::PackBMatrix<std::int8_t>> Wq_packed_;
-  std::vector<std::uint8_t> X_pack_buf_;
-
-  // used in slow path for T != uint8_t
-  std::vector<T_signed> W_quantized_;
-  std::shared_ptr<std::vector<std::int32_t>> b_quantized_;
-
-  std::shared_ptr<std::vector<std::int32_t>> column_offsets_;
-  std::vector<std::int32_t> row_offsets_;
-  std::vector<std::int32_t> Y_int32_;
-
-  bool is_weight_constant_ = true;
-  bool rowwise_weight_quantization_ = true;
-}; // class FullyConnectedRowWiseDNNLowPOp
-
-} // namespace caffe2
diff --git a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py
index 65b17fc..14a926e 100644
--- a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py
+++ b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py
@@ -80,6 +80,9 @@ class RowWiseDNNLowPFullyConnectedOpTest(hu.HypothesisTestCase):
                 W_max,
             )

+            if i % 2 == 0:
+                W[i, :] = (W[i, :] - W_min) * 2 + W_min
+
         b = np.random.randn(output_channels).astype(np.float32)

         Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
@@ -90,7 +93,6 @@ class RowWiseDNNLowPFullyConnectedOpTest(hu.HypothesisTestCase):
             ("FC", "DNNLOWP_ROWWISE"),
            ("FC", "DNNLOWP_ROWWISE_16"),
             ("Int8FC", "DNNLOWP_ROWWISE"),
-            ("Int8FCRowWise", "DNNLOWP"),
         ]

         for op_type, engine in op_engine_list:
--
2.7.4
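
Editor's note (appended after the patch, not part of it): after this merge, rowwise quantization is simply the per-output-channel code path of the regular DNNLowP FC operator. It can be requested either through the DNNLOWP_ROWWISE / DNNLOWP_ROWWISE_16 engine names (which now just force quantize_channelwise_ = true, as the constructor change above shows) or through the new quantize_channelwise argument. The sketch below shows the two equivalent spellings; it is a minimal example assuming a Caffe2 build that registers these quantization-server operators, and the blob names, shapes, and the dequantize_output argument are illustrative only.

```python
# Hedged sketch: two equivalent ways to request rowwise (per-output-channel)
# FC quantization after this patch. Assumes a Caffe2 build with the
# caffe2/quantization/server operators compiled in.
import numpy as np
from caffe2.python import core, workspace

workspace.FeedBlob("X", np.random.rand(4, 16).astype(np.float32))
workspace.FeedBlob("W", np.random.rand(8, 16).astype(np.float32))
workspace.FeedBlob("b", np.random.randn(8).astype(np.float32))

# Legacy spelling: the rowwise engine, now an alias that flips
# quantize_channelwise_ inside FullyConnectedDNNLowPOp.
fc_rowwise = core.CreateOperator(
    "FC", ["X", "W", "b"], ["Y_rowwise"],
    engine="DNNLOWP_ROWWISE", dequantize_output=1,
)

# Equivalent spelling: the plain DNNLOWP engine with the new
# quantize_channelwise argument takes the same code path.
fc_channelwise = core.CreateOperator(
    "FC", ["X", "W", "b"], ["Y_channelwise"],
    engine="DNNLOWP", quantize_channelwise=1, dequantize_output=1,
)

workspace.RunOperatorOnce(fc_rowwise)
workspace.RunOperatorOnce(fc_channelwise)

# Both spellings run the same kernel, so the outputs should match.
np.testing.assert_allclose(
    workspace.FetchBlob("Y_rowwise"), workspace.FetchBlob("Y_channelwise")
)
```

The same holds for Int8FC with engine DNNLOWP_ROWWISE, registered at the end of fully_connected_dnnlowp_op.cc above. Note that the 16-bit-accumulation variant (DNNLOWP_ACC16) does not support per-channel quantization yet, as the new LOG(WARNING) in the acc16 operator states.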