From: Jongsoo Park
Date: Thu, 4 Apr 2019 05:50:05 +0000 (-0700)
Subject: fold col offset into bias; optimize A symmetric quant (#17026)
X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~427
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fa0ad057f8b0b16500a00ec874fcaa733e0287e3;p=platform%2Fupstream%2Fpytorch.git

fold col offset into bias; optimize A symmetric quant (#17026)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17026

D14013931 did this for FC. This diff applies similar optimizations to Conv.
A subtle difference is that in FC, once we fold col_offset into bias during
the pre-processing step, we can treat everything as if A_zero_offset == 0
(symmetric quantization of A). In Conv, we can't do this because padding
still needs to use the original A_zero_offset. From the requantization point
of view, once col_offset is folded into bias, we can treat the computation
as if we were doing symmetric A quantization. But steps that involve padding,
such as im2col, im2col fused with packing, and direct conv for
depth-wise/group convolution, still need to be passed the original
A_zero_offset.

Reviewed By: jianyuh

Differential Revision: D14020276

fbshipit-source-id: c29caefd1127bbc6aff0e9d535939bb0c1ecb66c
---

diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
index f356b42..454be17 100644
--- a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
+++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
@@ -491,9 +491,9 @@ static void conv_nhwc_acc16_ref_(
 }
 
 template <bool ReluFused>
-template <fbgemm::QuantizationGranularity Q_GRAN>
+template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
 void ConvDNNLowPAcc16Op<ReluFused>::DispatchFBGEMM_(
-    fbgemm::PackAWithRowOffset<uint8_t, int16_t>& packA,
+    PackAMatrix& packA,
     const uint8_t* col_buffer_data,
     vector<int32_t>* Y_int32,
     uint8_t* Y_uint8_data) {
@@ -513,10 +513,11 @@ void ConvDNNLowPAcc16Op<ReluFused>::DispatchFBGEMM_(
       doNothingObj,
       this->requantization_multipliers_.data(),
       out_qparams_.zero_point,
-      in_qparams_[INPUT].zero_point,
+      // column_offsets_ empty means column_offsets_ are folded into bias
+      this->column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
       this->filter_zero_points_.data(),
       packA.getRowOffsetBuffer(),
-      this->column_offsets_->data(),
+      this->column_offsets_->empty() ? nullptr : this->column_offsets_->data(),
       InputSize() == 3 ? this->b_quantized_data_ : nullptr,
       M,
       group_);
@@ -670,14 +671,21 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() {
   int row_offset_size_per_thread = -1;
   int x_pack_buf_size_per_thread = -1;
   if (Wq_acc16_packed_) {
-    row_offset_size_per_thread =
-        PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
-    x_pack_buf_size_per_thread =
-        PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
-    row_offsets_.resize(
-        dnnlowp_get_max_threads() * row_offset_size_per_thread);
-    X_pack_buf_.resize(
-        dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
+    if (!this->quantize_groupwise_ && this->filter_zero_points_[0] == 0) {
+      x_pack_buf_size_per_thread =
+          PackAMatrix<uint8_t, int16_t>::packedBufferSize();
+      X_pack_buf_.resize(
+          dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
+    } else {
+      row_offset_size_per_thread =
+          PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
+      x_pack_buf_size_per_thread =
+          PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
+      row_offsets_.resize(
+          dnnlowp_get_max_threads() * row_offset_size_per_thread);
+      X_pack_buf_.resize(
+          dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
+    }
   }
 
   uint8_t* Y_uint8_data = Y->template mutable_data<uint8_t>();
@@ -692,22 +700,50 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() {
         int tid = dnnlowp_get_thread_num();
 
         // no im2col fusion
-        PackAWithRowOffset<uint8_t, int16_t> packA(
-            matrix_op_t::NoTranspose,
-            N * output_image_size,
-            group_ * kernel_dim,
-            col_buffer_data,
-            group_ * kernel_dim,
-            X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
-            group_,
-            row_offsets_.data() + tid * row_offset_size_per_thread);
-
-        if (this->quantize_groupwise_) {
-          DispatchFBGEMM_<QuantizationGranularity::GROUP>(
-              packA, col_buffer_data, Y_int32, Y_uint8_data);
-        } else {
-          DispatchFBGEMM_<QuantizationGranularity::TENSOR>(
-              packA, col_buffer_data, Y_int32, Y_uint8_data);
-        }
+        if (!this->quantize_groupwise_ && this->filter_zero_points_[0] == 0) {
+          PackAMatrix<uint8_t, int16_t> packA(
+              matrix_op_t::NoTranspose,
+              N * output_image_size,
+              group_ * kernel_dim,
+              col_buffer_data,
+              group_ * kernel_dim,
+              X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+              group_);
+
+          if (this->quantize_groupwise_) {
+            DispatchFBGEMM_<
+                PackAMatrix<uint8_t, int16_t>,
+                QuantizationGranularity::GROUP>(
+                packA, col_buffer_data, Y_int32, Y_uint8_data);
+          } else {
+            DispatchFBGEMM_<
+                PackAMatrix<uint8_t, int16_t>,
+                QuantizationGranularity::TENSOR>(
+                packA, col_buffer_data, Y_int32, Y_uint8_data);
+          }
+        } else {
+          // no im2col fusion
+          PackAWithRowOffset<uint8_t, int16_t> packA(
+              matrix_op_t::NoTranspose,
+              N * output_image_size,
+              group_ * kernel_dim,
+              col_buffer_data,
+              group_ * kernel_dim,
+              X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+              group_,
+              row_offsets_.data() + tid * row_offset_size_per_thread);
+
+          if (this->quantize_groupwise_) {
+            DispatchFBGEMM_<
+                PackAWithRowOffset<uint8_t, int16_t>,
+                QuantizationGranularity::GROUP>(
+                packA, col_buffer_data, Y_int32, Y_uint8_data);
+          } else {
+            DispatchFBGEMM_<
+                PackAWithRowOffset<uint8_t, int16_t>,
+                QuantizationGranularity::TENSOR>(
+                packA, col_buffer_data, Y_int32, Y_uint8_data);
+          }
+        }
       } else {
         // slow path
diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op.h b/caffe2/quantization/server/conv_dnnlowp_acc16_op.h
index 39dbddc..848dbd5 100644
--- a/caffe2/quantization/server/conv_dnnlowp_acc16_op.h
+++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op.h
@@ -35,9 +35,9 @@ class ConvDNNLowPAcc16Op final : public ConvDNNLowPOp<uint8_t, ReluFused> {
   bool GetQuantizationParameters_();
 
-  template <fbgemm::QuantizationGranularity Q_GRAN>
+  template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
   void DispatchFBGEMM_(
-      fbgemm::PackAWithRowOffset<uint8_t, int16_t>& packA,
+      PackAMatrix& packA,
       const std::uint8_t* col_buffer_data,
       vector<std::int32_t>* Y_int32,
       uint8_t* Y_uint8_data);
diff --git a/caffe2/quantization/server/conv_dnnlowp_op.cc b/caffe2/quantization/server/conv_dnnlowp_op.cc
index 05c788e..635a63e 100644
--- a/caffe2/quantization/server/conv_dnnlowp_op.cc
+++ b/caffe2/quantization/server/conv_dnnlowp_op.cc
@@ -198,15 +198,20 @@ bool ConvDNNLowPOp<T, ReluFused>::NoIm2ColNHWC_() {
 
 template <typename T, bool ReluFused>
 void ConvDNNLowPOp<T, ReluFused>::PreComputeRowColumnOffsets_() {
+  if (this->order_ == StorageOrder::NHWC &&
+      this->template InputIsType<int8::Int8TensorCPU>(INPUT)) {
+    // If input tensor doesn't use dynamic quantization, we fold column_offsets_
+    // into bias.
+    return;
+  }
+
   const auto& filter = InputTensorCPU_(FILTER);
   int kernel_dim = KernelDim_();
   int M = filter.dim32(0);
 
   // Pre-compute row_offset / column_offset
   vector<int32_t>& offsets =
-      StorageOrder::NCHW == ConvPoolOpBase<CPUContext>::order_
-      ? row_offsets_
-      : *column_offsets_;
+      this->order_ == StorageOrder::NCHW ? row_offsets_ : *column_offsets_;
 
   if (offsets.empty()) {
     if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
@@ -235,7 +240,8 @@ void ConvDNNLowPOp<T, ReluFused>::QuantizeBias_() {
   // Quantize bias
   if (has_bias &&
       (!b_quantized_data_ ||
-       in_qparams_[INPUT].scale != in_qparams_scale_old_)) {
+       in_qparams_[INPUT].scale != in_qparams_scale_old_ ||
+       in_qparams_[INPUT].zero_point != in_qparams_zero_point_old_)) {
     if (has_packed_bias) {
       const auto& packed_filter =
           this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
@@ -273,10 +279,73 @@
       }
       b_quantized_data_ = b_quantized_->data();
     }
-    in_qparams_scale_old_ = in_qparams_[INPUT].scale;
   }
+  in_qparams_scale_old_ = in_qparams_[INPUT].scale;
+  in_qparams_zero_point_old_ = in_qparams_[INPUT].zero_point;
 
   CAFFE_ENFORCE(b_quantized_data_);
+
+  // If column_offsets_ is empty even when we need column_offsets (asymmetric
+  // quantization in input), it means we need to fuse column_offsets to bias.
+  if (this->order_ == StorageOrder::NHWC && in_qparams_[INPUT].zero_point &&
+      column_offsets_->empty()) {
+    if (b_quantized_->empty()) {
+      b_quantized_->assign(b_quantized_data_, b_quantized_data_ + M);
+      b_quantized_data_ = b_quantized_->data();
+    }
+
+    vector<int32_t>* column_offset_ptr;
+    vector<int32_t> column_offset_temp;
+    if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
+      const auto& packed_filter =
+          this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
+      column_offset_ptr = packed_filter.column_offsets.get();
+    } else {
+      vector<TensorQuantizationParams> temp_qparams;
+      temp_qparams.push_back(in_qparams_[1]);
+      column_offset_temp.resize(M);
+      ComputeColumnOffsets<T_signed>(
+          KernelDim_(),
+          M,
+          W_quantized_.data(),
+          filter_qparams_,
+          column_offset_temp);
+      column_offset_ptr = &column_offset_temp;
+    }
+    for (int i = 0; i < M; ++i) {
+      (*b_quantized_)[i] -=
+          in_qparams_[0].zero_point * (*column_offset_ptr)[i];
+    }
+  }
+
+  if (!has_bias && this->order_ == StorageOrder::NHWC &&
+      in_qparams_[INPUT].zero_point && column_offsets_->empty() &&
+      !b_quantized_data_) {
+    // no bias but create one filling with column offset values
+    b_quantized_->resize(M, 0);
+    b_quantized_data_ = b_quantized_->data();
+
+    vector<int32_t>* column_offset_ptr;
+    vector<int32_t> column_offset_temp;
+    if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
+      const auto& packed_filter =
+          this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
+      column_offset_ptr = packed_filter.column_offsets.get();
+    } else {
+      vector<TensorQuantizationParams> temp_qparams;
+      temp_qparams.push_back(in_qparams_[1]);
+      column_offset_temp.resize(M);
+      ComputeColumnOffsets<T_signed>(
+          KernelDim_(),
+          M,
+          W_quantized_.data(),
+          filter_qparams_,
+          column_offset_temp);
+      column_offset_ptr = &column_offset_temp;
+    }
+    for (int i = 0; i < M; ++i) {
+      (*b_quantized_)[i] -= in_qparams_[0].zero_point * (*column_offset_ptr)[i];
+    }
+  }
 }
@@ -465,13 +534,13 @@
   QuantizeWeight_();
   PreComputeRowColumnOffsets_();
+  QuantizeBias_();
+
   if (Wq_packed_ && !FLAGS_caffe2_dnnlowp_dump_tensors) {
     // From here, W_quantized_ is not used anymore when we have Wq_packed_
     vector<T_signed>().swap(W_quantized_);
   }
 
-  QuantizeBias_();
-
   bool fp32_executed = false;
   if (HasStaticQuantization(this)) {
     out_qparams_ = GetStaticQuantizationParamsOf(this, 0);
@@ -750,8 +819,10 @@ void ConvDNNLowPOp<T, ReluFused>::RunOnDeviceEpilogueNHWC_(
         for (int j = group_id * (M / group_);
              j < (group_id + 1) * (M / group_);
              ++j) {
-          int32_t raw = Y_int32[i * M + j] -
-              A_zero_point * (*column_offsets_)[j] - row_offset;
+          int32_t raw = Y_int32[i * M + j] - row_offset;
+          if (!column_offsets_->empty()) {
+            raw -= A_zero_point * (*column_offsets_)[j];
+          }
           if (b_quantized_data_) {
             raw += b_quantized_data_[j];
           }
@@ -808,10 +879,12 @@
             reinterpret_cast<uint8_t*>(Ydata + i * M + group_id * (M / group_)),
             &C_multiplier,
             C_zero_point,
-            A_zero_point,
+            column_offsets_->empty() ? 0 : A_zero_point,
             &B_zero_point,
             &row_offset,
-            column_offsets_->data() + group_id * (M / group_),
+            column_offsets_->empty()
+                ? nullptr
+                : column_offsets_->data() + group_id * (M / group_),
             b_quantized_data_ ? b_quantized_data_ + group_id * (M / group_)
                               : nullptr,
             M / group_,
@@ -834,8 +907,10 @@
         for (int j = group_id * (M / group_);
              j < (group_id + 1) * (M / group_);
              ++j) {
-          int32_t raw = Y_int32[i * M + j] -
-              A_zero_point * (*column_offsets_)[j] - row_offset;
+          int32_t raw = Y_int32[i * M + j] - row_offset;
+          if (!column_offsets_->empty()) {
+            raw -= A_zero_point * (*column_offsets_)[j];
+          }
           if (b_quantized_data_) {
             raw += b_quantized_data_[j];
           }
@@ -1006,10 +1081,11 @@ void ConvDNNLowPOp<T, ReluFused>::DispatchFBGEMM_(
       doNothingObj,
       requantization_multipliers_.data(),
       out_qparams_.zero_point,
-      in_qparams_[INPUT].zero_point,
+      // column_offsets_ empty means column_offsets_ are folded into bias
+      column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
       filter_zero_points_.data(),
       packA.getRowOffsetBuffer(),
-      column_offsets_->data(),
+      column_offsets_->empty() ? nullptr : column_offsets_->data(),
       b_quantized_data_,
       M,
       group_);
@@ -1074,6 +1150,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
         this->stride_[0],
         this->stride_[1],
         this->stride_[2],
+        // Shouldn't pass 0 if column_offsets_ is empty here because we
+        // need zero_point for padding
         in_qparams_[INPUT].zero_point,
         reinterpret_cast<const uint8_t*>(Xdata),
         filter_zero_points_.data(),
@@ -1081,7 +1159,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
         requantization_multipliers_.data(),
         out_qparams_.zero_point,
         Y_uint8_data,
-        column_offsets_->data(),
+        // column_offsets_ empty means column_offsets_ are folded into bias
+        column_offsets_->empty() ? nullptr : column_offsets_->data(),
         b_quantized_data_,
         ReluFused,
         dnnlowp_get_thread_num(),
@@ -1096,6 +1175,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
         this->stride_[0],
         this->stride_[1],
         this->stride_[2],
+        // Shouldn't pass 0 if column_offsets_ is empty here because we
+        // need zero_point for padding
         in_qparams_[INPUT].zero_point,
         reinterpret_cast<const uint8_t*>(Xdata),
         FilterQuantizationParams(0).zero_point,
@@ -1103,7 +1184,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
         requantization_params_[0].real_multiplier,
         out_qparams_.zero_point,
         Y_uint8_data,
-        column_offsets_->data(),
+        // column_offsets_ empty means column_offsets_ are folded into bias
+        column_offsets_->empty() ? nullptr : column_offsets_->data(),
         b_quantized_data_,
         ReluFused,
         dnnlowp_get_thread_num(),
@@ -1130,6 +1212,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
           C,
           stride_h(),
           stride_w(),
+          // Shouldn't pass 0 if column_offsets_ is empty here because we
+          // need zero_point for padding
           in_qparams_[INPUT].zero_point,
           reinterpret_cast<const uint8_t*>(Xdata),
           filter_zero_points_.data(),
@@ -1137,7 +1219,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
           requantization_multipliers_.data(),
           out_qparams_.zero_point,
           Y_uint8_data,
-          column_offsets_->data(),
+          // column_offsets_ empty means column_offsets_ are folded into bias
+          column_offsets_->empty() ? nullptr : column_offsets_->data(),
           b_quantized_data_,
           ReluFused,
           dnnlowp_get_thread_num(),
@@ -1150,6 +1235,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
           C,
           stride_h(),
           stride_w(),
+          // Shouldn't pass 0 if column_offsets_ is empty here because we
+          // need zero_point for padding
           in_qparams_[INPUT].zero_point,
           reinterpret_cast<const uint8_t*>(Xdata),
           FilterQuantizationParams(0).zero_point,
@@ -1157,7 +1242,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
           requantization_params_[0].real_multiplier,
           out_qparams_.zero_point,
           Y_uint8_data,
-          column_offsets_->data(),
+          // column_offsets_ empty means column_offsets_ are folded into bias
+          column_offsets_->empty() ? nullptr : column_offsets_->data(),
           b_quantized_data_,
           ReluFused,
           dnnlowp_get_thread_num(),
@@ -1198,10 +1286,11 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
             doNothingObj,
             requantization_multipliers_.data(),
             out_qparams_.zero_point,
-            in_qparams_[INPUT].zero_point,
+            // column_offsets_ empty means column_offsets_ are folded into bias
+            column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
             filter_zero_points_.data(),
             row_offsets_.data() + tid * row_offset_size_per_thread,
-            column_offsets_->data(),
+            column_offsets_->empty() ? nullptr : column_offsets_->data(),
             b_quantized_data_,
             conv_p.OC,
             conv_p.G);
@@ -1209,6 +1298,8 @@
         fbgemmGroupwiseConv(
             conv_p,
             reinterpret_cast<const uint8_t*>(Xdata),
+            // Shouldn't pass 0 if column_offsets_ is empty here because we
+            // need zero_point for padding
            in_qparams_[INPUT].zero_point,
             row_offsets_.data() + tid * row_offset_size_per_thread,
             *Wq_gconv_packed_,
@@ -1222,12 +1313,13 @@
             doNothingObj,
             requantization_multipliers_.data(),
             out_qparams_.zero_point,
-            in_qparams_[INPUT].zero_point,
+            // column_offsets_ empty means column_offsets_ are folded into bias
+            column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
             filter_zero_points_.data(),
             filter_zero_points_[0]
                 ? row_offsets_.data() + tid * row_offset_size_per_thread
                 : nullptr,
-            column_offsets_->data(),
+            column_offsets_->empty() ? nullptr : column_offsets_->data(),
             b_quantized_data_,
             conv_p.OC,
             conv_p.G);
@@ -1235,6 +1327,8 @@
         fbgemmGroupwiseConv(
             conv_p,
             reinterpret_cast<const uint8_t*>(Xdata),
+            // Shouldn't pass 0 if column_offsets_ is empty here because we
+            // need zero_point for padding
             in_qparams_[INPUT].zero_point,
             filter_zero_points_[0]
                 ? row_offsets_.data() + tid * row_offset_size_per_thread
@@ -1261,6 +1355,9 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
       row_offset_size_per_thread =
           PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize();
       x_pack_buf_size_per_thread =
          PackAWithIm2Col<uint8_t, int16_t>::packedBufferSize();
+    } else if (!quantize_groupwise_ && filter_zero_points_[0] == 0) {
+      row_offset_size_per_thread = 0;
+      x_pack_buf_size_per_thread = PackAMatrix<uint8_t, int16_t>::packedBufferSize();
     } else {
       row_offset_size_per_thread =
           PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
@@ -1303,6 +1400,8 @@
             reinterpret_cast<const uint8_t*>(col_buffer_data),
             // buffer for packed matrix
             X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+            // Shouldn't pass 0 if column_offsets_ is empty here because we
+            // need zero_point for padding
             in_qparams_[INPUT].zero_point,
             row_offsets_.data() + tid * row_offset_size_per_thread);
 
@@ -1337,6 +1436,8 @@
             reinterpret_cast<const uint8_t*>(col_buffer_data),
             // buffer for packed matrix
             X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+            // Shouldn't pass 0 if column_offsets_ is empty here because we
+            // need zero_point for padding
             in_qparams_[INPUT].zero_point,
             row_offsets_.data() + tid * row_offset_size_per_thread);
 
@@ -1350,6 +1451,20 @@
             QuantizationGranularity::TENSOR>(packA, Y_int32, Y_uint8_data);
       }
     } // 3D
+  } else if (!quantize_groupwise_ && filter_zero_points_[0] == 0) {
+    // no im2col fusion
+    PackAMatrix<uint8_t, int16_t> packA(
+        matrix_op_t::NoTranspose,
+        N * Y_HxW,
+        group_ * kernel_dim,
+        reinterpret_cast<const uint8_t*>(col_buffer_data),
+        group_ * kernel_dim,
+        // buffer for packed matrix
+        X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+        group_);
+
+    DispatchFBGEMM_<PackAMatrix<uint8_t, int16_t>, QuantizationGranularity::TENSOR>(
+        packA, Y_int32, Y_uint8_data);
   } else {
     // no im2col fusion
     PackAWithRowOffset<uint8_t, int16_t> packA(
diff --git a/caffe2/quantization/server/conv_dnnlowp_op.h b/caffe2/quantization/server/conv_dnnlowp_op.h
index 6f6c8ea..268a4cf 100644
--- a/caffe2/quantization/server/conv_dnnlowp_op.h
+++ b/caffe2/quantization/server/conv_dnnlowp_op.h
@@ -127,7 +127,8 @@ class ConvDNNLowPOp : public ConvPoolDNNLowPOpBase<T, ConvFp32Op> {
   // pre-computed biases and offsets
   std::shared_ptr<std::vector<std::int32_t>> b_quantized_;
 
-  float in_qparams_scale_old_ = 0;
+  float in_qparams_scale_old_{0};
+  std::int32_t in_qparams_zero_point_old_{0};
 }; // class ConvDNNLowPOp
 
 } // namespace caffe2
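For reference, the identity behind the folding can be checked with a small
standalone program. The sketch below is not the Caffe2/FBGEMM code; the file
name, toy sizes, and all numeric values are made up for illustration. For an
int32 accumulation acc = sum_k A[i][k] * B[k][j], the dequantization
correction A_zero_point * col_offset[j] depends only on the output column, so
it can be subtracted from the quantized bias once during pre-processing,
after which the requantization epilogue behaves as if A were symmetrically
quantized. The same epilogue also shows the second optimization in this diff:
when the weight zero point is 0 as well, the row_offset term vanishes, which
is what allows packing the activations with PackAMatrix instead of
PackAWithRowOffset.

// fold_col_offset_demo.cc -- standalone illustration, not the Caffe2 operator.
// Build: g++ -std=c++11 fold_col_offset_demo.cc && ./a.out
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Hypothetical toy sizes and quantization parameters.
  const int M = 2, N = 3, K = 4; // A: MxK uint8 activations, B: KxN int8 weights
  const std::int32_t A_zero_point = 11; // asymmetric activation quantization
  const std::int32_t B_zero_point = 3;  // per-tensor weight zero point

  const std::vector<std::uint8_t> A = {12, 0, 255, 7, 9, 200, 3, 45};
  const std::vector<std::int8_t> B = {1, -2, 3, 4, 5, -6, 7, 8, -9, 10, 11, -12};
  const std::vector<std::int32_t> bias = {100, -50, 7};

  // col_offset[j] = sum_k B[k][j]; row_offset[i] = sum_k A[i][k]
  std::vector<std::int32_t> col_offset(N, 0), row_offset(M, 0);
  for (int k = 0; k < K; ++k)
    for (int j = 0; j < N; ++j)
      col_offset[j] += B[k * N + j];
  for (int i = 0; i < M; ++i)
    for (int k = 0; k < K; ++k)
      row_offset[i] += A[i * K + k];

  // Pre-processing step: fold the column-offset correction into the bias.
  std::vector<std::int32_t> folded_bias(N);
  for (int j = 0; j < N; ++j)
    folded_bias[j] = bias[j] - A_zero_point * col_offset[j];

  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      std::int32_t acc = 0; // raw int32 accumulation of quantized values
      for (int k = 0; k < K; ++k)
        acc += static_cast<std::int32_t>(A[i * K + k]) * B[k * N + j];

      // Original epilogue: both zero-point corrections applied per element.
      const std::int32_t ref = acc - A_zero_point * col_offset[j] -
          B_zero_point * row_offset[i] + K * A_zero_point * B_zero_point +
          bias[j];

      // Epilogue with the folded bias: the A_zero_point * col_offset[j] term
      // is gone, i.e. A is treated as if it were symmetrically quantized.
      // If B_zero_point were 0 too, row_offset would never be needed, which
      // is what enables the cheaper PackAMatrix path added in this diff.
      const std::int32_t opt = acc - B_zero_point * row_offset[i] +
          K * A_zero_point * B_zero_point + folded_bias[j];

      assert(ref == opt);
    }
  }
  return 0;
}

Note that this identity covers only the GEMM epilogue. As the commit message
says, steps that involve padding still consume the real A zero point, which
is why the diff keeps passing in_qparams_[INPUT].zero_point to the fbgemm
depthwise and groupwise kernels while passing 0 (and a null column-offset
pointer) to the requantization objects once the offsets are folded into bias.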