}
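+ // The packing class is a new template parameter so DispatchFBGEMM_ can be
+ // instantiated with fbgemm::PackAWithRowOffset or, when the filter zero
+ // point is 0 and row offsets are unnecessary, fbgemm::PackAMatrix.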
template <bool ReluFused>
-template <fbgemm::QuantizationGranularity Q_GRAN>
+template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
void ConvDNNLowPAcc16Op<ReluFused>::DispatchFBGEMM_(
- fbgemm::PackAWithRowOffset<uint8_t, int16_t>& packA,
+ PackAMatrix& packA,
const uint8_t* col_buffer_data,
vector<int32_t>* Y_int32,
uint8_t* Y_uint8_data) {
doNothingObj,
this->requantization_multipliers_.data(),
out_qparams_.zero_point,
- in_qparams_[INPUT].zero_point,
+ // column_offsets_ empty means column_offsets_ are folded into bias
+ this->column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
this->filter_zero_points_.data(),
packA.getRowOffsetBuffer(),
- this->column_offsets_->data(),
+ this->column_offsets_->empty() ? nullptr : this->column_offsets_->data(),
InputSize() == 3 ? this->b_quantized_data_ : nullptr,
M,
group_);
int row_offset_size_per_thread = -1;
int x_pack_buf_size_per_thread = -1;
if (Wq_acc16_packed_) {
- row_offset_size_per_thread =
- PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
- x_pack_buf_size_per_thread =
- PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
- row_offsets_.resize(
- dnnlowp_get_max_threads() * row_offset_size_per_thread);
- X_pack_buf_.resize(
- dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
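+ // With per-tensor quantization and a zero filter zero point, the
+ // B_zero_point * row_offset correction vanishes, so A can be packed with
+ // PackAMatrix and no row-offset buffer is needed.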
+ if (!this->quantize_groupwise_ && this->filter_zero_points_[0] == 0) {
+ x_pack_buf_size_per_thread =
+ PackAMatrix<uint8_t, int16_t>::packedBufferSize();
+ X_pack_buf_.resize(
+ dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
+ } else {
+ row_offset_size_per_thread =
+ PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
+ x_pack_buf_size_per_thread =
+ PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
+ row_offsets_.resize(
+ dnnlowp_get_max_threads() * row_offset_size_per_thread);
+ X_pack_buf_.resize(
+ dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
+ }
}
uint8_t* Y_uint8_data = Y->template mutable_data<uint8_t>();
int tid = dnnlowp_get_thread_num();
// no im2col fusion
- PackAWithRowOffset<uint8_t, int16_t> packA(
- matrix_op_t::NoTranspose,
- N * output_image_size,
- group_ * kernel_dim,
- col_buffer_data,
- group_ * kernel_dim,
- X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
- group_,
- row_offsets_.data() + tid * row_offset_size_per_thread);
-
- if (this->quantize_groupwise_) {
- DispatchFBGEMM_<QuantizationGranularity::GROUP>(
- packA, col_buffer_data, Y_int32, Y_uint8_data);
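+ // Fast path: per-tensor quantization with a zero filter zero point never
+ // consumes row offsets, so pack A without computing them.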
+ if (!this->quantize_groupwise_ && this->filter_zero_points_[0] == 0) {
+ PackAMatrix<uint8_t, int16_t> packA(
+ matrix_op_t::NoTranspose,
+ N * output_image_size,
+ group_ * kernel_dim,
+ col_buffer_data,
+ group_ * kernel_dim,
+ X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+ group_);
+
+ // quantize_groupwise_ is false in this branch, so only the per-tensor
+ // granularity dispatch is reachable.
+ DispatchFBGEMM_<
+ PackAMatrix<uint8_t, int16_t>,
+ QuantizationGranularity::TENSOR>(
+ packA, col_buffer_data, Y_int32, Y_uint8_data);
} else {
- DispatchFBGEMM_<QuantizationGranularity::TENSOR>(
- packA, col_buffer_data, Y_int32, Y_uint8_data);
+ // no im2col fusion
+ PackAWithRowOffset<uint8_t, int16_t> packA(
+ matrix_op_t::NoTranspose,
+ N * output_image_size,
+ group_ * kernel_dim,
+ col_buffer_data,
+ group_ * kernel_dim,
+ X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+ group_,
+ row_offsets_.data() + tid * row_offset_size_per_thread);
+
+ if (this->quantize_groupwise_) {
+ DispatchFBGEMM_<
+ PackAWithRowOffset<uint8_t, int16_t>,
+ QuantizationGranularity::GROUP>(
+ packA, col_buffer_data, Y_int32, Y_uint8_data);
+ } else {
+ DispatchFBGEMM_<
+ PackAWithRowOffset<uint8_t, int16_t>,
+ QuantizationGranularity::TENSOR>(
+ packA, col_buffer_data, Y_int32, Y_uint8_data);
+ }
}
} else {
// slow path
template <typename T, bool ReluFused>
void ConvDNNLowPOp<T, ReluFused>::PreComputeRowColumnOffsets_() {
+ if (this->order_ == StorageOrder::NHWC &&
+ this->template InputIsType<int8::Int8TensorCPU>(INPUT)) {
+ // The input tensor is already quantized (no dynamic quantization), so
+ // column_offsets_ will be folded into the bias and nothing needs to be
+ // precomputed here.
+ return;
+ }
+
const auto& filter = InputTensorCPU_(FILTER);
int kernel_dim = KernelDim_();
int M = filter.dim32(0);
// Pre-compute row_offset / column_offset
vector<int>& offsets =
- StorageOrder::NCHW == ConvPoolOpBase<CPUContext>::order_
- ? row_offsets_
- : *column_offsets_;
+ this->order_ == StorageOrder::NCHW ? row_offsets_ : *column_offsets_;
if (offsets.empty()) {
if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
// Quantize bias
if (has_bias &&
(!b_quantized_data_ ||
- in_qparams_[INPUT].scale != in_qparams_scale_old_)) {
+ in_qparams_[INPUT].scale != in_qparams_scale_old_ ||
+ in_qparams_[INPUT].zero_point != in_qparams_zero_point_old_)) {
if (has_packed_bias) {
const auto& packed_filter =
this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
}
b_quantized_data_ = b_quantized_->data();
}
- in_qparams_scale_old_ = in_qparams_[INPUT].scale;
}
+ in_qparams_scale_old_ = in_qparams_[INPUT].scale;
+ in_qparams_zero_point_old_ = in_qparams_[INPUT].zero_point;
CAFFE_ENFORCE(b_quantized_data_);
+
+ // If column_offsets_ is empty even though column offsets are needed (the
+ // input is asymmetrically quantized), fold them into the bias.
+ if (this->order_ == StorageOrder::NHWC && in_qparams_[INPUT].zero_point &&
+ column_offsets_->empty()) {
+ if (b_quantized_->empty()) {
+ b_quantized_->assign(b_quantized_data_, b_quantized_data_ + M);
+ b_quantized_data_ = b_quantized_->data();
+ }
+ vector<int32_t>* column_offset_ptr;
+ vector<int32_t> column_offset_temp;
+ if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
+ const auto& packed_filter =
+ this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
+ column_offset_ptr = packed_filter.column_offsets.get();
+ } else {
+ column_offset_temp.resize(M);
+ ComputeColumnOffsets<T_signed>(
+ KernelDim_(),
+ M,
+ W_quantized_.data(),
+ filter_qparams_,
+ column_offset_temp);
+ column_offset_ptr = &column_offset_temp;
+ }
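+ // Folding rationale: requantization computes, per output channel j,
+ //   raw[i][j] = sum_k A[i][k] * W[k][j]
+ //             - A_zero_point * column_offset[j]
+ //             - W_zero_point[j] * row_offset[i]
+ //             + bias[j]
+ // The column-offset term does not depend on the row i, so subtracting it
+ // from bias[j] once lets the GEMM skip the column offsets and pass 0 as
+ // the A zero point.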
+ for (int i = 0; i < M; ++i) {
+ (*b_quantized_)[i] -=
+ in_qparams_[INPUT].zero_point * (*column_offset_ptr)[i];
+ }
+ }
+ }
+
+ if (!has_bias && this->order_ == StorageOrder::NHWC &&
+ in_qparams_[INPUT].zero_point && column_offsets_->empty() &&
+ !b_quantized_data_) {
+ // No bias input: create a zero-filled one so the column-offset values can
+ // be folded into it.
+ b_quantized_->resize(M, 0);
+ b_quantized_data_ = b_quantized_->data();
+
+ vector<int32_t>* column_offset_ptr;
+ vector<int32_t> column_offset_temp;
+ if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
+ const auto& packed_filter =
+ this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
+ column_offset_ptr = packed_filter.column_offsets.get();
+ } else {
+ column_offset_temp.resize(M);
+ ComputeColumnOffsets<T_signed>(
+ KernelDim_(),
+ M,
+ W_quantized_.data(),
+ filter_qparams_,
+ column_offset_temp);
+ column_offset_ptr = &column_offset_temp;
+ }
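+ // Same folding as above: pre-subtract A_zero_point * column_offset[j]
+ // from the zero-initialized bias.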
+ for (int i = 0; i < M; ++i) {
+ (*b_quantized_)[i] -=
+ in_qparams_[INPUT].zero_point * (*column_offset_ptr)[i];
+ }
}
}
QuantizeWeight_();
PreComputeRowColumnOffsets_();
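+ // QuantizeBias_ may fold column offsets into the bias, which reads
+ // W_quantized_, so it must run before W_quantized_ is freed below.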
+ QuantizeBias_();
+
if (Wq_packed_ && !FLAGS_caffe2_dnnlowp_dump_tensors) {
// From here, W_quantized_ is not used anymore when we have Wq_packed_
vector<T_signed>().swap(W_quantized_);
}
- QuantizeBias_();
-
bool fp32_executed = false;
if (HasStaticQuantization(this)) {
out_qparams_ = GetStaticQuantizationParamsOf(this, 0);
for (int j = group_id * (M / group_); j < (group_id + 1) * (M / group_);
++j) {
- int32_t raw = Y_int32[i * M + j] -
- A_zero_point * (*column_offsets_)[j] - row_offset;
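+ // An empty column_offsets_ means the offsets were already folded into
+ // the bias, so skip the A_zero_point correction.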
+ int32_t raw = Y_int32[i * M + j] - row_offset;
+ if (!column_offsets_->empty()) {
+ raw -= A_zero_point * (*column_offsets_)[j];
+ }
if (b_quantized_data_) {
raw += b_quantized_data_[j];
}
reinterpret_cast<uint8_t*>(Ydata + i * M + group_id * (M / group_)),
&C_multiplier,
C_zero_point,
- A_zero_point,
+ column_offsets_->empty() ? 0 : A_zero_point,
&B_zero_point,
&row_offset,
- column_offsets_->data() + group_id * (M / group_),
+ column_offsets_->empty()
+ ? nullptr
+ : column_offsets_->data() + group_id * (M / group_),
b_quantized_data_ ? b_quantized_data_ + group_id * (M / group_)
: nullptr,
M / group_,
for (int j = group_id * (M / group_); j < (group_id + 1) * (M / group_);
++j) {
- int32_t raw = Y_int32[i * M + j] -
- A_zero_point * (*column_offsets_)[j] - row_offset;
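+ // As above, the A_zero_point correction is skipped when the offsets were
+ // folded into the bias.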
+ int32_t raw = Y_int32[i * M + j] - row_offset;
+ if (!column_offsets_->empty()) {
+ raw -= A_zero_point * (*column_offsets_)[j];
+ }
if (b_quantized_data_) {
raw += b_quantized_data_[j];
}
doNothingObj,
requantization_multipliers_.data(),
out_qparams_.zero_point,
- in_qparams_[INPUT].zero_point,
+ // column_offsets_ empty means column_offsets_ are folded into bias
+ column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
filter_zero_points_.data(),
packA.getRowOffsetBuffer(),
- column_offsets_->data(),
+ column_offsets_->empty() ? nullptr : column_offsets_->data(),
b_quantized_data_,
M,
group_);
this->stride_[0],
this->stride_[1],
this->stride_[2],
+ // Don't pass 0 even when column_offsets_ is empty (folded into bias):
+ // the real zero_point is still needed for padding.
in_qparams_[INPUT].zero_point,
reinterpret_cast<const uint8_t*>(Xdata),
filter_zero_points_.data(),
requantization_multipliers_.data(),
out_qparams_.zero_point,
Y_uint8_data,
- column_offsets_->data(),
+ // column_offsets_ empty means column_offsets_ are folded into bias
+ column_offsets_->empty() ? nullptr : column_offsets_->data(),
b_quantized_data_,
ReluFused,
dnnlowp_get_thread_num(),
this->stride_[0],
this->stride_[1],
this->stride_[2],
+ // Don't pass 0 even when column_offsets_ is empty (folded into bias):
+ // the real zero_point is still needed for padding.
in_qparams_[INPUT].zero_point,
reinterpret_cast<const uint8_t*>(Xdata),
FilterQuantizationParams(0).zero_point,
requantization_params_[0].real_multiplier,
out_qparams_.zero_point,
Y_uint8_data,
- column_offsets_->data(),
+ // column_offsets_ empty means column_offsets_ are folded into bias
+ column_offsets_->empty() ? nullptr : column_offsets_->data(),
b_quantized_data_,
ReluFused,
dnnlowp_get_thread_num(),
C,
stride_h(),
stride_w(),
+ // Don't pass 0 even when column_offsets_ is empty (folded into bias):
+ // the real zero_point is still needed for padding.
in_qparams_[INPUT].zero_point,
reinterpret_cast<const uint8_t*>(Xdata),
filter_zero_points_.data(),
requantization_multipliers_.data(),
out_qparams_.zero_point,
Y_uint8_data,
- column_offsets_->data(),
+ // column_offsets_ empty means column_offsets_ are folded into bias
+ column_offsets_->empty() ? nullptr : column_offsets_->data(),
b_quantized_data_,
ReluFused,
dnnlowp_get_thread_num(),
C,
stride_h(),
stride_w(),
+ // Don't pass 0 even when column_offsets_ is empty (folded into bias):
+ // the real zero_point is still needed for padding.
in_qparams_[INPUT].zero_point,
reinterpret_cast<const uint8_t*>(Xdata),
FilterQuantizationParams(0).zero_point,
requantization_params_[0].real_multiplier,
out_qparams_.zero_point,
Y_uint8_data,
- column_offsets_->data(),
+ // column_offsets_ empty means column_offsets_ are folded into bias
+ column_offsets_->empty() ? nullptr : column_offsets_->data(),
b_quantized_data_,
ReluFused,
dnnlowp_get_thread_num(),
doNothingObj,
requantization_multipliers_.data(),
out_qparams_.zero_point,
- in_qparams_[INPUT].zero_point,
+ // column_offsets_ empty means column_offsets_ are folded into bias
+ column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
filter_zero_points_.data(),
row_offsets_.data() + tid * row_offset_size_per_thread,
- column_offsets_->data(),
+ column_offsets_->empty() ? nullptr : column_offsets_->data(),
b_quantized_data_,
conv_p.OC,
conv_p.G);
fbgemmGroupwiseConv(
conv_p,
reinterpret_cast<const uint8_t*>(Xdata),
+ // Don't pass 0 even when column_offsets_ is empty (folded into bias):
+ // the real zero_point is still needed for padding.
in_qparams_[INPUT].zero_point,
row_offsets_.data() + tid * row_offset_size_per_thread,
*Wq_gconv_packed_,
doNothingObj,
requantization_multipliers_.data(),
out_qparams_.zero_point,
- in_qparams_[INPUT].zero_point,
+ // column_offsets_ empty means column_offsets_ are folded into bias
+ column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
filter_zero_points_.data(),
filter_zero_points_[0]
? row_offsets_.data() + tid * row_offset_size_per_thread
: nullptr,
- column_offsets_->data(),
+ column_offsets_->empty() ? nullptr : column_offsets_->data(),
b_quantized_data_,
conv_p.OC,
conv_p.G);
fbgemmGroupwiseConv(
conv_p,
reinterpret_cast<const uint8_t*>(Xdata),
+ // Don't pass 0 even when column_offsets_ is empty (folded into bias):
+ // the real zero_point is still needed for padding.
in_qparams_[INPUT].zero_point,
filter_zero_points_[0]
? row_offsets_.data() + tid * row_offset_size_per_thread
row_offset_size_per_thread =
PackAWithIm2Col<uint8_t>::rowOffsetBufferSize();
x_pack_buf_size_per_thread = PackAWithIm2Col<uint8_t>::packedBufferSize();
+ } else if (!quantize_groupwise_ && filter_zero_points_[0] == 0) {
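+ // A zero filter zero point means the row offsets of A are never consumed,
+ // so only the packed-A buffer needs to be allocated.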
+ row_offset_size_per_thread = 0;
+ x_pack_buf_size_per_thread = PackAMatrix<uint8_t>::packedBufferSize();
} else {
row_offset_size_per_thread =
PackAWithRowOffset<uint8_t>::rowOffsetBufferSize();
reinterpret_cast<const uint8_t*>(col_buffer_data),
// buffer for packed matrix
X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+ // Don't pass 0 even when column_offsets_ is empty (folded into bias):
+ // the real zero_point is still needed for padding.
in_qparams_[INPUT].zero_point,
row_offsets_.data() + tid * row_offset_size_per_thread);
reinterpret_cast<const uint8_t*>(col_buffer_data),
// buffer for packed matrix
X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+ // Don't pass 0 even when column_offsets_ is empty (folded into bias):
+ // the real zero_point is still needed for padding.
in_qparams_[INPUT].zero_point,
row_offsets_.data() + tid * row_offset_size_per_thread);
QuantizationGranularity::TENSOR>(packA, Y_int32, Y_uint8_data);
}
} // 3D
+ } else if (!quantize_groupwise_ && filter_zero_points_[0] == 0) {
+ // no im2col fusion
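+ // filter_zero_points_[0] == 0 makes the row-offset correction vanish, so
+ // the cheaper PackAMatrix (no row-offset buffer) suffices.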
+ PackAMatrix<uint8_t> packA(
+ matrix_op_t::NoTranspose,
+ N * Y_HxW,
+ group_ * kernel_dim,
+ reinterpret_cast<const uint8_t*>(col_buffer_data),
+ group_ * kernel_dim,
+ // buffer for packed matrix
+ X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+ group_);
+
+ DispatchFBGEMM_<PackAMatrix<uint8_t>, QuantizationGranularity::TENSOR>(
+ packA, Y_int32, Y_uint8_data);
} else {
// no im2col fusion
PackAWithRowOffset<uint8_t> packA(