FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency)) {}
template <bool ReluFused>
-bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHW() {
- if (fallback_to_32_bit_accumulation_) {
- return BaseType::RunOnDeviceWithOrderNCHW();
- }
- const Tensor& X = InputTensorCPU_(INPUT);
- if (X.template IsType<uint8_t>()) {
- return RunOnDeviceWithOrderNCHWAndType_<uint8_t>();
- } else {
- assert(X.template IsType<float>());
- return RunOnDeviceWithOrderNCHWAndType_<float>();
- }
-}
-
-template <bool ReluFused>
-bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() {
+bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
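+  // If we have already fallen back to 32-bit accumulation, the base op's run path
+  // handles quantization, so skip the acc16-specific setup.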
if (fallback_to_32_bit_accumulation_) {
- return BaseType::RunOnDeviceWithOrderNHWC();
- }
- const Tensor& X = InputTensorCPU_(INPUT);
- if (X.template IsType<uint8_t>()) {
- return RunOnDeviceWithOrderNHWCAndType_<uint8_t>();
- } else {
- assert(X.template IsType<float>());
- return RunOnDeviceWithOrderNHWCAndType_<float>();
+ return true;
}
-}
-template <bool ReluFused>
-bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
if (!BaseType::GetQuantizationParameters_()) {
return false;
}
}
template <bool ReluFused>
-template <typename InType>
-bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHWAndType_() {
+bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHW() {
VLOG(2) << "Running DNNLOWP_ACC16 Conv";
using namespace dnnlowp;
  if (!GetQuantizationParameters_()) {
    return false;
  }
if (fallback_to_32_bit_accumulation_) {
- return BaseType::template RunOnDeviceWithOrderNCHWAndType_<InType>();
+ return BaseType::RunOnDeviceWithOrderNCHW();
}
const Tensor& X = InputTensorCPU_(INPUT);
  // The col buffer is also stored in CHW order: kernel_dim, then the output
  // height and width.
- const InType* Xdata = X.template data<InType>();
+ const uint8_t* Xdata = X.template data<uint8_t>();
col_buffer_.Resize(buffer_shape);
- InType* col_buffer_data = col_buffer_.template mutable_data<InType>();
+ uint8_t* col_buffer_data = col_buffer_.template mutable_data<uint8_t>();
auto f = [&](vector<int32_t>* Y_int32) {
Y_int32->resize(M * output_image_size * dnnlowp_get_max_threads());
    vector<int> buffer_shape_per_thread(
        buffer_shape.begin() + 1, buffer_shape.end());
// Im2Col, followed by gemm.
- vector<uint8_t> Y_temp;
- uint8_t* Y_data;
- float* Y_data_float = nullptr;
- if (dequantize_output_) {
- Y_temp.resize(Y->numel());
- Y_data = Y_temp.data();
- Y_data_float = Y->template mutable_data<float>();
- } else {
- Y_data = Y->template mutable_data<uint8_t>();
- }
+ uint8_t* Y_data = Y->template mutable_data<uint8_t>();
this->column_offsets_->resize(
output_image_size * dnnlowp_get_max_threads());
int tid = dnnlowp_get_thread_num();
for (int group_id = 0; group_id < group_; ++group_id) {
if (this->kernel_.size() == 2) {
- math::Im2ColNCHW<InType>(
+ math::Im2ColNCHW<uint8_t>(
C / group_,
input_dims[0],
input_dims[1],
Xdata + (group_ * image_id + group_id) * input_offset,
col_buffer_data + tid * col_buffer_size,
&context_,
- X.IsType<uint8_t>() ? in_qparams_[INPUT].zero_point : 0);
+ in_qparams_[INPUT].zero_point);
} else {
- math::Im2ColNdNCHW<InType>(
+ math::Im2ColNdNCHW<uint8_t>(
this->kernel_.size(),
C * input_image_size,
col_buffer_size,
Xdata + (group_ * image_id + group_id) * input_offset,
col_buffer_data + tid * col_buffer_size,
&context_,
- X.IsType<uint8_t>() ? in_qparams_[INPUT].zero_point : 0);
+ in_qparams_[INPUT].zero_point);
}
-        // quantize col_buffer
+        // col_buffer is already quantized; take this thread's slice of it
- uint8_t* col_buffer_quantized_data = nullptr;
- vector<uint8_t> col_buffer_quantized;
- if (X.template IsType<uint8_t>()) {
- col_buffer_quantized_data =
- reinterpret_cast<uint8_t*>(col_buffer_data) +
- tid * col_buffer_size;
- } else {
- col_buffer_quantized.resize(kernel_dim * output_image_size);
- fbgemm::Quantize<uint8_t>(
- reinterpret_cast<const float*>(col_buffer_data) +
- tid * col_buffer_size,
- col_buffer_quantized.data(),
- col_buffer_quantized.size(),
- in_qparams_[INPUT]);
- col_buffer_quantized_data = col_buffer_quantized.data();
- }
+ uint8_t* col_buffer_private = col_buffer_data + tid * col_buffer_size;
// main GEMM
        int32_t* Y_int32_temp = Y_int32->data() +
            ((M / group_) * group_id + M * tid) * output_image_size;
int16_t int16_sum = 0;
for (int k = 0; k < kernel_dim; ++k) {
int32_t w = W_quantized_group[i * kernel_dim + k];
- int32_t x = col_buffer_quantized_data[k * output_image_size + j];
+ int32_t x = col_buffer_private[k * output_image_size + j];
#ifdef DNNLOWP_ACC16_IN_SLOW_PATH
int16_sum = std::max<int32_t>(
numeric_limits<int16_t>::min(),
}
}
- if (dequantize_output_) {
- this->RunOnDeviceEpilogueNCHW_(
- col_buffer_quantized_data,
- Y_int32_temp,
- Y_data_float +
- (M * image_id + M / group_ * group_id) * output_image_size,
- M / group_ * group_id,
- group_id);
- } else {
- this->RunOnDeviceEpilogueNCHW_(
- col_buffer_quantized_data,
- Y_int32_temp,
- Y_data +
- (M * image_id + M / group_ * group_id) * output_image_size,
- M / group_ * group_id,
- group_id);
- }
+ this->RunOnDeviceEpilogueNCHW_(
+ col_buffer_private,
+ Y_int32_temp,
+ Y_data + (M * image_id + M / group_ * group_id) * output_image_size,
+ M / group_ * group_id,
+ group_id);
} // for each group
} // for each image_id
}; // f
f(&(this->Y_int32_));
}
- if (!dequantize_output_) {
- PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
- }
+ PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
this->MeasureQuantizationError_();
}
template <bool ReluFused>
-template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
+template <fbgemm::QuantizationGranularity Q_GRAN>
void ConvDNNLowPAcc16Op<ReluFused>::DispatchFBGEMM_(
- PackAMatrix& packA,
- const uint8_t* col_buffer_quantized_data,
+ fbgemm::PackAWithRowOffset<uint8_t, int16_t>& packA,
+ const uint8_t* col_buffer_data,
vector<int32_t>* Y_int32,
uint8_t* Y_uint8_data) {
+ // This function is called within an OpenMP region
auto& filter = InputTensorCPU_(FILTER);
const int M = filter.dim32(0);
- bool fuse_output_pipeline = Wq_acc16_packed_ && !dequantize_output_;
- assert(fuse_output_pipeline);
+ assert(Wq_acc16_packed_.get());
int kernel_dim = this->KernelDim_();
int nthreads = dnnlowp_get_num_threads();
int32_t,
ReQuantizeOutput<ReluFused, Q_GRAN>>
spmdmObj(
- reqObj,
- col_buffer_quantized_data,
- group_ * kernel_dim,
- *Wq_outlier_,
- group_);
+ reqObj, col_buffer_data, group_ * kernel_dim, *Wq_outlier_, group_);
fbgemmPacked(
packA,
}
template <bool ReluFused>
-template <typename InType>
-bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWCAndType_() {
+bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() {
CAFFE_ENFORCE_LE(
this->kernel_.size(),
3,
}
if (fallback_to_32_bit_accumulation_) {
- return BaseType::template RunOnDeviceWithOrderNHWCAndType_<InType>();
+ return BaseType::RunOnDeviceWithOrderNHWC();
}
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
t_begin = chrono::system_clock::now();
#endif
- bool fuse_output_pipeline = Wq_acc16_packed_ && !dequantize_output_;
bool no_im2col = this->NoIm2ColNHWC_();
// Im2Col, followed by gemm.
auto f2 = [&](Tensor* col_buffer_) {
- const InType* Xdata = X.template data<InType>();
- const InType* col_buffer_data =
- no_im2col ? Xdata : this->template Im2ColNHWC_<InType>(col_buffer_);
+ const uint8_t* Xdata = X.template data<uint8_t>();
+ const uint8_t* col_buffer_data =
+ no_im2col ? Xdata : this->Im2ColNHWC_(col_buffer_);
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
t_end = chrono::system_clock::now();
t_begin = chrono::system_clock::now();
#endif
- // quantize col_buffer
- const uint8_t* col_buffer_quantized_data = nullptr;
- vector<uint8_t> col_buffer_quantized;
- if (X.template IsType<uint8_t>()) {
- col_buffer_quantized_data =
- reinterpret_cast<const uint8_t*>(col_buffer_data);
- } else {
- col_buffer_quantized.resize(
- group_ * kernel_dim * output_image_size * N);
-#ifdef _OPENMP
-#pragma omp parallel
-#endif
- {
- size_t begin, end;
- std::tie(begin, end) = Get1DPartition(
- col_buffer_quantized.size(),
- dnnlowp_get_num_threads(),
- dnnlowp_get_thread_num());
- fbgemm::Quantize<uint8_t>(
- (const float*)col_buffer_data + begin,
- col_buffer_quantized.data() + begin,
- end - begin,
- in_qparams_[INPUT]);
- }
- col_buffer_quantized_data = col_buffer_quantized.data();
- }
-
-#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
- t_end = chrono::system_clock::now();
- dt = chrono::duration<double>(t_end - t_begin).count();
- LOG(INFO) << "this=" << this << " quantize col_buf: " << dt * 1e3
- << " ms";
- t_begin = chrono::system_clock::now();
-#endif
-
using namespace fbgemm;
int row_offset_size_per_thread = -1;
int x_pack_buf_size_per_thread = -1;
if (Wq_acc16_packed_) {
- if (fuse_output_pipeline) {
- row_offset_size_per_thread =
- PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
- x_pack_buf_size_per_thread =
- PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
- row_offsets_.resize(
- dnnlowp_get_max_threads() * row_offset_size_per_thread);
- } else {
- x_pack_buf_size_per_thread =
- PackAMatrix<uint8_t, int16_t>::packedBufferSize();
- }
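+    // With the output pipeline fused into the GEMM, fbgemm needs per-row offsets of
+    // the packed A matrix to compensate for the filter zero points.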
+ row_offset_size_per_thread =
+ PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
+ x_pack_buf_size_per_thread =
+ PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
+ row_offsets_.resize(
+ dnnlowp_get_max_threads() * row_offset_size_per_thread);
X_pack_buf_.resize(
dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
}
- uint8_t* Y_uint8_data = nullptr;
- if (!dequantize_output_) {
- // Output is uint8_t
- Y_uint8_data = Y->template mutable_data<uint8_t>();
- }
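+    // The output is always quantized (uint8_t); dequantize_output is no longer supported.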
+ uint8_t* Y_uint8_data = Y->template mutable_data<uint8_t>();
// Main GEMM for non-outlier
if (Wq_acc16_packed_)
#endif
{
// fast path
- int nthreads = dnnlowp_get_num_threads();
int tid = dnnlowp_get_thread_num();
- if (fuse_output_pipeline) {
- // no im2col fusion
- PackAWithRowOffset<uint8_t, int16_t> packA(
- matrix_op_t::NoTranspose,
- N * output_image_size,
- group_ * kernel_dim,
- col_buffer_quantized_data,
- group_ * kernel_dim,
- X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
- group_,
- row_offsets_.data() + tid * row_offset_size_per_thread);
-
- if (this->quantize_groupwise_) {
- DispatchFBGEMM_<
- PackAWithRowOffset<uint8_t, int16_t>,
- QuantizationGranularity::GROUP>(
- packA, col_buffer_quantized_data, Y_int32, Y_uint8_data);
- } else {
- DispatchFBGEMM_<
- PackAWithRowOffset<uint8_t, int16_t>,
- QuantizationGranularity::TENSOR>(
- packA, col_buffer_quantized_data, Y_int32, Y_uint8_data);
- }
+ // no im2col fusion
+ PackAWithRowOffset<uint8_t, int16_t> packA(
+ matrix_op_t::NoTranspose,
+ N * output_image_size,
+ group_ * kernel_dim,
+ col_buffer_data,
+ group_ * kernel_dim,
+ X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+ group_,
+ row_offsets_.data() + tid * row_offset_size_per_thread);
+
+ if (this->quantize_groupwise_) {
+ DispatchFBGEMM_<QuantizationGranularity::GROUP>(
+ packA, col_buffer_data, Y_int32, Y_uint8_data);
} else {
- // !fuse_output_pipeline
- PackAMatrix<uint8_t, int16_t> packA(
- matrix_op_t::NoTranspose,
- N * output_image_size,
- group_ * kernel_dim,
- col_buffer_quantized_data,
- group_ * kernel_dim,
- X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
- group_); // group
-
- DoNothing<int32_t, int32_t> doNothingObj{};
- memCopy<> memCopyObj(doNothingObj);
- fbgemmPacked(
- packA,
- *Wq_acc16_packed_,
- Y_int32->data(),
- Y_int32->data(),
- M,
- memCopyObj,
- tid, // thread_id
- nthreads); // num_threads
- } // omp parallel
+ DispatchFBGEMM_<QuantizationGranularity::TENSOR>(
+ packA, col_buffer_data, Y_int32, Y_uint8_data);
+ }
} else {
// slow path
conv_nhwc_acc16_ref_(
output_image_size,
M,
kernel_dim,
- col_buffer_quantized_data,
+ col_buffer_data,
W_quantized_.data(),
Y_int32->data()
#ifdef DNNLOWP_ACC16_IN_SLOW_PATH
t_begin = chrono::system_clock::now();
#endif
- if (!fuse_output_pipeline) {
- ConvOutlier_(col_buffer_quantized_data, Y_int32);
+ if (!Wq_acc16_packed_) {
+ ConvOutlier_(col_buffer_data, Y_int32);
}
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
t_begin = chrono::system_clock::now();
#endif
- if (!fuse_output_pipeline) {
- this->RunOnDeviceEpilogueNHWC_(
- col_buffer_quantized_data, Y_int32->data());
+ if (!Wq_acc16_packed_) {
+ this->RunOnDeviceEpilogueNHWC_(col_buffer_data, Y_int32->data());
} else {
- if (!dequantize_output_) {
- PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
- }
+ PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
}
}; // f2
}
template <typename T, bool ReluFused>
-bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNCHW() {
- const Tensor& X = InputTensorCPU_(INPUT);
- if (X.template IsType<T>()) {
- return RunOnDeviceWithOrderNCHWAndType_<T>();
- } else {
- assert(X.template IsType<float>());
- return RunOnDeviceWithOrderNCHWAndType_<float>();
- }
-}
-
-template <typename T, bool ReluFused>
-bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNHWC() {
- const Tensor& X = InputTensorCPU_(INPUT);
- if (X.template IsType<T>()) {
- return RunOnDeviceWithOrderNHWCAndType_<T>();
- } else {
- assert(X.template IsType<float>());
- return RunOnDeviceWithOrderNHWCAndType_<float>();
- }
-}
-
-template <typename T, bool ReluFused>
bool ConvDNNLowPOp<T, ReluFused>::TakeDepthWise3x3FastPath_() {
const Tensor& X = InputTensorCPU_(INPUT);
return StorageOrder::NHWC == ConvPoolOpBase<CPUContext>::order_ &&
- is_same<T, uint8_t>::value && X.template IsType<T>() && !Acc16() &&
+ is_same<T, uint8_t>::value && !Acc16() &&
group_ == X.dim32(X.dim() - 1) && group_ % 8 == 0 &&
this->kernel_.size() == 2 && kernel_h() == 3 && kernel_w() == 3 &&
stride_h() == stride_w() && (stride_h() == 1 || stride_h() == 2) &&
dilation_h() == 1 && dilation_w() == 1 && pad_t() == 1 && pad_b() == 1 &&
- pad_l() == 1 && pad_r() == 1 && !dequantize_output_ &&
- GetCpuId().avx2() && !quantize_groupwise_;
+ pad_l() == 1 && pad_r() == 1 && GetCpuId().avx2() && !quantize_groupwise_;
}
template <typename T, bool ReluFused>
bool ConvDNNLowPOp<T, ReluFused>::TakeDepthWise3x3x3FastPath_() {
const Tensor& X = InputTensorCPU_(INPUT);
bool ret = StorageOrder::NHWC == ConvPoolOpBase<CPUContext>::order_ &&
- is_same<T, uint8_t>::value && X.template IsType<T>() && !Acc16() &&
+ is_same<T, uint8_t>::value && !Acc16() &&
group_ == X.dim32(X.dim() - 1) && group_ % 8 == 0 &&
this->kernel_.size() == 3 && this->kernel_[0] == 3 &&
this->kernel_[1] == 3 && this->kernel_[2] == 3 &&
this->dilation_[2] == 1 &&
accumulate(
this->pads_.begin(), this->pads_.end(), 1, multiplies<int>()) == 1 &&
- !dequantize_output_ && GetCpuId().avx2() && !quantize_groupwise_;
+ GetCpuId().avx2() && !quantize_groupwise_;
return ret;
}
return true;
}
- const Tensor& X = InputTensorCPU_(INPUT);
- if (Wq_packed_ && X.template IsType<T>() &&
+ if (Wq_packed_ &&
accumulate(
this->dilation_.begin(),
this->dilation_.end(),
1,
multiplies<int>()) == 1) {
+ // im2col fusion
return true;
}
// Quantize bias
if (InputSize() == 3 &&
- ((!b_quantized_data_ && !b_dequantized_data_) ||
+ (!b_quantized_data_ ||
in_qparams_[INPUT].scale != in_qparams_scale_old_)) {
if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER) &&
this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER)
.bias.get()) {
const auto& packed_filter =
this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
- CAFFE_ENFORCE(!dequantize_output_);
b_quantized_ = packed_filter.bias;
b_quantized_data_ = b_quantized_->data();
} else {
1e-4);
CAFFE_ENFORCE_EQ(bias_qparams.zero_point, 0);
b_quantized_data_ = bias.template data<int32_t>();
- if (dequantize_output_) {
- b_dequantized_.resize(bias.numel());
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
- for (int i = 0; i < b_dequantized_.size(); ++i) {
- b_dequantized_[i] =
- fbgemm::Dequantize<int32_t>(b_quantized_data_[i], bias_qparams);
- }
- b_dequantized_data_ = b_dequantized_.data();
- }
} else {
- b_dequantized_data_ = bias.template data<float>();
- if (!dequantize_output_) {
- b_quantized_->resize(bias.numel());
- for (int g = 0; g < filter_qparams_.size(); ++g) {
- int i_begin = g * (M / filter_qparams_.size());
- int i_end = i_begin + (M / filter_qparams_.size());
- for (int i = i_begin; i < i_end; ++i) {
- (*b_quantized_)[i] = fbgemm::Quantize<int32_t>(
- b_dequantized_data_[i],
- 0,
- in_qparams_[INPUT].scale * FilterQuantizationParams(g).scale,
- 32,
- true /* signed */);
- }
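+      // The bias comes in as float: quantize it to int32 with zero_point 0 and
+      // scale = input_scale * filter_scale of its quantization group.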
+ const float* b_data = bias.template data<float>();
+ b_quantized_->resize(bias.numel());
+ for (int g = 0; g < filter_qparams_.size(); ++g) {
+ int i_begin = g * (M / filter_qparams_.size());
+ int i_end = i_begin + (M / filter_qparams_.size());
+ for (int i = i_begin; i < i_end; ++i) {
+ (*b_quantized_)[i] = fbgemm::Quantize<int32_t>(
+ b_data[i],
+ 0,
+ in_qparams_[INPUT].scale * FilterQuantizationParams(g).scale,
+ 32,
+ true /* signed */);
}
- b_quantized_data_ = b_quantized_->data();
}
+ b_quantized_data_ = b_quantized_->data();
}
in_qparams_scale_old_ = in_qparams_[INPUT].scale;
}
- CAFFE_ENFORCE(
- (dequantize_output_ && b_dequantized_data_) ||
- (!dequantize_output_ && b_quantized_data_));
+ CAFFE_ENFORCE(b_quantized_data_);
}
}
using namespace dnnlowp;
if (!this->arguments_parsed_) {
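+    // dequantize_output is parsed only so it can be rejected below; Conv DNNLOWP
+    // operators always produce a quantized output tensor.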
+ bool dequantize_output;
ParseDNNLowPOperatorArguments(
- this, &dequantize_output_, &measure_quantization_error_, &followed_by_);
+ this, &dequantize_output, &measure_quantization_error_, &followed_by_);
+ CAFFE_ENFORCE_EQ(
+ dequantize_output,
+ false,
+ "Conv DNNLOWP operators don't support dequantize_output");
if (ReluFused) {
// It's actually fused with Relu not followed by but setting this to make
QuantizeBias_();
bool fp32_executed = false;
- if (!dequantize_output_) {
- if (HasStaticQuantization(this)) {
- out_qparams_ = GetStaticQuantizationParamsOf(this, 0);
- } else {
- // If quantization parameters are not chosen beforehand, run reference
- // Conv op in fp32 to choose quantization for Y.
- Fp32Op_()->DequantizeInput();
- Fp32Op_()->Get()->RunOnDevice();
- out_qparams_ = Fp32Op_()->GetOutputQuantizationParams(qfactory_.get());
- fp32_executed = true;
- }
+ if (HasStaticQuantization(this)) {
+ out_qparams_ = GetStaticQuantizationParamsOf(this, 0);
+ } else {
+ // If quantization parameters are not chosen beforehand, run reference
+ // Conv op in fp32 to choose quantization for Y.
+ Fp32Op_()->DequantizeInput();
+ Fp32Op_()->Get()->RunOnDevice();
+ out_qparams_ = Fp32Op_()->GetOutputQuantizationParams(qfactory_.get());
+ fp32_executed = true;
+ }
- for (int g = 0; g < filter_qparams_.size(); ++g) {
- float real_multiplier = in_qparams_[INPUT].scale *
- FilterQuantizationParams(g).scale / out_qparams_.scale;
- requantization_params_[g] = qfactory_->ChooseRequantizationMultiplier(
- real_multiplier, out_qparams_);
- requantization_multipliers_[g] =
- requantization_params_[g].real_multiplier;
- }
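+  // The requantization multiplier rescales the int32 accumulator, whose effective
+  // scale is input_scale * filter_scale, to the output scale for each quantization group.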
+ for (int g = 0; g < filter_qparams_.size(); ++g) {
+ float real_multiplier = in_qparams_[INPUT].scale *
+ FilterQuantizationParams(g).scale / out_qparams_.scale;
+ requantization_params_[g] = qfactory_->ChooseRequantizationMultiplier(
+ real_multiplier, out_qparams_);
+ requantization_multipliers_[g] = requantization_params_[g].real_multiplier;
}
if (measure_quantization_error_ && Fp32Op_() && !fp32_executed) {
}
template <typename T, bool ReluFused>
-template <typename OutType>
void ConvDNNLowPOp<T, ReluFused>::RunOnDeviceEpilogueNCHW_(
- const T* col_buffer_quantized_data,
+ const T* col_buffer_data,
int32_t* Y_int32,
- OutType* Y_data,
+ T* Y_data,
size_t i_offset,
int group_id) {
auto& filter = InputTensorCPU_(FILTER);
for (int j = 0; j < Y_HxW; ++j) {
int sum = 0;
for (int k = 0; k < kernel_dim; ++k) {
- sum += col_buffer_quantized_data[k * Y_HxW + j];
+ sum += col_buffer_data[k * Y_HxW + j];
}
column_offsets[j] = sum * filter_qparams.zero_point;
}
- if (dequantize_output_) {
- for (int i = 0; i < M / group_; ++i) {
- int32_t row_offset = row_offsets_[i_offset + i];
- row_offset *= -in_qparams_[INPUT].zero_point;
- for (int j = 0; j < Y_HxW; ++j) {
- int32_t raw = Y_int32[i * Y_HxW + j] + row_offset - column_offsets[j];
- float dequantized =
- raw * in_qparams_[INPUT].scale * filter_qparams.scale;
- if (InputSize() == 3) {
- dequantized += b_dequantized_data_[i_offset + i];
- }
- if (ReluFused) {
- dequantized = std::max(0.f, dequantized);
- }
- Y_data[i * Y_HxW + j] = dequantized;
- }
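+  // Compensate each output row for the input zero point via the row offset, add the
+  // quantized bias, and requantize the int32 result to T.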
+ for (int i = 0; i < M / group_; ++i) {
+ int32_t row_offset = row_offsets_[i_offset + i];
+ row_offset *= -in_qparams_[INPUT].zero_point;
+ if (InputSize() == 3) {
+ row_offset += b_quantized_data_[i_offset + i];
}
- } // dequantize_output_
- else {
- for (int i = 0; i < M / group_; ++i) {
- int32_t row_offset = row_offsets_[i_offset + i];
- row_offset *= -in_qparams_[INPUT].zero_point;
- if (InputSize() == 3) {
- row_offset += b_quantized_data_[i_offset + i];
- }
- for (int j = 0; j < Y_HxW; ++j) {
- int32_t raw = Y_int32[i * Y_HxW + j] + row_offset - column_offsets[j];
- if (ReluFused) {
- raw = std::max(0, raw);
- }
- Y_data[i * Y_HxW + j] =
- fbgemm::Requantize<T>(raw, RequantizationParams(group_id));
+ for (int j = 0; j < Y_HxW; ++j) {
+ int32_t raw = Y_int32[i * Y_HxW + j] + row_offset - column_offsets[j];
+ if (ReluFused) {
+ raw = std::max(0, raw);
}
+ Y_data[i * Y_HxW + j] =
+ fbgemm::Requantize<T>(raw, RequantizationParams(group_id));
}
- } // !dequantize_output_
+ }
}
template <typename T, bool ReluFused>
-template <typename InType>
-bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNCHWAndType_() {
+bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNCHW() {
VLOG(2) << "Running DNNLOWP Conv";
using namespace dnnlowp;
  // The col buffer is also stored in CHW order: kernel_dim, then the output
  // height and width.
- const InType* Xdata = X.template data<InType>();
+ const T* Xdata = X.template data<T>();
// We must not call mutable_data inside omp region
- float* Y_data_float = nullptr;
- T* Y_data_T = nullptr;
- if (dequantize_output_) {
- Y_data_float = Y->template mutable_data<float>();
- } else {
- Y_data_T = Y->template mutable_data<T>();
- }
+ T* Y_data_T = Y->template mutable_data<T>();
column_offsets_->resize(Y_HxW * dnnlowp_get_max_threads());
auto f = [&](Tensor* col_buffer) {
col_buffer->Resize(buffer_shape);
vector<int> buffer_shape_per_thread(
buffer_shape.begin() + 1, buffer_shape.end());
- InType* col_buffer_data = col_buffer->template mutable_data<InType>();
+ T* col_buffer_data = col_buffer->template mutable_data<T>();
auto f2 = [&](vector<int32_t>* Y_int32) {
Y_int32->resize(M * Y_HxW * dnnlowp_get_max_threads());
int tid = dnnlowp_get_thread_num();
for (int group_id = 0; group_id < group_; ++group_id) {
if (this->kernel_.size() == 2) {
- math::Im2ColNCHW<InType>(
+ math::Im2ColNCHW<T>(
C / group_,
input_dims[0],
input_dims[1],
Xdata + (group_ * image_id + group_id) * input_offset,
col_buffer_data + tid * col_buffer_size,
&context_,
- X.IsType<T>() ? in_qparams_[INPUT].zero_point : 0);
+ in_qparams_[INPUT].zero_point);
} else {
- math::Im2ColNdNCHW<InType>(
+ math::Im2ColNdNCHW<T>(
this->kernel_.size(),
C * X_HxW,
col_buffer_size,
Xdata + (group_ * image_id + group_id) * input_offset,
col_buffer_data + tid * col_buffer_size,
&context_,
- X.IsType<T>() ? in_qparams_[INPUT].zero_point : 0);
+ in_qparams_[INPUT].zero_point);
}
-      // quantize col_buffer
+      // col_buffer is already quantized; take this thread's slice of it
- T* col_buffer_quantized_data = nullptr;
- vector<T> col_buffer_quantized;
- if (X.template IsType<T>()) {
- col_buffer_quantized_data =
- reinterpret_cast<T*>(col_buffer_data) + tid * col_buffer_size;
- } else {
- col_buffer_quantized.resize(kernel_dim * Y_HxW);
- fbgemm::Quantize<T>(
- (const float*)col_buffer_data + tid * col_buffer_size,
- col_buffer_quantized.data(),
- col_buffer_quantized.size(),
- in_qparams_[INPUT]);
- col_buffer_quantized_data = col_buffer_quantized.data();
- }
+ T* col_buffer_private = col_buffer_data + tid * col_buffer_size;
int32_t* Y_int32_temp =
Y_int32->data() + ((M / group_) * group_id + M * tid) * Y_HxW;
int32_t sum = 0;
for (int k = 0; k < kernel_dim; ++k) {
int w = W_quantized_group[i * kernel_dim + k];
- int x = col_buffer_quantized_data[k * Y_HxW + j];
+ int x = col_buffer_private[k * Y_HxW + j];
sum += w * x;
}
Y_int32_temp[i * Y_HxW + j] = sum;
} // j
} // i
- if (dequantize_output_) {
- RunOnDeviceEpilogueNCHW_(
- col_buffer_quantized_data,
- Y_int32_temp,
- Y_data_float + (M * image_id + M / group_ * group_id) * Y_HxW,
- M / group_ * group_id,
- group_id);
- } else {
- RunOnDeviceEpilogueNCHW_(
- col_buffer_quantized_data,
- Y_int32_temp,
- Y_data_T + (M * image_id + M / group_ * group_id) * Y_HxW,
- M / group_ * group_id,
- group_id);
- }
+ RunOnDeviceEpilogueNCHW_(
+ col_buffer_private,
+ Y_int32_temp,
+ Y_data_T + (M * image_id + M / group_ * group_id) * Y_HxW,
+ M / group_ * group_id,
+ group_id);
} // for each group
} // for each image_id
- if (!dequantize_output_) {
- PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
- }
+ PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
MeasureQuantizationError_();
}; // f2
}
return true;
-} // RunOnDeviceWithOrderNCHWAndType_
+} // RunOnDeviceWithOrderNCHW
template <typename T, bool ReluFused>
void ConvDNNLowPOp<T, ReluFused>::RunOnDeviceEpilogueNHWC_(
- const T* col_buffer_quantized_data,
+ const T* col_buffer_data,
int32_t* Y_int32) {
const Tensor& X = InputTensorCPU_(INPUT);
auto& filter = InputTensorCPU_(FILTER);
// Adjust with bias and zero_point and then requantize
  // See batch_matmul_dnnlowp_op.cc for why we compute column_offsets,
  // row_offset, and const_offset in this way.
- if (dequantize_output_) {
- float* Ydata = Y->template mutable_data<float>();
+ int32_t A_zero_point = in_qparams_[INPUT].zero_point;
+
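+  // Without statically provided output quantization parameters, first compute the
+  // range of the raw int32 results and choose the output parameters dynamically.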
+ if (!dnnlowp::HasStaticQuantization(this)) {
+ if (quantize_groupwise_) {
+ static int log_occurences = 0;
+ if (log_occurences < 32) {
+ ++log_occurences;
+ LOG(WARNING) << "Cannot do group-wise quantization without "
+ "static quantization of activations for "
+ << OperatorBase::debug_def().output(0);
+ }
+ }
+
+ int32_t Y_min = numeric_limits<int32_t>::max();
+ int32_t Y_max = numeric_limits<int32_t>::min();
#ifdef _OPENMP
-#pragma omp parallel for
+#pragma omp parallel for reduction(min : Y_min), reduction(max : Y_max)
#endif
for (int i = 0; i < N * Y_HxW; ++i) {
for (int group_id = 0; group_id < group_; ++group_id) {
int32_t row_offset = 0;
for (int k = 0; k < kernel_dim; ++k) {
- row_offset += col_buffer_quantized_data
- [(i * group_ + group_id) * kernel_dim + k];
+ row_offset +=
+ col_buffer_data[(i * group_ + group_id) * kernel_dim + k];
}
- row_offset *= FilterQuantizationParams(group_id).zero_point;
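+        // Group-wise quantization is not supported in this dynamic path (see the
+        // warning above), so the group-0 filter zero point is applied to all groups.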
+ row_offset *= FilterQuantizationParams(0).zero_point;
for (int j = group_id * (M / group_); j < (group_id + 1) * (M / group_);
++j) {
- Y_int32[i * M + j] -=
- in_qparams_[INPUT].zero_point * (*column_offsets_)[j] +
- row_offset;
- Ydata[i * M + j] = Y_int32[i * M + j] * in_qparams_[INPUT].scale *
- FilterQuantizationParams(group_id).scale +
- ((InputSize() == 3) ? b_dequantized_data_[j] : 0.f);
- if (ReluFused) {
- Ydata[i * M + j] = std::max(Ydata[i * M + j], 0.f);
+ int32_t raw = Y_int32[i * M + j] -
+ A_zero_point * (*column_offsets_)[j] - row_offset;
+ if (b_quantized_data_) {
+ raw += b_quantized_data_[j];
}
+ Y_min = std::min(Y_min, raw);
+ Y_max = std::max(Y_max, raw);
}
} // for each group
- } // for each i
- } else {
- int32_t A_zero_point = in_qparams_[INPUT].zero_point;
-
- if (!dnnlowp::HasStaticQuantization(this)) {
- if (quantize_groupwise_) {
- static int log_occurences = 0;
- if (log_occurences < 32) {
- ++log_occurences;
- LOG(WARNING) << "Cannot do group-wise quantization without "
- "static quantization of activations for "
- << OperatorBase::debug_def().output(0);
- }
- }
-
- int32_t Y_int32_min = numeric_limits<int32_t>::max();
- int32_t Y_int32_max = numeric_limits<int32_t>::min();
-
-#ifdef _OPENMP
-#pragma omp parallel for reduction(min \
- : Y_int32_min), \
- reduction(max \
- : Y_int32_max)
-#endif
- for (int i = 0; i < N * Y_HxW; ++i) {
- for (int group_id = 0; group_id < group_; ++group_id) {
- int32_t row_offset = 0;
- for (int k = 0; k < kernel_dim; ++k) {
- row_offset += col_buffer_quantized_data
- [(i * group_ + group_id) * kernel_dim + k];
- }
- row_offset *= FilterQuantizationParams(0).zero_point;
-
- for (int j = group_id * (M / group_);
- j < (group_id + 1) * (M / group_);
- ++j) {
- int32_t raw = Y_int32[i * M + j] -
- A_zero_point * (*column_offsets_)[j] - row_offset;
- if (b_quantized_data_) {
- raw += b_quantized_data_[j];
- }
- Y_int32_min = std::min(Y_int32_min, raw);
- Y_int32_max = std::max(Y_int32_max, raw);
- }
- } // for each group
- } // for each row i
+ } // for each row i
- if (ReluFused) {
- Y_int32_min = std::max(0, Y_int32_min);
- Y_int32_max = std::max(0, Y_int32_max);
- }
+ if (ReluFused) {
+ Y_min = std::max(0, Y_min);
+ Y_max = std::max(0, Y_max);
+ }
- float Y_int32_scale =
- in_qparams_[INPUT].scale * FilterQuantizationParams(0).scale;
- out_qparams_ = qfactory_->ChooseQuantizationParams(
- Y_int32_scale * Y_int32_min, Y_int32_scale * Y_int32_max);
+ float Y_scale =
+ in_qparams_[INPUT].scale * FilterQuantizationParams(0).scale;
+ out_qparams_ =
+ qfactory_->ChooseQuantizationParams(Y_scale * Y_min, Y_scale * Y_max);
- float real_multiplier = Y_int32_scale / out_qparams_.scale;
- requantization_params_[0] = qfactory_->ChooseRequantizationMultiplier(
- real_multiplier, out_qparams_);
- requantization_multipliers_[0] =
- requantization_params_[0].real_multiplier;
- }
+ float real_multiplier = Y_scale / out_qparams_.scale;
+ requantization_params_[0] = qfactory_->ChooseRequantizationMultiplier(
+ real_multiplier, out_qparams_);
+ requantization_multipliers_[0] = requantization_params_[0].real_multiplier;
+ }
- int32_t C_zero_point = out_qparams_.zero_point;
+ int32_t C_zero_point = out_qparams_.zero_point;
- T* Ydata = Y->template mutable_data<T>();
+ T* Ydata = Y->template mutable_data<T>();
- using namespace fbgemm;
- if (is_same<T, uint8_t>::value && GetCpuId().avx2()) {
+ using namespace fbgemm;
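+  // Requantize the int32 accumulators to T, compensating for the input and filter
+  // zero points via row and column offsets and adding the quantized bias if present.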
+ if (is_same<T, uint8_t>::value && GetCpuId().avx2()) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
- for (int i = 0; i < N * Y_HxW; ++i) {
- for (int group_id = 0; group_id < group_; ++group_id) {
- int32_t row_offset;
- row_offsets_u8acc32_ref(
- 1,
- kernel_dim,
- group_ * kernel_dim,
- reinterpret_cast<const uint8_t*>(
- col_buffer_quantized_data +
- (i * group_ + group_id) * kernel_dim),
- &row_offset);
-
- int32_t B_zero_point = FilterQuantizationParams(group_id).zero_point;
- float C_multiplier = RequantizationParams(group_id).real_multiplier;
-
- requantize_u8acc32_ref(
- 1,
- M / group_,
- M,
- Y_int32 + i * M + group_id * (M / group_),
- reinterpret_cast<uint8_t*>(
- Ydata + i * M + group_id * (M / group_)),
- &C_multiplier,
- C_zero_point,
- A_zero_point,
- &B_zero_point,
- &row_offset,
- column_offsets_->data() + group_id * (M / group_),
- b_quantized_data_ ? b_quantized_data_ + group_id * (M / group_)
- : nullptr,
- M / group_,
- ReluFused);
- } // for each group
- } // for each row i
- } else {
+ for (int i = 0; i < N * Y_HxW; ++i) {
+ for (int group_id = 0; group_id < group_; ++group_id) {
+ int32_t row_offset;
+ row_offsets_u8acc32_ref(
+ 1,
+ kernel_dim,
+ group_ * kernel_dim,
+ reinterpret_cast<const uint8_t*>(
+ col_buffer_data + (i * group_ + group_id) * kernel_dim),
+ &row_offset);
+
+ int32_t B_zero_point = FilterQuantizationParams(group_id).zero_point;
+ float C_multiplier = RequantizationParams(group_id).real_multiplier;
+
+ requantize_u8acc32_ref(
+ 1,
+ M / group_,
+ M,
+ Y_int32 + i * M + group_id * (M / group_),
+ reinterpret_cast<uint8_t*>(Ydata + i * M + group_id * (M / group_)),
+ &C_multiplier,
+ C_zero_point,
+ A_zero_point,
+ &B_zero_point,
+ &row_offset,
+ column_offsets_->data() + group_id * (M / group_),
+ b_quantized_data_ ? b_quantized_data_ + group_id * (M / group_)
+ : nullptr,
+ M / group_,
+ ReluFused);
+ } // for each group
+ } // for each row i
+ } else {
#ifdef _OPENMP
#pragma omp parallel for
#endif
- for (int i = 0; i < N * Y_HxW; ++i) {
- for (int group_id = 0; group_id < group_; ++group_id) {
- int32_t B_zero_point = FilterQuantizationParams(group_id).zero_point;
- int32_t row_offset = 0;
- for (int k = 0; k < kernel_dim; ++k) {
- row_offset += col_buffer_quantized_data
- [(i * group_ + group_id) * kernel_dim + k];
- }
- row_offset *= B_zero_point;
+ for (int i = 0; i < N * Y_HxW; ++i) {
+ for (int group_id = 0; group_id < group_; ++group_id) {
+ int32_t B_zero_point = FilterQuantizationParams(group_id).zero_point;
+ int32_t row_offset = 0;
+ for (int k = 0; k < kernel_dim; ++k) {
+ row_offset +=
+ col_buffer_data[(i * group_ + group_id) * kernel_dim + k];
+ }
+ row_offset *= B_zero_point;
- for (int j = group_id * (M / group_);
- j < (group_id + 1) * (M / group_);
- ++j) {
- int32_t raw = Y_int32[i * M + j] -
- A_zero_point * (*column_offsets_)[j] - row_offset;
- if (b_quantized_data_) {
- raw += b_quantized_data_[j];
- }
+ for (int j = group_id * (M / group_); j < (group_id + 1) * (M / group_);
+ ++j) {
+ int32_t raw = Y_int32[i * M + j] -
+ A_zero_point * (*column_offsets_)[j] - row_offset;
+ if (b_quantized_data_) {
+ raw += b_quantized_data_[j];
+ }
+ Ydata[i * M + j] =
+ fbgemm::Requantize<T>(raw, RequantizationParams(group_id));
+ if (ReluFused) { // static if
Ydata[i * M + j] =
- fbgemm::Requantize<T>(raw, RequantizationParams(group_id));
- if (ReluFused) { // static if
- Ydata[i * M + j] =
- std::max<int32_t>(C_zero_point, Ydata[i * M + j]);
- }
+ std::max<int32_t>(C_zero_point, Ydata[i * M + j]);
}
- } // for each group
- } // for each row i
- } // !__AVX2__
+ }
+ } // for each group
+ } // for each row i
+  } // slow path (not uint8_t or no AVX2)
- dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
- }
+ dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
}
template <typename T, bool ReluFused>
}
template <typename T, bool ReluFused>
-template <typename InType>
-const InType* ConvDNNLowPOp<T, ReluFused>::Im2ColNHWC_(Tensor* col_buffer) {
+const T* ConvDNNLowPOp<T, ReluFused>::Im2ColNHWC_(Tensor* col_buffer) {
const Tensor& X = InputTensorCPU_(INPUT);
Tensor* Y = OutputTensorCPU_(0);
int ndim = X.dim();
const int input_offset = X_HxW * C;
const int Y_HxW = this->GetDimsSize(*Y);
- const InType* Xdata = X.template data<InType>();
+ const T* Xdata = X.template data<T>();
vector<int> buffer_shape(ndim);
for (auto i = 0; i < ndim - 1; ++i) {
col_buffer->Resize(buffer_shape);
- InType* col_buffer_data = col_buffer->template mutable_data<InType>();
+ T* col_buffer_data = col_buffer->template mutable_data<T>();
#ifdef _OPENMP
#pragma omp parallel for if (N > 1)
#endif
for (int image_id = 0; image_id < N; ++image_id) {
if (this->kernel_.size() <= 2) {
- math::Im2ColNHWC<InType>(
+ math::Im2ColNHWC<T>(
C,
X.dim32(1),
this->kernel_.size() == 2 ? X.dim32(2) : 1,
col_buffer_data + image_id * group_ * kernel_dim * Y_HxW,
&context_,
group_,
- X.IsType<T>() ? in_qparams_[INPUT].zero_point : 0);
+ in_qparams_[INPUT].zero_point);
} else {
- math::Im2Col3DNHWC<InType>(
+ math::Im2Col3DNHWC<T>(
C,
X.dim32(1), // num_frames
X.dim32(2), // H
col_buffer_data + image_id * group_ * kernel_dim * Y_HxW,
&context_,
group_,
- X.IsType<T>() ? in_qparams_[INPUT].zero_point : 0);
+ in_qparams_[INPUT].zero_point);
}
}
- return col_buffer->template data<InType>();
+ return col_buffer->template data<T>();
}
template <typename T, typename T_signed>
void ConvDNNLowPOp<T, ReluFused>::DispatchFBGEMM_(
PackAMatrix& packA,
vector<int32_t>* Y_int32,
- uint8_t* Y_uint8_data,
- float* Y_float_data) {
+ uint8_t* Y_uint8_data) {
+ // This function is called within an OpenMP region
auto& filter = InputTensorCPU_(FILTER);
const int M = filter.dim32(0);
int tid = dnnlowp_get_thread_num();
using namespace fbgemm;
- if (Y_uint8_data) {
- DoNothing<> doNothingObj{};
- ReQuantizeOutput<ReluFused, Q_GRAN> outputProcObj(
- doNothingObj,
- requantization_multipliers_.data(),
- out_qparams_.zero_point,
- in_qparams_[INPUT].zero_point,
- filter_zero_points_.data(),
- packA.getRowOffsetBuffer(),
- column_offsets_->data(),
- InputSize() == 3 ? b_quantized_data_ : nullptr,
- M,
- group_);
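+  // Fused output pipeline: compensates the input and filter zero points via the
+  // row and column offsets, adds the quantized bias, requantizes to uint8_t, and
+  // applies ReLU when fused.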
+ DoNothing<> doNothingObj{};
+ ReQuantizeOutput<ReluFused, Q_GRAN> outputProcObj(
+ doNothingObj,
+ requantization_multipliers_.data(),
+ out_qparams_.zero_point,
+ in_qparams_[INPUT].zero_point,
+ filter_zero_points_.data(),
+ packA.getRowOffsetBuffer(),
+ column_offsets_->data(),
+ InputSize() == 3 ? b_quantized_data_ : nullptr,
+ M,
+ group_);
- fbgemmPacked(
- packA,
- *Wq_packed_,
- Y_uint8_data,
- Y_int32->data(),
- M,
- outputProcObj,
- tid,
- nthreads);
- } else {
- DoNothing<float, float> doNothingObj{};
- ReQuantizeForFloat<ReluFused, Q_GRAN> outputProcObj(
- doNothingObj,
- in_qparams_[INPUT].scale,
- filter_scales_.data(),
- in_qparams_[INPUT].zero_point,
- filter_zero_points_.data(),
- packA.getRowOffsetBuffer(),
- column_offsets_->data(),
- InputSize() == 3 ? b_dequantized_data_ : nullptr,
- M,
- group_);
-
- fbgemmPacked(
- packA,
- *Wq_packed_,
- Y_float_data,
- reinterpret_cast<int32_t*>(Y_float_data),
- M,
- outputProcObj,
- tid,
- nthreads);
- }
+ fbgemmPacked(
+ packA,
+ *Wq_packed_,
+ Y_uint8_data,
+ Y_int32->data(),
+ M,
+ outputProcObj,
+ tid,
+ nthreads);
}
template <typename T, bool ReluFused>
-template <typename InType>
void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
- const InType* col_buffer_data,
- const T* col_buffer_quantized_data,
+ const T* col_buffer_data,
vector<int32_t>* Y_int32) {
const Tensor& X = InputTensorCPU_(INPUT);
auto& filter = InputTensorCPU_(FILTER);
StoreMatrixInMatrixMarketFormat(
N * Y_HxW * group_,
kernel_dim,
- col_buffer_quantized_data,
+ col_buffer_data,
OperatorBase::debug_def().input(INPUT));
// Dump weight
}
if (TakeDepthWise3x3x3FastPath_()) {
- const InType* Xdata = X.template data<InType>();
+ const T* Xdata = X.template data<T>();
uint8_t* Y_uint8_data =
OutputTensorCPU_(0)->template mutable_data<uint8_t>();
return;
} else if (TakeDepthWise3x3FastPath_()) {
const int H = X.dim32(1), W = X.dim32(2);
- const InType* Xdata = X.template data<InType>();
+ const T* Xdata = X.template data<T>();
uint8_t* Y_uint8_data =
OutputTensorCPU_(0)->template mutable_data<uint8_t>();
using namespace fbgemm;
int row_offset_size_per_thread = -1;
int x_pack_buf_size_per_thread = -1;
- bool fuse_im2col = Wq_packed_ && X.template IsType<T>() &&
- X.template data<T>() == col_buffer_quantized_data && !IsConvGEMM_();
+ bool fuse_im2col =
+ Wq_packed_ && X.template data<T>() == col_buffer_data && !IsConvGEMM_();
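+  // When no separate im2col was performed, col_buffer_data aliases X, so fbgemm
+  // can fuse im2col into the packing of the A matrix.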
if (Wq_packed_) {
if (fuse_im2col) {
row_offset_size_per_thread =
X_pack_buf_.resize(dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
}
- uint8_t* Y_uint8_data = nullptr;
- float* Y_float_data = nullptr;
- if (dequantize_output_) {
- // Output is float
- Y_float_data = Y->template mutable_data<float>();
- } else {
- // Output is uint8_t
- Y_uint8_data = Y->template mutable_data<uint8_t>();
- }
+ uint8_t* Y_uint8_data = Y->template mutable_data<uint8_t>();
if (Wq_packed_)
#ifdef _OPENMP
PackAWithIm2Col<uint8_t> packA(
conv_p,
- reinterpret_cast<const uint8_t*>(col_buffer_quantized_data),
+ reinterpret_cast<const uint8_t*>(col_buffer_data),
// buffer for packed matrix
X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
in_qparams_[INPUT].zero_point,
if (quantize_groupwise_) {
DispatchFBGEMM_<
PackAWithIm2Col<uint8_t>,
- QuantizationGranularity::GROUP>(
- packA, Y_int32, Y_uint8_data, Y_float_data);
+ QuantizationGranularity::GROUP>(packA, Y_int32, Y_uint8_data);
} else {
DispatchFBGEMM_<
PackAWithIm2Col<uint8_t>,
- QuantizationGranularity::TENSOR>(
- packA, Y_int32, Y_uint8_data, Y_float_data);
+ QuantizationGranularity::TENSOR>(packA, Y_int32, Y_uint8_data);
}
} else {
// 3D
PackAWithIm2Col<uint8_t, int32_t, 3> packA(
conv_p,
- reinterpret_cast<const uint8_t*>(col_buffer_quantized_data),
+ reinterpret_cast<const uint8_t*>(col_buffer_data),
// buffer for packed matrix
X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
in_qparams_[INPUT].zero_point,
if (quantize_groupwise_) {
DispatchFBGEMM_<
PackAWithIm2Col<uint8_t, int32_t, 3>,
- QuantizationGranularity::GROUP>(
- packA, Y_int32, Y_uint8_data, Y_float_data);
+ QuantizationGranularity::GROUP>(packA, Y_int32, Y_uint8_data);
} else {
DispatchFBGEMM_<
PackAWithIm2Col<uint8_t, int32_t, 3>,
- QuantizationGranularity::TENSOR>(
- packA, Y_int32, Y_uint8_data, Y_float_data);
+ QuantizationGranularity::TENSOR>(packA, Y_int32, Y_uint8_data);
}
} // 3D
} else {
matrix_op_t::NoTranspose,
N * Y_HxW,
group_ * kernel_dim,
- reinterpret_cast<const uint8_t*>(col_buffer_quantized_data),
+ reinterpret_cast<const uint8_t*>(col_buffer_data),
group_ * kernel_dim,
// buffer for packed matrix
X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
if (quantize_groupwise_) {
DispatchFBGEMM_<
PackAWithRowOffset<uint8_t>,
- QuantizationGranularity::GROUP>(
- packA, Y_int32, Y_uint8_data, Y_float_data);
+ QuantizationGranularity::GROUP>(packA, Y_int32, Y_uint8_data);
} else {
DispatchFBGEMM_<
PackAWithRowOffset<uint8_t>,
- QuantizationGranularity::TENSOR>(
- packA, Y_int32, Y_uint8_data, Y_float_data);
+ QuantizationGranularity::TENSOR>(packA, Y_int32, Y_uint8_data);
}
} // no im2col fusion
} else {
N * Y_HxW,
M,
kernel_dim,
- col_buffer_quantized_data,
+ col_buffer_data,
W_quantized_.data(),
Y_int32->data());
}
}
template <typename T, bool ReluFused>
-template <typename InType>
-bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNHWCAndType_() {
+bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNHWC() {
CAFFE_ENFORCE_LE(
this->kernel_.size(),
3,
const Tensor& X = InputTensorCPU_(INPUT);
auto& filter = InputTensorCPU_(FILTER);
Tensor* Y = OutputTensorCPU_(0);
- const int N = X.dim32(0), C = X.dim32(X.dim() - 1);
+ const int C = X.dim32(X.dim() - 1);
const int G = group_;
CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
const int M = filter.dim32(0);
M % G, 0, "The number of output channels is not divisible by group.");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
- // The dimension of each kernel
- const int kernel_dim = KernelDim_();
- // The output image size is the spatial size of the output.
- const int Y_HxW = this->GetDimsSize(*Y);
+
  // The col buffer is also stored in HWC order: the output height and width, then
  // kernel_dim.
// Im2col, followed by gemm.
auto f2 = [&](Tensor* col_buffer_) {
- const InType* Xdata = X.template data<InType>();
- const InType* col_buffer_data =
- no_im2col ? Xdata : Im2ColNHWC_<InType>(col_buffer_);
+ const T* Xdata = X.template data<T>();
+ const T* col_buffer_data = no_im2col ? Xdata : Im2ColNHWC_(col_buffer_);
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
/*if (VLOG_IS_ON(3)) */ {
}
#endif
- // quantize col_buffer
- const T* col_buffer_quantized_data = nullptr;
- vector<T> col_buffer_quantized;
- if (X.template IsType<T>()) {
- col_buffer_quantized_data = reinterpret_cast<const T*>(col_buffer_data);
- } else {
- col_buffer_quantized.resize(G * kernel_dim * Y_HxW * N);
- fbgemm::Quantize<T>(
- reinterpret_cast<const float*>(col_buffer_data),
- col_buffer_quantized.data(),
- col_buffer_quantized.size(),
- in_qparams_[INPUT]);
- col_buffer_quantized_data = col_buffer_quantized.data();
- }
-
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
/*if (VLOG_IS_ON(3)) */ {
t_end = chrono::system_clock::now();
}
#endif
- ConvNHWCCore_(col_buffer_data, col_buffer_quantized_data, Y_int32);
+ ConvNHWCCore_(col_buffer_data, Y_int32);
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
/*if (VLOG_IS_ON(3)) */ {
Wq_depthwise_3x3x3_packed_) {
    // In the fbgemm fast paths, rescaling of the quantized numbers has already
    // been done, so just propagate the output quantization parameters.
- if (!dequantize_output_) {
- PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
- }
+ PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
} else {
- RunOnDeviceEpilogueNHWC_(col_buffer_quantized_data, Y_int32->data());
+ RunOnDeviceEpilogueNHWC_(col_buffer_data, Y_int32->data());
}
}; // f2
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
/*if (VLOG_IS_ON(3)) */ {
+ const int N = X.dim32(0);
+ // The dimension of each kernel
+ const int kernel_dim = KernelDim_();
+ // The output image size is the spatial size of the output.
+ const int Y_HxW = this->GetDimsSize(*Y);
+
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
LOG(INFO) << "this=" << this << " prologue: " << dt * 1e3 << " ms";