From: Jongsoo Park
Date: Thu, 4 Apr 2019 05:50:05 +0000 (-0700)
Subject: fold col offset into bias; optimize A symmetric quant (#17026)
X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~427
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fa0ad057f8b0b16500a00ec874fcaa733e0287e3;p=platform%2Fupstream%2Fpytorch.git

fold col offset into bias; optimize A symmetric quant (#17026)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17026

D14013931 did this for FC. This diff applies similar optimizations to Conv.
A subtle difference is that in FC, once we fold col_offset into bias during
the pre-processing step, we can treat everything as if A_zero_offset == 0
(symmetric quantization of A). In Conv, we can't do this because padding
still needs to use the original A_zero_offset. From the requantization point
of view, once col_offset is folded into bias, we can treat the computation
as if we were doing symmetric A quantization. But steps that involve padding,
such as im2col, im2col fused with packing, and direct conv for
depth-wise/group convolution, still need to be passed the original
A_zero_offset.

Reviewed By: jianyuh

Differential Revision: D14020276

fbshipit-source-id: c29caefd1127bbc6aff0e9d535939bb0c1ecb66c
---

diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
index f356b42..454be17 100644
--- a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
+++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
@@ -491,9 +491,9 @@ static void conv_nhwc_acc16_ref_(
 }
 
 template <bool ReluFused>
-template <fbgemm::QuantizationGranularity Q_GRAN>
+template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
 void ConvDNNLowPAcc16Op<ReluFused>::DispatchFBGEMM_(
-    fbgemm::PackAWithRowOffset<uint8_t, int16_t>& packA,
+    PackAMatrix& packA,
     const uint8_t* col_buffer_data,
     vector<int32_t>* Y_int32,
     uint8_t* Y_uint8_data) {
@@ -513,10 +513,11 @@ void ConvDNNLowPAcc16Op<ReluFused>::DispatchFBGEMM_(
       doNothingObj,
       this->requantization_multipliers_.data(),
       out_qparams_.zero_point,
-      in_qparams_[INPUT].zero_point,
+      // column_offsets_ empty means column_offsets_ are folded into bias
+      this->column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
       this->filter_zero_points_.data(),
       packA.getRowOffsetBuffer(),
-      this->column_offsets_->data(),
+      this->column_offsets_->empty() ? nullptr : this->column_offsets_->data(),
       InputSize() == 3 ? this->b_quantized_data_ : nullptr,
       M,
       group_);
@@ -670,14 +671,21 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() {
   int row_offset_size_per_thread = -1;
   int x_pack_buf_size_per_thread = -1;
   if (Wq_acc16_packed_) {
-    row_offset_size_per_thread =
-        PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
-    x_pack_buf_size_per_thread =
-        PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
-    row_offsets_.resize(
-        dnnlowp_get_max_threads() * row_offset_size_per_thread);
-    X_pack_buf_.resize(
-        dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
+    if (!this->quantize_groupwise_ && this->filter_zero_points_[0] == 0) {
+      x_pack_buf_size_per_thread =
+          PackAMatrix<uint8_t, int16_t>::packedBufferSize();
+      X_pack_buf_.resize(
+          dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
+    } else {
+      row_offset_size_per_thread =
+          PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
+      x_pack_buf_size_per_thread =
+          PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
+      row_offsets_.resize(
+          dnnlowp_get_max_threads() * row_offset_size_per_thread);
+      X_pack_buf_.resize(
+          dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
+    }
   }
 
   uint8_t* Y_uint8_data = Y->template mutable_data<uint8_t>();
@@ -692,22 +700,50 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() {
         int tid = dnnlowp_get_thread_num();
 
         // no im2col fusion
-        PackAWithRowOffset<uint8_t, int16_t> packA(
-            matrix_op_t::NoTranspose,
-            N * output_image_size,
-            group_ * kernel_dim,
-            col_buffer_data,
-            group_ * kernel_dim,
-            X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
-            group_,
-            row_offsets_.data() + tid * row_offset_size_per_thread);
-
-        if (this->quantize_groupwise_) {
-          DispatchFBGEMM_<QuantizationGranularity::GROUP>(
-              packA, col_buffer_data, Y_int32, Y_uint8_data);
-        } else {
-          DispatchFBGEMM_<QuantizationGranularity::TENSOR>(
-              packA, col_buffer_data, Y_int32, Y_uint8_data);
-        }
+        if (!this->quantize_groupwise_ && this->filter_zero_points_[0] == 0) {
+          PackAMatrix<uint8_t, int16_t> packA(
+              matrix_op_t::NoTranspose,
+              N * output_image_size,
+              group_ * kernel_dim,
+              col_buffer_data,
+              group_ * kernel_dim,
+              X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+              group_);
+
+          if (this->quantize_groupwise_) {
+            DispatchFBGEMM_<
+                PackAMatrix<uint8_t, int16_t>,
+                QuantizationGranularity::GROUP>(
+                packA, col_buffer_data, Y_int32, Y_uint8_data);
+          } else {
+            DispatchFBGEMM_<
+                PackAMatrix<uint8_t, int16_t>,
+                QuantizationGranularity::TENSOR>(
+                packA, col_buffer_data, Y_int32, Y_uint8_data);
+          }
+        } else {
+          // no im2col fusion
+          PackAWithRowOffset<uint8_t, int16_t> packA(
+              matrix_op_t::NoTranspose,
+              N * output_image_size,
+              group_ * kernel_dim,
+              col_buffer_data,
+              group_ * kernel_dim,
+              X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+              group_,
+              row_offsets_.data() + tid * row_offset_size_per_thread);
+
+          if (this->quantize_groupwise_) {
+            DispatchFBGEMM_<
+                PackAWithRowOffset<uint8_t, int16_t>,
+                QuantizationGranularity::GROUP>(
+                packA, col_buffer_data, Y_int32, Y_uint8_data);
+          } else {
+            DispatchFBGEMM_<
+                PackAWithRowOffset<uint8_t, int16_t>,
+                QuantizationGranularity::TENSOR>(
+                packA, col_buffer_data, Y_int32, Y_uint8_data);
+          }
+        }
       } else {
         // slow path
diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op.h b/caffe2/quantization/server/conv_dnnlowp_acc16_op.h
index 39dbddc..848dbd5 100644
--- a/caffe2/quantization/server/conv_dnnlowp_acc16_op.h
+++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op.h
@@ -35,9 +35,9 @@ class ConvDNNLowPAcc16Op final : public ConvDNNLowPOp<uint8_t, ReluFused> {
   bool GetQuantizationParameters_();
 
-  template <fbgemm::QuantizationGranularity Q_GRAN>
+  template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
   void DispatchFBGEMM_(
-      fbgemm::PackAWithRowOffset<uint8_t, int16_t>& packA,
+      PackAMatrix& packA,
       const std::uint8_t* col_buffer_data,
       vector<std::int32_t>* Y_int32,
       uint8_t* Y_uint8_data);
diff --git a/caffe2/quantization/server/conv_dnnlowp_op.cc b/caffe2/quantization/server/conv_dnnlowp_op.cc
index 05c788e..635a63e 100644
--- a/caffe2/quantization/server/conv_dnnlowp_op.cc
+++ b/caffe2/quantization/server/conv_dnnlowp_op.cc
@@ -198,15 +198,20 @@ bool ConvDNNLowPOp<T, ReluFused>::NoIm2ColNHWC_() {
 
 template <typename T, bool ReluFused>
 void ConvDNNLowPOp<T, ReluFused>::PreComputeRowColumnOffsets_() {
+  if (this->order_ == StorageOrder::NHWC &&
+      this->template InputIsType<int8::Int8TensorCPU>(INPUT)) {
+    // If input tensor doesn't use dynamic quantization, we fold column_offsets_
+    // into bias.
+    return;
+  }
+
   const auto& filter = InputTensorCPU_(FILTER);
   int kernel_dim = KernelDim_();
   int M = filter.dim32(0);
 
   // Pre-compute row_offset / column_offset
   vector<int32_t>& offsets =
-      StorageOrder::NCHW == ConvPoolOpBase<CPUContext>::order_
-      ? row_offsets_
-      : *column_offsets_;
+      this->order_ == StorageOrder::NCHW ? row_offsets_ : *column_offsets_;
 
   if (offsets.empty()) {
     if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
@@ -235,7 +240,8 @@ void ConvDNNLowPOp<T, ReluFused>::QuantizeBias_() {
   // Quantize bias
   if (has_bias &&
       (!b_quantized_data_ ||
-       in_qparams_[INPUT].scale != in_qparams_scale_old_)) {
+       in_qparams_[INPUT].scale != in_qparams_scale_old_ ||
+       in_qparams_[INPUT].zero_point != in_qparams_zero_point_old_)) {
     if (has_packed_bias) {
       const auto& packed_filter =
           this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
@@ -273,10 +279,73 @@
       }
       b_quantized_data_ = b_quantized_->data();
     }
-    in_qparams_scale_old_ = in_qparams_[INPUT].scale;
   }
+  in_qparams_scale_old_ = in_qparams_[INPUT].scale;
+  in_qparams_zero_point_old_ = in_qparams_[INPUT].zero_point;
 
   CAFFE_ENFORCE(b_quantized_data_);
+
+  // If column_offsets_ is empty even when we need column_offsets (asymmetric
+  // quantization in input), it means we need to fuse column_offsets to bias.
+  if (this->order_ == StorageOrder::NHWC && in_qparams_[INPUT].zero_point &&
+      column_offsets_->empty()) {
+    if (b_quantized_->empty()) {
+      b_quantized_->assign(b_quantized_data_, b_quantized_data_ + M);
+      b_quantized_data_ = b_quantized_->data();
+    }
+
+    vector<int32_t>* column_offset_ptr;
+    vector<int32_t> column_offset_temp;
+    if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
+      const auto& packed_filter =
+          this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
+      column_offset_ptr = packed_filter.column_offsets.get();
+    } else {
+      vector<TensorQuantizationParams> temp_qparams;
+      temp_qparams.push_back(in_qparams_[1]);
+      column_offset_temp.resize(M);
+      ComputeColumnOffsets<T_signed>(
+          KernelDim_(),
+          M,
+          W_quantized_.data(),
+          filter_qparams_,
+          column_offset_temp);
+      column_offset_ptr = &column_offset_temp;
+    }
+    for (int i = 0; i < M; ++i) {
+      (*b_quantized_)[i] -=
+          in_qparams_[0].zero_point * (*column_offset_ptr)[i];
+    }
+  }
+
+  if (!has_bias && this->order_ == StorageOrder::NHWC &&
+      in_qparams_[INPUT].zero_point && column_offsets_->empty() &&
+      !b_quantized_data_) {
+    // no bias but create one filling with column offset values
+    b_quantized_->resize(M, 0);
+    b_quantized_data_ = b_quantized_->data();
+
+    vector<int32_t>* column_offset_ptr;
+    vector<int32_t> column_offset_temp;
+    if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
+      const auto& packed_filter =
+          this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
+      column_offset_ptr = packed_filter.column_offsets.get();
+    } else {
+      vector<TensorQuantizationParams> temp_qparams;
+      temp_qparams.push_back(in_qparams_[1]);
+      column_offset_temp.resize(M);
+      ComputeColumnOffsets<T_signed>(
+          KernelDim_(),
+          M,
+          W_quantized_.data(),
+          filter_qparams_,
+          column_offset_temp);
+      column_offset_ptr = &column_offset_temp;
+    }
+    for (int i = 0; i < M; ++i) {
+      (*b_quantized_)[i] -= in_qparams_[0].zero_point * (*column_offset_ptr)[i];
+    }
+  }
 }
@@ -465,13 +534,13 @@
   QuantizeWeight_();
   PreComputeRowColumnOffsets_();
+  QuantizeBias_();
+
   if (Wq_packed_ && !FLAGS_caffe2_dnnlowp_dump_tensors) {
     // From here, W_quantized_ is not used anymore when we have Wq_packed_
     vector<T_signed>().swap(W_quantized_);
   }
 
-  QuantizeBias_();
-
   bool fp32_executed = false;
   if (HasStaticQuantization(this)) {
     out_qparams_ = GetStaticQuantizationParamsOf(this, 0);
@@ -750,8 +819,10 @@ void ConvDNNLowPOp<T, ReluFused>::RunOnDeviceEpilogueNHWC_(
         for (int j = group_id * (M / group_);
              j < (group_id + 1) * (M / group_);
              ++j) {
-          int32_t raw = Y_int32[i * M + j] -
-              A_zero_point * (*column_offsets_)[j] - row_offset;
+          int32_t raw = Y_int32[i * M + j] - row_offset;
+          if (!column_offsets_->empty()) {
+            raw -= A_zero_point * (*column_offsets_)[j];
+          }
           if (b_quantized_data_) {
             raw += b_quantized_data_[j];
           }
@@ -808,10 +879,12 @@
             reinterpret_cast<uint8_t*>(Ydata + i * M + group_id * (M / group_)),
             &C_multiplier,
             C_zero_point,
-            A_zero_point,
+            column_offsets_->empty() ? 0 : A_zero_point,
             &B_zero_point,
             &row_offset,
-            column_offsets_->data() + group_id * (M / group_),
+            column_offsets_->empty()
+                ? nullptr
+                : column_offsets_->data() + group_id * (M / group_),
             b_quantized_data_ ? b_quantized_data_ + group_id * (M / group_)
                               : nullptr,
             M / group_,
@@ -834,8 +907,10 @@
         for (int j = group_id * (M / group_);
              j < (group_id + 1) * (M / group_);
              ++j) {
-          int32_t raw = Y_int32[i * M + j] -
-              A_zero_point * (*column_offsets_)[j] - row_offset;
+          int32_t raw = Y_int32[i * M + j] - row_offset;
+          if (!column_offsets_->empty()) {
+            raw -= A_zero_point * (*column_offsets_)[j];
+          }
           if (b_quantized_data_) {
             raw += b_quantized_data_[j];
           }
@@ -1006,10 +1081,11 @@ void ConvDNNLowPOp<T, ReluFused>::DispatchFBGEMM_(
       doNothingObj,
       requantization_multipliers_.data(),
       out_qparams_.zero_point,
-      in_qparams_[INPUT].zero_point,
+      // column_offsets_ empty means column_offsets_ are folded into bias
+      column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
       filter_zero_points_.data(),
       packA.getRowOffsetBuffer(),
-      column_offsets_->data(),
+      column_offsets_->empty() ? nullptr : column_offsets_->data(),
       b_quantized_data_,
       M,
       group_);
@@ -1074,6 +1150,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
         this->stride_[0],
         this->stride_[1],
         this->stride_[2],
+        // Shouldn't pass 0 if column_offsets_ is empty here because we
+        // need zero_point for padding
         in_qparams_[INPUT].zero_point,
         reinterpret_cast<const uint8_t*>(Xdata),
         filter_zero_points_.data(),
@@ -1081,7 +1159,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
         requantization_multipliers_.data(),
         out_qparams_.zero_point,
         Y_uint8_data,
-        column_offsets_->data(),
+        // column_offsets_ empty means column_offsets_ are folded into bias
+        column_offsets_->empty() ? nullptr : column_offsets_->data(),
         b_quantized_data_,
         ReluFused,
         dnnlowp_get_thread_num(),
@@ -1096,6 +1175,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
         this->stride_[0],
         this->stride_[1],
         this->stride_[2],
+        // Shouldn't pass 0 if column_offsets_ is empty here because we
+        // need zero_point for padding
         in_qparams_[INPUT].zero_point,
         reinterpret_cast<const uint8_t*>(Xdata),
         FilterQuantizationParams(0).zero_point,
@@ -1103,7 +1184,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
         requantization_params_[0].real_multiplier,
         out_qparams_.zero_point,
         Y_uint8_data,
-        column_offsets_->data(),
+        // column_offsets_ empty means column_offsets_ are folded into bias
+        column_offsets_->empty() ? nullptr : column_offsets_->data(),
         b_quantized_data_,
         ReluFused,
         dnnlowp_get_thread_num(),
@@ -1130,6 +1212,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
           C,
           stride_h(),
           stride_w(),
+          // Shouldn't pass 0 if column_offsets_ is empty here because we
+          // need zero_point for padding
           in_qparams_[INPUT].zero_point,
           reinterpret_cast<const uint8_t*>(Xdata),
           filter_zero_points_.data(),
@@ -1137,7 +1219,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
           requantization_multipliers_.data(),
           out_qparams_.zero_point,
           Y_uint8_data,
-          column_offsets_->data(),
+          // column_offsets_ empty means column_offsets_ are folded into bias
+          column_offsets_->empty() ? nullptr : column_offsets_->data(),
           b_quantized_data_,
           ReluFused,
           dnnlowp_get_thread_num(),
@@ -1150,6 +1235,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
           C,
           stride_h(),
           stride_w(),
+          // Shouldn't pass 0 if column_offsets_ is empty here because we
+          // need zero_point for padding
           in_qparams_[INPUT].zero_point,
           reinterpret_cast<const uint8_t*>(Xdata),
           FilterQuantizationParams(0).zero_point,
@@ -1157,7 +1242,8 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
           requantization_params_[0].real_multiplier,
           out_qparams_.zero_point,
           Y_uint8_data,
-          column_offsets_->data(),
+          // column_offsets_ empty means column_offsets_ are folded into bias
+          column_offsets_->empty() ? nullptr : column_offsets_->data(),
           b_quantized_data_,
           ReluFused,
           dnnlowp_get_thread_num(),
@@ -1198,10 +1286,11 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
             doNothingObj,
             requantization_multipliers_.data(),
             out_qparams_.zero_point,
-            in_qparams_[INPUT].zero_point,
+            // column_offsets_ empty means column_offsets_ are folded into bias
+            column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
             filter_zero_points_.data(),
             row_offsets_.data() + tid * row_offset_size_per_thread,
-            column_offsets_->data(),
+            column_offsets_->empty() ? nullptr : column_offsets_->data(),
             b_quantized_data_,
             conv_p.OC,
             conv_p.G);
@@ -1209,6 +1298,8 @@
         fbgemmGroupwiseConv(
             conv_p,
             reinterpret_cast<const uint8_t*>(Xdata),
+            // Shouldn't pass 0 if column_offsets_ is empty here because we
+            // need zero_point for padding
            in_qparams_[INPUT].zero_point,
             row_offsets_.data() + tid * row_offset_size_per_thread,
             *Wq_gconv_packed_,
@@ -1222,12 +1313,13 @@
             doNothingObj,
             requantization_multipliers_.data(),
             out_qparams_.zero_point,
-            in_qparams_[INPUT].zero_point,
+            // column_offsets_ empty means column_offsets_ are folded into bias
+            column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point,
             filter_zero_points_.data(),
             filter_zero_points_[0]
                 ? row_offsets_.data() + tid * row_offset_size_per_thread
                 : nullptr,
-            column_offsets_->data(),
+            column_offsets_->empty() ? nullptr : column_offsets_->data(),
             b_quantized_data_,
             conv_p.OC,
             conv_p.G);
@@ -1235,6 +1327,8 @@
         fbgemmGroupwiseConv(
             conv_p,
             reinterpret_cast<const uint8_t*>(Xdata),
+            // Shouldn't pass 0 if column_offsets_ is empty here because we
+            // need zero_point for padding
             in_qparams_[INPUT].zero_point,
             filter_zero_points_[0]
                 ? row_offsets_.data() + tid * row_offset_size_per_thread
@@ -1261,6 +1355,9 @@ void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_(
       row_offset_size_per_thread =
           PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize();
       x_pack_buf_size_per_thread =
          PackAWithIm2Col<uint8_t, int16_t>::packedBufferSize();
+    } else if (!quantize_groupwise_ && filter_zero_points_[0] == 0) {
+      row_offset_size_per_thread = 0;
+      x_pack_buf_size_per_thread = PackAMatrix<uint8_t, int16_t>::packedBufferSize();
     } else {
       row_offset_size_per_thread =
           PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
@@ -1303,6 +1400,8 @@
             reinterpret_cast<const uint8_t*>(col_buffer_data),
             // buffer for packed matrix
             X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+            // Shouldn't pass 0 if column_offsets_ is empty here because we
+            // need zero_point for padding
             in_qparams_[INPUT].zero_point,
             row_offsets_.data() + tid * row_offset_size_per_thread);
 
@@ -1337,6 +1436,8 @@
             reinterpret_cast<const uint8_t*>(col_buffer_data),
             // buffer for packed matrix
             X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+            // Shouldn't pass 0 if column_offsets_ is empty here because we
+            // need zero_point for padding
             in_qparams_[INPUT].zero_point,
             row_offsets_.data() + tid * row_offset_size_per_thread);
 
@@ -1350,6 +1451,20 @@
             QuantizationGranularity::TENSOR>(packA, Y_int32, Y_uint8_data);
       }
     } // 3D
+  } else if (!quantize_groupwise_ && filter_zero_points_[0] == 0) {
+    // no im2col fusion
+    PackAMatrix<uint8_t, int16_t> packA(
+        matrix_op_t::NoTranspose,
+        N * Y_HxW,
+        group_ * kernel_dim,
+        reinterpret_cast<const uint8_t*>(col_buffer_data),
+        group_ * kernel_dim,
+        // buffer for packed matrix
+        X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+        group_);
+
+    DispatchFBGEMM_<PackAMatrix<uint8_t, int16_t>, QuantizationGranularity::TENSOR>(
+        packA, Y_int32, Y_uint8_data);
   } else {
     // no im2col fusion
     PackAWithRowOffset<uint8_t, int16_t> packA(
diff --git a/caffe2/quantization/server/conv_dnnlowp_op.h b/caffe2/quantization/server/conv_dnnlowp_op.h
index 6f6c8ea..268a4cf 100644
--- a/caffe2/quantization/server/conv_dnnlowp_op.h
+++ b/caffe2/quantization/server/conv_dnnlowp_op.h
@@ -127,7 +127,8 @@ class ConvDNNLowPOp : public ConvPoolDNNLowPOpBase<T, ConvFp32Op> {
   // pre-computed biases and offsets
   std::shared_ptr<std::vector<std::int32_t>> b_quantized_;
 
-  float in_qparams_scale_old_ = 0;
+  float in_qparams_scale_old_{0};
+  std::int32_t in_qparams_zero_point_old_{0};
 }; // class ConvDNNLowPOp
 
 } // namespace caffe2
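For reference, the identity behind the folding can be checked with a small
standalone program. The sketch below is not the Caffe2/FBGEMM code; the file
name, toy sizes, and all numeric values are made up for illustration. For an
int32 accumulation acc = sum_k A[i][k] * B[k][j], the dequantization
correction A_zero_point * col_offset[j] depends only on the output column, so
it can be subtracted from the quantized bias once during pre-processing,
after which the requantization epilogue behaves as if A were symmetrically
quantized. The same epilogue also shows the second optimization in this diff:
when the weight zero point is 0 as well, the row_offset term vanishes, which
is what allows packing the activations with PackAMatrix instead of
PackAWithRowOffset.

// fold_col_offset_demo.cc -- standalone illustration, not the Caffe2 operator.
// Build: g++ -std=c++11 fold_col_offset_demo.cc && ./a.out
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Hypothetical toy sizes and quantization parameters.
  const int M = 2, N = 3, K = 4; // A: MxK uint8 activations, B: KxN int8 weights
  const std::int32_t A_zero_point = 11; // asymmetric activation quantization
  const std::int32_t B_zero_point = 3;  // per-tensor weight zero point

  const std::vector<std::uint8_t> A = {12, 0, 255, 7, 9, 200, 3, 45};
  const std::vector<std::int8_t> B = {1, -2, 3, 4, 5, -6, 7, 8, -9, 10, 11, -12};
  const std::vector<std::int32_t> bias = {100, -50, 7};

  // col_offset[j] = sum_k B[k][j]; row_offset[i] = sum_k A[i][k]
  std::vector<std::int32_t> col_offset(N, 0), row_offset(M, 0);
  for (int k = 0; k < K; ++k)
    for (int j = 0; j < N; ++j)
      col_offset[j] += B[k * N + j];
  for (int i = 0; i < M; ++i)
    for (int k = 0; k < K; ++k)
      row_offset[i] += A[i * K + k];

  // Pre-processing step: fold the column-offset correction into the bias.
  std::vector<std::int32_t> folded_bias(N);
  for (int j = 0; j < N; ++j)
    folded_bias[j] = bias[j] - A_zero_point * col_offset[j];

  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      std::int32_t acc = 0; // raw int32 accumulation of quantized values
      for (int k = 0; k < K; ++k)
        acc += static_cast<std::int32_t>(A[i * K + k]) * B[k * N + j];

      // Original epilogue: both zero-point corrections applied per element.
      const std::int32_t ref = acc - A_zero_point * col_offset[j] -
          B_zero_point * row_offset[i] + K * A_zero_point * B_zero_point +
          bias[j];

      // Epilogue with the folded bias: the A_zero_point * col_offset[j] term
      // is gone, i.e. A is treated as if it were symmetrically quantized.
      // If B_zero_point were 0 too, row_offset would never be needed, which
      // is what enables the cheaper PackAMatrix path added in this diff.
      const std::int32_t opt = acc - B_zero_point * row_offset[i] +
          K * A_zero_point * B_zero_point + folded_bias[j];

      assert(ref == opt);
    }
  }
  return 0;
}

Note that this identity covers only the GEMM epilogue. As the commit message
says, steps that involve padding still consume the real A zero point, which
is why the diff keeps passing in_qparams_[INPUT].zero_point to the fbgemm
depthwise and groupwise kernels while passing 0 (and a null column-offset
pointer) to the requantization objects once the offsets are folded into bias.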