From 496b0b03d988ccdb242f8674f1c5e176f2bef221 Mon Sep 17 00:00:00 2001
From: Summer Deng
Date: Tue, 9 Apr 2019 21:59:33 -0700
Subject: [PATCH] amend D14778810 (#18902)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18902

The fix in D14778810 had an issue: by the time we fall back to acc32
because the outlier density is too high, W_quantized_ has already been
modified. In this diff we first just count the number of outliers
(without modifying W_quantized_), and only when the density is low
enough that no fallback is needed do we modify W_quantized_ and
construct an outlier matrix.

Reviewed By: jspark1105

Differential Revision: D14785256

fbshipit-source-id: 03933110a4ca7409686a06b18a9bb921f8657950
---
 .../quantization/server/conv_dnnlowp_acc16_op.cc   | 12 +++++++++---
 .../server/conv_dnnlowp_acc16_op_test.py           |  2 +-
 .../server/conv_groupwise_dnnlowp_acc16_op_test.py |  2 +-
 caffe2/quantization/server/fbgemm_pack_op.cc       | 22 ++++++++++++++++++----
 caffe2/quantization/server/fbgemm_pack_op.h        |  7 +++++++
 5 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
index 3bce760..2f2b51a 100644
--- a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
+++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
@@ -174,13 +174,12 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
       nbits_in_non_outlier_ < 8) {
     CAFFE_ENFORCE(!W_quantized_.empty());
 
-    Wq_outlier_.reset(ExtractOutlierMatrix(
+    int outlier_cnt = CountOutliers(
         group_,
         kernel_dim,
         num_out_channels,
         nbits_in_non_outlier_,
-        W_quantized_));
-    int outlier_cnt = Wq_outlier_->ColPtr()[num_out_channels];
+        W_quantized_);
 
     C10_LOG_FIRST_N(INFO, 10)
         << "Proportion of outlier for Conv layer with weight blob "
@@ -202,6 +201,13 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
       // We need to call GetQuantizationParameters_ again to pack for acc32
       return BaseType::GetQuantizationParameters_();
     }
+
+    Wq_outlier_.reset(ExtractOutlierMatrix(
+        group_,
+        kernel_dim,
+        num_out_channels,
+        nbits_in_non_outlier_,
+        W_quantized_));
   }
 
   bool packW = this->order_ == StorageOrder::NHWC && GetCpuId().avx2();
diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py
index 1ddf2ce..d14b7dc 100644
--- a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py
+++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py
@@ -17,7 +17,7 @@ workspace.GlobalInit(
     [
         "caffe2",
         "--caffe2_omp_num_threads=11",
         # Increase this threshold to test acc16 with randomly generated data
-        "--caffe2_dnnlowp_acc16_density_threshold=0.9",
+        "--caffe2_dnnlowp_acc16_density_threshold=0.5",
     ]
 )
diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py
index d542126..1cd91dc 100644
--- a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py
+++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py
@@ -17,7 +17,7 @@ workspace.GlobalInit(
     [
         "caffe2",
         "--caffe2_omp_num_threads=11",
         # Increase this threshold to test acc16 with randomly generated data
-        "--caffe2_dnnlowp_acc16_density_threshold=0.9",
+        "--caffe2_dnnlowp_acc16_density_threshold=0.5",
     ]
 )
diff --git a/caffe2/quantization/server/fbgemm_pack_op.cc b/caffe2/quantization/server/fbgemm_pack_op.cc
index 9e98b01..d245011 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.cc
+++ b/caffe2/quantization/server/fbgemm_pack_op.cc
@@ -118,7 +118,7 @@ template void ComputeColumnOffsets<uint8_t>(
     const vector<TensorQuantizationParams>& qparams,
     vector<int32_t>& col_offsets);
 
-fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
+int CountOutliers(
     int groups,
     int kernel_dim,
     int M,
@@ -136,6 +136,17 @@ fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
       }
     }
   }
+  return outlier_cnt;
+}
+
+fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
+    int groups,
+    int kernel_dim,
+    int M,
+    int nbits_in_non_outlier,
+    vector<int8_t>& W_quantized) {
+  int outlier_cnt =
+      CountOutliers(groups, kernel_dim, M, nbits_in_non_outlier, W_quantized);
 
   fbgemm::CompressedSparseColumn* Wq_outlier =
       new fbgemm::CompressedSparseColumn(kernel_dim, M);
@@ -163,6 +174,7 @@ fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
       }
     }
   } // for each group
+  CAFFE_ENFORCE_EQ(outlier_cnt, Wq_outlier->RowIdx().size());
   Wq_outlier->ColPtr()[M] = outlier_cnt;
 
   return Wq_outlier;
@@ -464,9 +476,8 @@ bool ConvDNNLowPPackWeightOp::RunOnDevice() {
   if (this->debug_def().engine() == "DNNLOWP_ACC16" &&
       !fallback_to_32_bit_accumulation) {
     if (nbits_in_non_outlier_ < 8) {
-      Y->W_outlier.reset(ExtractOutlierMatrix(
-          group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized));
-      int outlier_cnt = Y->W_outlier->ColPtr()[M];
+      int outlier_cnt = CountOutliers(
+          group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized);
 
       LOG(INFO) << "Proportion of outlier for Conv layer with weight blob "
                 << this->debug_def().input(0) << " is "
@@ -479,6 +490,9 @@ bool ConvDNNLowPPackWeightOp::RunOnDevice() {
                      << FLAGS_caffe2_dnnlowp_acc16_density_threshold
                      << " . Falling back to acc32";
         fallback_to_32_bit_accumulation = true;
+      } else {
+        Y->W_outlier.reset(ExtractOutlierMatrix(
+            group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized));
       }
     }
 
diff --git a/caffe2/quantization/server/fbgemm_pack_op.h b/caffe2/quantization/server/fbgemm_pack_op.h
index a059484..a2a6c9d 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.h
+++ b/caffe2/quantization/server/fbgemm_pack_op.h
@@ -82,6 +82,13 @@ void ComputeColumnOffsets(
     const vector<TensorQuantizationParams>& qparams,
     vector<int32_t>& col_offsets);
 
+int CountOutliers(
+    int groups,
+    int kernel_dim,
+    int M,
+    int nbits_in_non_outlier,
+    vector<std::int8_t>& W_quantized);
+
 /**
  * @param W_quantized input quantized weight that is not packed yet
  */
-- 
2.7.4
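
For reference, the two-pass contract this patch establishes (count first, extract only if we stay on acc16) can be sketched in isolation. The snippet below is a simplified standalone C++ illustration, not the fbgemm code: IsOutlier, the sample weights, and the hard-coded 0.5 threshold (the value the tests above pass via --caffe2_dnnlowp_acc16_density_threshold) are invented for the example, and the real ExtractOutlierMatrix builds a fbgemm::CompressedSparseColumn rather than a plain vector.

#include <cstdint>
#include <vector>

// Outlier test: true when w does not fit in nbits signed bits.
// This mirrors the role of nbits_in_non_outlier_ in the patch.
static bool IsOutlier(std::int8_t w, int nbits) {
  return w < -(1 << (nbits - 1)) || w >= (1 << (nbits - 1));
}

// Pass 1: count only. W is untouched, so falling back to acc32 is safe.
static int CountOutliers(const std::vector<std::int8_t>& W, int nbits) {
  int cnt = 0;
  for (std::int8_t w : W) {
    if (IsOutlier(w, nbits)) {
      ++cnt;
    }
  }
  return cnt;
}

int main() {
  const int nbits = 7;                   // bits kept on the acc16 path
  const float density_threshold = 0.5f;  // stands in for the gflag above
  std::vector<std::int8_t> W = {3, -100, 7, 127, -2, 1, 0, -128};

  float density = static_cast<float>(CountOutliers(W, nbits)) / W.size();
  if (density > density_threshold) {
    // Fall back with W intact; the bug fixed by this patch was that W
    // had already been split by the time this branch was taken.
    return 0;
  }

  // Pass 2: runs only once we know we stay on acc16. Collect outliers
  // and zero them in the dense buffer, analogous to how
  // ExtractOutlierMatrix moves them into a sparse matrix.
  std::vector<std::int8_t> outliers;
  for (std::int8_t& w : W) {
    if (IsOutlier(w, nbits)) {
      outliers.push_back(w);
      w = 0;
    }
  }
  return 0;
}

The point of the ordering is visible in the early return: when the density check fails, W is still exactly the dense quantized weight, so the acc32 repacking path sees unmodified data.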