From 496b0b03d988ccdb242f8674f1c5e176f2bef221 Mon Sep 17 00:00:00 2001
From: Summer Deng
Date: Tue, 9 Apr 2019 21:59:33 -0700
Subject: [PATCH] amend D14778810 (#18902)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18902

The fix in D14778810 had an issue: by the time we fall back to acc32
because the outlier density is too high, W_quantized_ has already been
modified. In this diff we first just count the number of outliers
(without modifying W_quantized_), and only when the density is low
enough that no fallback is needed do we modify W_quantized_ and
construct an outlier matrix.

Reviewed By: jspark1105

Differential Revision: D14785256

fbshipit-source-id: 03933110a4ca7409686a06b18a9bb921f8657950
---
 .../quantization/server/conv_dnnlowp_acc16_op.cc   | 12 +++++++++---
 .../server/conv_dnnlowp_acc16_op_test.py           |  2 +-
 .../server/conv_groupwise_dnnlowp_acc16_op_test.py |  2 +-
 caffe2/quantization/server/fbgemm_pack_op.cc       | 22 ++++++++++++++++++----
 caffe2/quantization/server/fbgemm_pack_op.h        |  7 +++++++
 5 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
index 3bce760..2f2b51a 100644
--- a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
+++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc
@@ -174,13 +174,12 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
       nbits_in_non_outlier_ < 8) {
     CAFFE_ENFORCE(!W_quantized_.empty());
 
-    Wq_outlier_.reset(ExtractOutlierMatrix(
+    int outlier_cnt = CountOutliers(
         group_,
         kernel_dim,
         num_out_channels,
         nbits_in_non_outlier_,
-        W_quantized_));
-    int outlier_cnt = Wq_outlier_->ColPtr()[num_out_channels];
+        W_quantized_);
 
     C10_LOG_FIRST_N(INFO, 10)
         << "Proportion of outlier for Conv layer with weight blob "
@@ -202,6 +201,13 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
       // We need to call GetQuantizationParameters_ again to pack for acc32
       return BaseType::GetQuantizationParameters_();
     }
+
+    Wq_outlier_.reset(ExtractOutlierMatrix(
+        group_,
+        kernel_dim,
+        num_out_channels,
+        nbits_in_non_outlier_,
+        W_quantized_));
   }
 
   bool packW = this->order_ == StorageOrder::NHWC && GetCpuId().avx2();
diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py
index 1ddf2ce..d14b7dc 100644
--- a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py
+++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py
@@ -17,7 +17,7 @@ workspace.GlobalInit(
     [
         "caffe2",
         "--caffe2_omp_num_threads=11",
         # Increase this threshold to test acc16 with randomly generated data
-        "--caffe2_dnnlowp_acc16_density_threshold=0.9",
+        "--caffe2_dnnlowp_acc16_density_threshold=0.5",
     ]
 )
diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py
index d542126..1cd91dc 100644
--- a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py
+++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py
@@ -17,7 +17,7 @@ workspace.GlobalInit(
     [
         "caffe2",
         "--caffe2_omp_num_threads=11",
         # Increase this threshold to test acc16 with randomly generated data
-        "--caffe2_dnnlowp_acc16_density_threshold=0.9",
+        "--caffe2_dnnlowp_acc16_density_threshold=0.5",
     ]
 )
diff --git a/caffe2/quantization/server/fbgemm_pack_op.cc b/caffe2/quantization/server/fbgemm_pack_op.cc
index 9e98b01..d245011 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.cc
+++ b/caffe2/quantization/server/fbgemm_pack_op.cc
@@ -118,7 +118,7 @@ template void ComputeColumnOffsets<uint8_t>(
     const vector<TensorQuantizationParams>& qparams,
     vector<int32_t>& col_offsets);
 
-fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
+int CountOutliers(
     int groups,
     int kernel_dim,
     int M,
@@ -136,6 +136,17 @@ fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
       }
     }
   }
+  return outlier_cnt;
+}
+
+fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
+    int groups,
+    int kernel_dim,
+    int M,
+    int nbits_in_non_outlier,
+    vector<int8_t>& W_quantized) {
+  int outlier_cnt =
+      CountOutliers(groups, kernel_dim, M, nbits_in_non_outlier, W_quantized);
 
   fbgemm::CompressedSparseColumn* Wq_outlier =
       new fbgemm::CompressedSparseColumn(kernel_dim, M);
@@ -163,6 +174,7 @@ fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
       }
     }
   } // for each group
+  CAFFE_ENFORCE_EQ(outlier_cnt, Wq_outlier->RowIdx().size());
   Wq_outlier->ColPtr()[M] = outlier_cnt;
 
   return Wq_outlier;
@@ -464,9 +476,8 @@ bool ConvDNNLowPPackWeightOp::RunOnDevice() {
   if (this->debug_def().engine() == "DNNLOWP_ACC16" &&
       !fallback_to_32_bit_accumulation) {
     if (nbits_in_non_outlier_ < 8) {
-      Y->W_outlier.reset(ExtractOutlierMatrix(
-          group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized));
-      int outlier_cnt = Y->W_outlier->ColPtr()[M];
+      int outlier_cnt = CountOutliers(
+          group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized);
 
       LOG(INFO) << "Proportion of outlier for Conv layer with weight blob "
                 << this->debug_def().input(0) << " is "
@@ -479,6 +490,9 @@ bool ConvDNNLowPPackWeightOp::RunOnDevice() {
                      << FLAGS_caffe2_dnnlowp_acc16_density_threshold
                      << " . Falling back to acc32";
         fallback_to_32_bit_accumulation = true;
+      } else {
+        Y->W_outlier.reset(ExtractOutlierMatrix(
+            group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized));
       }
     }
 
diff --git a/caffe2/quantization/server/fbgemm_pack_op.h b/caffe2/quantization/server/fbgemm_pack_op.h
index a059484..a2a6c9d 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.h
+++ b/caffe2/quantization/server/fbgemm_pack_op.h
@@ -82,6 +82,13 @@ void ComputeColumnOffsets(
     const vector<TensorQuantizationParams>& qparams,
     vector<int32_t>& col_offsets);
 
+int CountOutliers(
+    int groups,
+    int kernel_dim,
+    int M,
+    int nbits_in_non_outlier,
+    vector<std::int8_t>& W_quantized);
+
 /**
  * @param W_quantized input quantized weight that is not packed yet
  */
-- 
2.7.4
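
For reference, the two-pass contract this patch establishes (count first, extract only if we stay on acc16) can be sketched in isolation. The snippet below is a simplified standalone C++ illustration, not the fbgemm code: IsOutlier, the sample weights, and the hard-coded 0.5 threshold (the value the tests above pass via --caffe2_dnnlowp_acc16_density_threshold) are invented for the example, and the real ExtractOutlierMatrix builds a fbgemm::CompressedSparseColumn rather than a plain vector.

#include <cstdint>
#include <vector>

// Outlier test: true when w does not fit in nbits signed bits.
// This mirrors the role of nbits_in_non_outlier_ in the patch.
static bool IsOutlier(std::int8_t w, int nbits) {
  return w < -(1 << (nbits - 1)) || w >= (1 << (nbits - 1));
}

// Pass 1: count only. W is untouched, so falling back to acc32 is safe.
static int CountOutliers(const std::vector<std::int8_t>& W, int nbits) {
  int cnt = 0;
  for (std::int8_t w : W) {
    if (IsOutlier(w, nbits)) {
      ++cnt;
    }
  }
  return cnt;
}

int main() {
  const int nbits = 7;                   // bits kept on the acc16 path
  const float density_threshold = 0.5f;  // stands in for the gflag above
  std::vector<std::int8_t> W = {3, -100, 7, 127, -2, 1, 0, -128};

  float density = static_cast<float>(CountOutliers(W, nbits)) / W.size();
  if (density > density_threshold) {
    // Fall back with W intact; the bug fixed by this patch was that W
    // had already been split by the time this branch was taken.
    return 0;
  }

  // Pass 2: runs only once we know we stay on acc16. Collect outliers
  // and zero them in the dense buffer, analogous to how
  // ExtractOutlierMatrix moves them into a sparse matrix.
  std::vector<std::int8_t> outliers;
  for (std::int8_t& w : W) {
    if (IsOutlier(w, nbits)) {
      outliers.push_back(w);
      w = 0;
    }
  }
  return 0;
}

The point of the ordering is visible in the early return: when the density check fails, W is still exactly the dense quantized weight, so the acc32 repacking path sees unmodified data.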