use acc16 only when n>=128 and k>=128 on Skylake (#18672)
authorJongsoo Park <jongsoo@fb.com>
Mon, 1 Apr 2019 15:49:37 +0000 (08:49 -0700)
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>
Mon, 1 Apr 2019 15:52:28 +0000 (08:52 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18672

On Skylake, when n < 128 or k < 128, acc16 is slower, so fall back to acc32 below those sizes.

Reviewed By: jianyuh

Differential Revision: D14700576

fbshipit-source-id: 80ca9f1af4626637eed9c5ca49f95ae744811189

caffe2/quantization/server/conv_dnnlowp_acc16_op.cc

index b339e52..f356b42 100644 (file)
@@ -106,6 +106,20 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
     Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>());
     const int output_image_size = this->GetDimsSize(*Y);
 
+    // In Skylake, acc16 is not faster when N or K is smaller than 128
+    constexpr int SKYLAKE_ACC16_N_THRESHOLD_MIN = 128,
+                  SKYLAKE_ACC16_K_THRESHOLD_MIN = 128;
+    int acc16_n_threshold = FLAGS_caffe2_dnnlowp_acc16_n_threshold;
+    if (caffe2::GetCpuId().avx512f() &&
+        acc16_n_threshold < SKYLAKE_ACC16_N_THRESHOLD_MIN) {
+      acc16_n_threshold = SKYLAKE_ACC16_N_THRESHOLD_MIN;
+    }
+    int acc16_k_threshold = FLAGS_caffe2_dnnlowp_acc16_k_threshold;
+    if (caffe2::GetCpuId().avx512f() &&
+        acc16_k_threshold < SKYLAKE_ACC16_K_THRESHOLD_MIN) {
+      acc16_k_threshold = SKYLAKE_ACC16_K_THRESHOLD_MIN;
+    }
+
     if (N * output_image_size < FLAGS_caffe2_dnnlowp_acc16_m_threshold) {
       LOG(INFO) << "M " << N * output_image_size
                 << " of Conv layer with weight blob "
@@ -115,20 +129,18 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
       fallback_to_32_bit_accumulation_ = true;
       return true;
     }
-    if (num_out_channels / group_ < FLAGS_caffe2_dnnlowp_acc16_n_threshold) {
+    if (num_out_channels / group_ < acc16_n_threshold) {
       LOG(INFO) << "N " << num_out_channels / group_
                 << " of Conv layer with weight blob "
                 << this->debug_def().input(1) << " is smaller than threshold "
-                << FLAGS_caffe2_dnnlowp_acc16_n_threshold
-                << " . Falling back to acc32";
+                << acc16_n_threshold << " . Falling back to acc32";
       fallback_to_32_bit_accumulation_ = true;
       return true;
     }
-    if (kernel_dim < FLAGS_caffe2_dnnlowp_acc16_k_threshold) {
+    if (kernel_dim < acc16_k_threshold) {
       LOG(INFO) << "K " << kernel_dim << " of Conv layer with weight blob "
                 << this->debug_def().input(1) << " is smaller than threshold "
-                << FLAGS_caffe2_dnnlowp_acc16_k_threshold
-                << " . Falling back to acc32";
+                << acc16_k_threshold << " . Falling back to acc32";
       fallback_to_32_bit_accumulation_ = true;
       return true;
     }