Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>());
const int output_image_size = this->GetDimsSize(*Y);
+ // On Skylake (detected below via AVX-512F support), acc16 is not faster than
+ // acc32 when N or K is smaller than 128, so clamp both thresholds to >= 128.
+ constexpr int SKYLAKE_ACC16_N_THRESHOLD_MIN = 128,
+ SKYLAKE_ACC16_K_THRESHOLD_MIN = 128;
+ int acc16_n_threshold = FLAGS_caffe2_dnnlowp_acc16_n_threshold;
+ if (caffe2::GetCpuId().avx512f() &&
+ acc16_n_threshold < SKYLAKE_ACC16_N_THRESHOLD_MIN) {
+ acc16_n_threshold = SKYLAKE_ACC16_N_THRESHOLD_MIN;
+ }
+ int acc16_k_threshold = FLAGS_caffe2_dnnlowp_acc16_k_threshold;
+ if (caffe2::GetCpuId().avx512f() &&
+ acc16_k_threshold < SKYLAKE_ACC16_K_THRESHOLD_MIN) {
+ acc16_k_threshold = SKYLAKE_ACC16_K_THRESHOLD_MIN;
+ }
+
if (N * output_image_size < FLAGS_caffe2_dnnlowp_acc16_m_threshold) {
LOG(INFO) << "M " << N * output_image_size
<< " of Conv layer with weight blob "
fallback_to_32_bit_accumulation_ = true;
return true;
}
- if (num_out_channels / group_ < FLAGS_caffe2_dnnlowp_acc16_n_threshold) {
+ if (num_out_channels / group_ < acc16_n_threshold) {
LOG(INFO) << "N " << num_out_channels / group_
<< " of Conv layer with weight blob "
<< this->debug_def().input(1) << " is smaller than threshold "
- << FLAGS_caffe2_dnnlowp_acc16_n_threshold
- << " . Falling back to acc32";
+ << acc16_n_threshold << " . Falling back to acc32";
fallback_to_32_bit_accumulation_ = true;
return true;
}
- if (kernel_dim < FLAGS_caffe2_dnnlowp_acc16_k_threshold) {
+ if (kernel_dim < acc16_k_threshold) {
LOG(INFO) << "K " << kernel_dim << " of Conv layer with weight blob "
<< this->debug_def().input(1) << " is smaller than threshold "
- << FLAGS_caffe2_dnnlowp_acc16_k_threshold
- << " . Falling back to acc32";
+ << acc16_k_threshold << " . Falling back to acc32";
fallback_to_32_bit_accumulation_ = true;
return true;
}