From 4778a4089ece23e1ead0e414d1b4d206ce60a192 Mon Sep 17 00:00:00 2001 From: Jongsoo Park Date: Fri, 22 Feb 2019 10:20:24 -0800 Subject: [PATCH] optimize max pool 2d (#17391) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/17391 Optimize 2D max pool using AVX2 intrinsics. Reviewed By: jianyuh Differential Revision: D14181620 fbshipit-source-id: ffc6c4412bd1c1d7839fe06226921df40d9cab83 --- caffe2/quantization/server/pool_dnnlowp_op.cc | 66 +++++++++++++------- caffe2/quantization/server/pool_dnnlowp_op_avx2.cc | 70 ++++++++++++++++++++++ caffe2/quantization/server/pool_dnnlowp_op_avx2.h | 26 ++++++++ 3 files changed, 141 insertions(+), 21 deletions(-) create mode 100644 caffe2/quantization/server/pool_dnnlowp_op_avx2.cc create mode 100644 caffe2/quantization/server/pool_dnnlowp_op_avx2.h diff --git a/caffe2/quantization/server/pool_dnnlowp_op.cc b/caffe2/quantization/server/pool_dnnlowp_op.cc index bbf6026..0dda848 100644 --- a/caffe2/quantization/server/pool_dnnlowp_op.cc +++ b/caffe2/quantization/server/pool_dnnlowp_op.cc @@ -3,6 +3,7 @@ #include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" #include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h" #include "caffe2/quantization/server/op_wrapper.h" +#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h" #include "caffe2/utils/eigen_utils.h" namespace caffe2 { @@ -582,32 +583,55 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase { } break; case 2: + if (is_same::value) { #ifdef _OPENMP #pragma omp parallel for #endif - for (int n = 0; n < X.dim32(0); ++n) { - const T* Xdata_temp = Xdata + n * height * width * channels; - T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels; - for (int ph = 0; ph < pooled_height; ++ph) { - int hstart = ph * stride_h() - pad_t(); - int hend = min(hstart + kernel_h(), height); - hstart = max(hstart, 0); - for (int pw = 0; pw < pooled_width; ++pw) { - int wstart = pw * stride_w() - pad_l(); - int wend = min(wstart + kernel_w(), width); - wstart = max(wstart, 0); - int size = (hend - hstart) * (wend - wstart); - for (int c = 0; c < channels; ++c) { - T Yh = MaxPool::initialize(); - const int pool_idx = (ph * pooled_width + pw) * channels + c; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int input_idx = (h * width + w) * channels + c; - MaxPool::process(Xdata_temp[input_idx], Yh); + for (int n = 0; n < X.dim32(0); ++n) { + max_pool_avx2( + reinterpret_cast(Xdata), + n, + height, + width, + channels, + pooled_height, + pooled_width, + kernel_h(), + kernel_w(), + stride_h(), + stride_w(), + pad_t(), + pad_l(), + reinterpret_cast(Ydata)); + } + } else { +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int n = 0; n < X.dim32(0); ++n) { + const T* Xdata_temp = Xdata + n * height * width * channels; + T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels; + for (int ph = 0; ph < pooled_height; ++ph) { + int hstart = ph * stride_h() - pad_t(); + int hend = min(hstart + kernel_h(), height); + hstart = max(hstart, 0); + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w() - pad_l(); + int wend = min(wstart + kernel_w(), width); + wstart = max(wstart, 0); + int size = (hend - hstart) * (wend - wstart); + for (int c = 0; c < channels; ++c) { + T Yh = MaxPool::initialize(); + const int pool_idx = (ph * pooled_width + pw) * channels + c; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int input_idx = (h * width + w) * channels + c; + MaxPool::process(Xdata_temp[input_idx], Yh); + } } + MaxPool::finalize(size, Yh); + Ydata_temp[pool_idx] = Yh; } - MaxPool::finalize(size, Yh); - Ydata_temp[pool_idx] = Yh; } } } diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc new file mode 100644 index 0000000..92d0816 --- /dev/null +++ b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc @@ -0,0 +1,70 @@ +#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h" + +#include +#include + +namespace caffe2 { + +using namespace std; + +void max_pool_avx2( + const uint8_t* Xdata, + int n, + int height, + int width, + int channels, + int pooled_height, + int pooled_width, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_t, + int pad_l, + uint8_t* Ydata) { + const uint8_t* Xdata_temp = Xdata + n * height * width * channels; + uint8_t* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels; + for (int ph = 0; ph < pooled_height; ++ph) { + int hstart = ph * stride_h - pad_t; + int hend = hstart + kernel_h < height ? hstart + kernel_h : height; + hstart = hstart > 0 ? hstart : 0; + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_l; + int wend = wstart + kernel_w < width ? wstart + kernel_w : width; + wstart = wstart > 0 ? wstart : 0; + + uint8_t* Yh = Ydata_temp + (ph * pooled_width + pw) * channels; + constexpr int VLEN = 8; + // vectorized loop + for (int c = 0; c < channels / VLEN * VLEN; c += VLEN) { + __m256i Y_v = _mm256_setzero_si256(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int input_idx = (h * width + w) * channels + c; + Y_v = _mm256_max_epu8( + _mm256_loadu_si256( + reinterpret_cast(Xdata_temp + input_idx)), + Y_v); + } + } + _mm256_storeu_si256(reinterpret_cast<__m256i*>(Yh + c), Y_v); + } + + // remainder + for (int c = channels / VLEN * VLEN; c < channels; ++c) { + Yh[c] = 0; + } + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + for (int c = channels / VLEN * VLEN; c < channels; ++c) { + const int input_idx = (h * width + w) * channels + c; + Yh[c] = + Xdata_temp[input_idx] > Yh[c] ? Xdata_temp[input_idx] : Yh[c]; + } + } + } + } // pw loop + } // ph loop +} + +} // namespace caffe2 diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h b/caffe2/quantization/server/pool_dnnlowp_op_avx2.h new file mode 100644 index 0000000..abb0573 --- /dev/null +++ b/caffe2/quantization/server/pool_dnnlowp_op_avx2.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +namespace caffe2 { + +/** + * Optimized using AVX2 intrinsics for max pool 2D in NHWC layout + */ +void max_pool_avx2( + const std::uint8_t* Xdata, + int n, + int height, + int width, + int channels, + int pooled_height, + int pooled_width, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_t, + int pad_l, + std::uint8_t* Ydata); + +} // namespace caffe2 -- 2.7.4