From 4778a4089ece23e1ead0e414d1b4d206ce60a192 Mon Sep 17 00:00:00 2001
From: Jongsoo Park <jongsoo@fb.com>
Date: Fri, 22 Feb 2019 10:20:24 -0800
Subject: [PATCH] optimize max pool 2d (#17391)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17391

Optimize 2D max pool using AVX2 intrinsics.

Reviewed By: jianyuh

Differential Revision: D14181620

fbshipit-source-id: ffc6c4412bd1c1d7839fe06226921df40d9cab83
---
 caffe2/quantization/server/pool_dnnlowp_op.cc      | 66 +++++++++++++-------
 caffe2/quantization/server/pool_dnnlowp_op_avx2.cc | 70 ++++++++++++++++++++++
 caffe2/quantization/server/pool_dnnlowp_op_avx2.h  | 26 ++++++++
 3 files changed, 141 insertions(+), 21 deletions(-)
 create mode 100644 caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
 create mode 100644 caffe2/quantization/server/pool_dnnlowp_op_avx2.h
diff --git a/caffe2/quantization/server/pool_dnnlowp_op.cc b/caffe2/quantization/server/pool_dnnlowp_op.cc
index bbf6026..0dda848 100644
--- a/caffe2/quantization/server/pool_dnnlowp_op.cc
+++ b/caffe2/quantization/server/pool_dnnlowp_op.cc
@@ -3,6 +3,7 @@
 #include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
 #include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"
 #include "caffe2/quantization/server/op_wrapper.h"
+#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h"
 #include "caffe2/utils/eigen_utils.h"
 
 namespace caffe2 {
@@ -582,32 +583,55 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> {
         }
         break;
       case 2:
+        if (is_same<T, uint8_t>::value) {
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-        for (int n = 0; n < X.dim32(0); ++n) {
-          const T* Xdata_temp = Xdata + n * height * width * channels;
-          T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
-          for (int ph = 0; ph < pooled_height; ++ph) {
-            int hstart = ph * stride_h() - pad_t();
-            int hend = min(hstart + kernel_h(), height);
-            hstart = max(hstart, 0);
-            for (int pw = 0; pw < pooled_width; ++pw) {
-              int wstart = pw * stride_w() - pad_l();
-              int wend = min(wstart + kernel_w(), width);
-              wstart = max(wstart, 0);
-              int size = (hend - hstart) * (wend - wstart);
-              for (int c = 0; c < channels; ++c) {
-                T Yh = MaxPool<T>::initialize();
-                const int pool_idx = (ph * pooled_width + pw) * channels + c;
-                for (int h = hstart; h < hend; ++h) {
-                  for (int w = wstart; w < wend; ++w) {
-                    const int input_idx = (h * width + w) * channels + c;
-                    MaxPool<T>::process(Xdata_temp[input_idx], Yh);
+          for (int n = 0; n < X.dim32(0); ++n) {
+            max_pool_avx2(
+                reinterpret_cast<const uint8_t*>(Xdata),
+                n,
+                height,
+                width,
+                channels,
+                pooled_height,
+                pooled_width,
+                kernel_h(),
+                kernel_w(),
+                stride_h(),
+                stride_w(),
+                pad_t(),
+                pad_l(),
+                reinterpret_cast<uint8_t*>(Ydata));
+          }
+        } else {
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+          for (int n = 0; n < X.dim32(0); ++n) {
+            const T* Xdata_temp = Xdata + n * height * width * channels;
+            T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
+            for (int ph = 0; ph < pooled_height; ++ph) {
+              int hstart = ph * stride_h() - pad_t();
+              int hend = min(hstart + kernel_h(), height);
+              hstart = max(hstart, 0);
+              for (int pw = 0; pw < pooled_width; ++pw) {
+                int wstart = pw * stride_w() - pad_l();
+                int wend = min(wstart + kernel_w(), width);
+                wstart = max(wstart, 0);
+                int size = (hend - hstart) * (wend - wstart);
+                for (int c = 0; c < channels; ++c) {
+                  T Yh = MaxPool<T>::initialize();
+                  const int pool_idx = (ph * pooled_width + pw) * channels + c;
+                  for (int h = hstart; h < hend; ++h) {
+                    for (int w = wstart; w < wend; ++w) {
+                      const int input_idx = (h * width + w) * channels + c;
+                      MaxPool<T>::process(Xdata_temp[input_idx], Yh);
+                    }
                   }
+                  MaxPool<T>::finalize(size, Yh);
+                  Ydata_temp[pool_idx] = Yh;
                 }
-                MaxPool<T>::finalize(size, Yh);
-                Ydata_temp[pool_idx] = Yh;
               }
             }
           }
diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
new file mode 100644
index 0000000..92d0816
--- /dev/null
+++ b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
@@ -0,0 +1,70 @@
+#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h"
+
+#include <immintrin.h>
+#include <cmath>
+
+namespace caffe2 {
+
+using namespace std;
+
+void max_pool_avx2(
+    const uint8_t* Xdata,
+    int n,
+    int height,
+    int width,
+    int channels,
+    int pooled_height,
+    int pooled_width,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_t,
+    int pad_l,
+    uint8_t* Ydata) {
+  const uint8_t* Xdata_temp = Xdata + n * height * width * channels;
+  uint8_t* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
+  for (int ph = 0; ph < pooled_height; ++ph) {
+    int hstart = ph * stride_h - pad_t;
+    int hend = hstart + kernel_h < height ? hstart + kernel_h : height;
+    hstart = hstart > 0 ? hstart : 0;
+    for (int pw = 0; pw < pooled_width; ++pw) {
+      int wstart = pw * stride_w - pad_l;
+      int wend = wstart + kernel_w < width ? wstart + kernel_w : width;
+      wstart = wstart > 0 ? wstart : 0;
+
+      uint8_t* Yh = Ydata_temp + (ph * pooled_width + pw) * channels;
+      constexpr int VLEN = 8;
+      // vectorized loop
+      for (int c = 0; c < channels / VLEN * VLEN; c += VLEN) {
+        __m256i Y_v = _mm256_setzero_si256();
+        for (int h = hstart; h < hend; ++h) {
+          for (int w = wstart; w < wend; ++w) {
+            const int input_idx = (h * width + w) * channels + c;
+            Y_v = _mm256_max_epu8(
+                _mm256_loadu_si256(
+                    reinterpret_cast<const __m256i*>(Xdata_temp + input_idx)),
+                Y_v);
+          }
+        }
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(Yh + c), Y_v);
+      }
+
+      // remainder
+      for (int c = channels / VLEN * VLEN; c < channels; ++c) {
+        Yh[c] = 0;
+      }
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          for (int c = channels / VLEN * VLEN; c < channels; ++c) {
+            const int input_idx = (h * width + w) * channels + c;
+            Yh[c] =
+                Xdata_temp[input_idx] > Yh[c] ? Xdata_temp[input_idx] : Yh[c];
+          }
+        }
+      }
+    } // pw loop
+  } // ph loop
+}
+
+} // namespace caffe2
diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h b/caffe2/quantization/server/pool_dnnlowp_op_avx2.h
new file mode 100644
index 0000000..abb0573
--- /dev/null
+++ b/caffe2/quantization/server/pool_dnnlowp_op_avx2.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cstdint>
+
+namespace caffe2 {
+
+/**
+ * Optimized using AVX2 intrinsics for max pool 2D in NHWC layout
+ */
+void max_pool_avx2(
+    const std::uint8_t* Xdata,
+    int n,
+    int height,
+    int width,
+    int channels,
+    int pooled_height,
+    int pooled_width,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_t,
+    int pad_l,
+    std::uint8_t* Ydata);
+
+} // namespace caffe2
-- 
2.7.4