Pre-processing(GAPI): ARM(NEON) integration + Split, Merge, Color conversion kernels...
author Anna Khakimova <anna.khakimova@intel.com>
Tue, 21 Jul 2020 11:19:15 +0000 (14:19 +0300)
committer GitHub <noreply@github.com>
Tue, 21 Jul 2020 11:19:15 +0000 (14:19 +0300)
inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp [new file with mode: 0644]
inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp
inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp
inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp
inference-engine/thirdparty/ocv/opencv_hal_neon.hpp

diff --git a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp
new file mode 100644 (file)
index 0000000..92b9da7
--- /dev/null
@@ -0,0 +1,220 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_preprocess_gapi_kernels.hpp"
+#include "ie_preprocess_gapi_kernels_impl.hpp"
+
+#include <array>
+#include <cstdint>
+#include <type_traits>
+
+// Declarations of the ARM NEON variants of the pre-processing kernels.
+// ie_preprocess_gapi_kernels.cpp includes this header only when HAVE_NEON is
+// defined. The functions are only declared here; their definitions are
+// provided elsewhere.
+
+namespace InferenceEngine {
+namespace gapi {
+namespace kernels {
+namespace neon {
+
+// Tag types that select the 3- and 4-channel calcRowLinear_8U overloads.
+using C3 = std::integral_constant<int, 3>;
+using C4 = std::integral_constant<int, 4>;
+//-----------------------------------------------------------------------------
+
+using MapperUnit32F = MapperUnit<float,   int>;
+using MapperUnit8U  = MapperUnit<Q0_16, short>;
+
+// Resize (area, 8U)
+// NOTE(review): "area" is taken from the function name -- confirm against the
+// implementation.
+void calcRowArea_8U(uchar dst[], const uchar *src[], const Size &inSz, const Size &outSz,
+                    Q0_16 yalpha, const MapperUnit8U& ymap, int xmaxdf, const short xindex[],
+                    const Q0_16 xalpha[], Q8_8 vbuf[]);
+
+// Resize (area, 32F): float counterpart of calcRowArea_8U (int indices, float
+// coefficients and buffer).
+void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Size &outSz,
+                     float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[],
+                     const float xalpha[], float vbuf[]);
+
+// Resize (bi-linear, 8U)
+void calcRowLinear_8U(uint8_t *dst[],
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                        const short  alpha[],
+                        const short  clone[],
+                        const short  mapsx[],
+                        const short  beta[],
+                            uint8_t  tmp[],
+                        const Size&  inSz,
+                        const Size&  outSz,
+                                int  lpi);
+
+// Resize (bi-linear, 8UC3)
+void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                        const short  alpha[],
+                        const short  clone[],
+                        const short  mapsx[],
+                        const short  beta[],
+                            uint8_t  tmp[],
+                        const Size&  inSz,
+                        const Size&  outSz,
+                                int  lpi);
+
+// Resize (bi-linear, 8UC4)
+void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                        const short  alpha[],
+                        const short  clone[],
+                        const short  mapsx[],
+                        const short  beta[],
+                            uint8_t  tmp[],
+                        const Size&  inSz,
+                        const Size&  outSz,
+                                int  lpi);
+
+// Forwards to the calcRowLinear_8U overload tagged with
+// std::integral_constant<int, numChan>; only the C3 and C4 overloads are
+// declared above.
+template<int numChan>
+void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
+                       const uint8_t *src0[],
+                       const uint8_t *src1[],
+                         const short  alpha[],
+                         const short  clone[],
+                         const short  mapsx[],
+                         const short  beta[],
+                             uint8_t  tmp[],
+                         const Size&  inSz,
+                         const Size&  outSz,
+                                 int  lpi) {
+    calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
+// Resize (bi-linear, 32F)
+void calcRowLinear_32F(      float *dst[],
+                       const float *src0[],
+                       const float *src1[],
+                       const float  alpha[],
+                       const int    mapsx[],
+                       const float  beta[],
+                       const Size& inSz,
+                       const Size& outSz,
+                               int lpi);
+
+//----------------------------------------------------------------------
+
+// Merge kernels: one input row per channel (in0, in1, ...) and one output row.
+// mergeRow() in ie_preprocess_gapi_kernels.cpp dispatches to them by element
+// type and channel count.
+// NOTE(review): `length` is presumably the row length in pixels -- confirm.
+void mergeRow_8UC2(const uint8_t in0[],
+                   const uint8_t in1[],
+                         uint8_t out[],
+                             int length);
+
+void mergeRow_8UC3(const uint8_t in0[],
+                   const uint8_t in1[],
+                   const uint8_t in2[],
+                         uint8_t out[],
+                             int length);
+
+void mergeRow_8UC4(const uint8_t in0[],
+                   const uint8_t in1[],
+                   const uint8_t in2[],
+                   const uint8_t in3[],
+                         uint8_t out[],
+                             int length);
+
+void mergeRow_32FC2(const float in0[],
+                    const float in1[],
+                          float out[],
+                            int length);
+
+void mergeRow_32FC3(const float in0[],
+                    const float in1[],
+                    const float in2[],
+                          float out[],
+                            int length);
+
+void mergeRow_32FC4(const float in0[],
+                    const float in1[],
+                    const float in2[],
+                    const float in3[],
+                          float out[],
+                            int length);
+
+// Split kernels: a single input row and one output row per channel (out0,
+// out1, ...). splitRow() in ie_preprocess_gapi_kernels.cpp dispatches to them
+// by element type and channel count.
+void splitRow_8UC2(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                             int length);
+
+void splitRow_8UC3(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                         uint8_t out2[],
+                             int length);
+
+void splitRow_8UC4(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                         uint8_t out2[],
+                         uint8_t out3[],
+                             int length);
+
+void splitRow_32FC2(const float in[],
+                          float out0[],
+                          float out1[],
+                            int length);
+
+void splitRow_32FC3(const float in[],
+                          float out0[],
+                          float out1[],
+                          float out2[],
+                            int length);
+
+void splitRow_32FC4(const float in[],
+                          float out0[],
+                          float out1[],
+                          float out2[],
+                          float out3[],
+                            int length);
+
+// Color conversion (NV12 / I420 -> RGB), called from the FNV12toRGB and
+// FI420toRGB fluid kernels in ie_preprocess_gapi_kernels.cpp.
+void calculate_nv12_to_rgb(const  uchar **srcY,
+                           const  uchar *srcUV,
+                                  uchar **dstRGBx,
+                                    int width);
+
+void calculate_i420_to_rgb(const  uchar **srcY,
+                           const  uchar *srcU,
+                           const  uchar *srcV,
+                                  uchar **dstRGBx,
+                                    int width);
+
+// Row copies, called from chanToPlaneRow() in ie_preprocess_gapi_kernels.cpp
+// when the input has a single channel (chs == 1).
+void copyRow_8U(const uint8_t in[],
+                uint8_t out[],
+                int length);
+
+void copyRow_32F(const float in[],
+                 float out[],
+                 int length);
+
+}  // namespace neon
+}  // namespace kernels
+}  // namespace gapi
+}  // namespace InferenceEngine
index ab9db8a..e6a3dbf 100644 (file)
 
 #endif
 
+#ifdef HAVE_NEON
+  #include "arm_neon/ie_preprocess_gapi_kernels_neon.hpp"
+#endif
+
 #include <opencv2/gapi/opencv_includes.hpp>
 #include <opencv2/gapi/fluid/gfluidkernel.hpp>
 #include <opencv2/gapi/gcompoundkernel.hpp>
@@ -174,6 +178,47 @@ void mergeRow(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int leng
     }
 #endif  // HAVE_SSE
 
+#ifdef HAVE_NEON
+    if (std::is_same<T, uint8_t>::value && chs == 2) {
+        neon::mergeRow_8UC2(ins[0], ins[1], out, length);
+        return;
+    }
+
+    if (std::is_same<T, uint8_t>::value && chs == 3) {
+        neon::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
+        return;
+    }
+
+    if (std::is_same<T, uint8_t>::value && chs == 4) {
+        neon::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 2) {
+        neon::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
+                             reinterpret_cast<const float*>(ins[1]),
+                             reinterpret_cast<float*>(out), length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 3) {
+        neon::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
+                             reinterpret_cast<const float*>(ins[1]),
+                             reinterpret_cast<const float*>(ins[2]),
+                             reinterpret_cast<float*>(out), length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 4) {
+        neon::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
+                             reinterpret_cast<const float*>(ins[1]),
+                             reinterpret_cast<const float*>(ins[2]),
+                             reinterpret_cast<const float*>(ins[3]),
+                             reinterpret_cast<float*>(out), length);
+        return;
+    }
+#endif  // HAVE_NEON
+
     const T* insT[chs];
     for (int c = 0; c < chs; c++) {
         insT[c] = reinterpret_cast<const T*>(ins[c]);
@@ -328,6 +373,50 @@ void splitRow(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length) {
     }
 #endif  // HAVE_SSE
 
+#ifdef HAVE_NEON
+    if (std::is_same<T, uint8_t>::value && chs == 2) {
+        neon::splitRow_8UC2(in, outs[0], outs[1], length);
+        return;
+    }
+
+    if (std::is_same<T, uint8_t>::value && chs == 3) {
+        neon::splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
+        return;
+    }
+
+    if (std::is_same<T, uint8_t>::value && chs == 4) {
+        neon::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 2) {
+        neon::splitRow_32FC2(reinterpret_cast<const float*>(in),
+                             reinterpret_cast<float*>(outs[0]),
+                             reinterpret_cast<float*>(outs[1]),
+                             length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 3) {
+        neon::splitRow_32FC3(reinterpret_cast<const float*>(in),
+                             reinterpret_cast<float*>(outs[0]),
+                             reinterpret_cast<float*>(outs[1]),
+                             reinterpret_cast<float*>(outs[2]),
+                             length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 4) {
+        neon::splitRow_32FC4(reinterpret_cast<const float*>(in),
+                             reinterpret_cast<float*>(outs[0]),
+                             reinterpret_cast<float*>(outs[1]),
+                             reinterpret_cast<float*>(outs[2]),
+                             reinterpret_cast<float*>(outs[3]),
+                             length);
+        return;
+    }
+#endif  // HAVE_NEON
+
     auto inT = reinterpret_cast<const T*>(in);
 
     T* outsT[chs];
@@ -484,6 +573,7 @@ static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, i
     }
     #endif  // HAVE_AVX512
 #endif
+
     #ifdef HAVE_AVX2
     if (with_cpu_x86_avx2()) {
         if (std::is_same<T, uint8_t>::value && chs == 1) {
@@ -515,6 +605,20 @@ static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, i
     }
     #endif  // HAVE_SSE
 
+    #ifdef HAVE_NEON
+    if (std::is_same<T, uint8_t>::value && chs == 1) {
+        neon::copyRow_8U(in, out, length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 1) {
+        neon::copyRow_32F(reinterpret_cast<const float*>(in),
+                          reinterpret_cast<float*>(out),
+                          length);
+        return;
+    }
+    #endif  // HAVE_NEON
+
     const auto inT  = reinterpret_cast<const T*>(in);
           auto outT = reinterpret_cast<      T*>(out);
 
@@ -831,14 +935,14 @@ static void calcRowLinear(const cv::gapi::fluid::View  & in,
         if (std::is_same<T, uint8_t>::value) {
             if (inSz.width >= 16 && outSz.width >= 8) {
                 calcRowLinear_8UC1(reinterpret_cast<uint8_t**>(dst),
-                                 reinterpret_cast<const uint8_t**>(src0),
-                                 reinterpret_cast<const uint8_t**>(src1),
-                                 reinterpret_cast<const short*>(alpha),
-                                 reinterpret_cast<const short*>(clone),
-                                 reinterpret_cast<const short*>(mapsx),
-                                 reinterpret_cast<const short*>(beta),
-                                 reinterpret_cast<uint8_t*>(tmp),
-                                 inSz, outSz, lpi);
+                                   reinterpret_cast<const uint8_t**>(src0),
+                                   reinterpret_cast<const uint8_t**>(src1),
+                                   reinterpret_cast<const short*>(alpha),
+                                   reinterpret_cast<const short*>(clone),
+                                   reinterpret_cast<const short*>(mapsx),
+                                   reinterpret_cast<const short*>(beta),
+                                   reinterpret_cast<uint8_t*>(tmp),
+                                   inSz, outSz, lpi);
                 return;
             }
         }
@@ -2011,6 +2115,7 @@ GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) {
         }
     #endif  // HAVE_AVX512
     #endif
+
     #ifdef HAVE_AVX2
         if (with_cpu_x86_avx2()) {
             avx::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
@@ -2024,6 +2129,11 @@ GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) {
         }
     #endif  // HAVE_SSE
 
+    #ifdef HAVE_NEON
+        neon::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
+        return;
+    #endif  // HAVE_NEON
+
         calculate_nv12_to_rgb_fallback(y_rows, uv_row, out_rows, buf_width);
     }
 };
@@ -2045,29 +2155,35 @@ GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) {
         int buf_width = out.length();
         GAPI_DbgAssert(in_u.length() ==  in_v.length());
 
-// AVX512 implementation of wide universal intrinsics is slower than AVX2.
-// It is turned off until the cause isn't found out.
-    #if 0
-    #ifdef HAVE_AVX512
-        if (with_cpu_x86_avx512_core()) {
-           #define CV_AVX_512DQ 1
-           avx512::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
-           return;
-        }
-    #endif  // HAVE_AVX512
-    #endif
-    #ifdef HAVE_AVX2
-        if (with_cpu_x86_avx2()) {
-           avx::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
-           return;
-        }
-    #endif  // HAVE_AVX2
-    #ifdef HAVE_SSE
-        if (with_cpu_x86_sse42()) {
-           calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
-           return;
-        }
-    #endif  // HAVE_SSE
+        // AVX512 implementation of wide universal intrinsics is slower than AVX2.
+        // It is turned off until the cause is found.
+        #if 0
+        #ifdef HAVE_AVX512
+            if (with_cpu_x86_avx512_core()) {
+               #define CV_AVX_512DQ 1
+               avx512::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
+               return;
+            }
+        #endif  // HAVE_AVX512
+        #endif
+
+        #ifdef HAVE_AVX2
+            if (with_cpu_x86_avx2()) {
+               avx::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
+               return;
+            }
+        #endif  // HAVE_AVX2
+        #ifdef HAVE_SSE
+            if (with_cpu_x86_sse42()) {
+               calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
+               return;
+            }
+        #endif  // HAVE_SSE
+
+        #ifdef HAVE_NEON
+            neon::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
+            return;
+        #endif  // HAVE_NEON
 
         calculate_i420_to_rgb_fallback(y_rows, u_row, v_row, out_rows, buf_width);
     }
index b38bb5b..071faa6 100644 (file)
@@ -1058,7 +1058,12 @@ TEST_P(PreprocTest, Performance)
     std::tie(in_size, out_size) = sizes;
     int in_ocv_chan = -1, out_ocv_chan = -1;
     std::tie(in_ocv_chan, out_ocv_chan) = ocv_channels;
+    // Tolerance depends on the tested precision `prec` (a bare enumerator is a constant).
+#if defined(__arm__) || defined(__aarch64__)
+    double tolerance = prec == Precision::U8 ? 4 : 0.015;
+#else
-    double tolerance = Precision::U8 ? 1 : 0.015;
+    double tolerance = prec == Precision::U8 ? 1 : 0.015;
+#endif
 
     const int ocv_depth = prec == Precision::U8 ? CV_8U :
         prec == Precision::FP32 ? CV_32F : -1;
index e28c612..6684cd3 100644 (file)
     std::make_pair(cv::Size(  96,  256), cv::Size( 128,  384))
 
 using namespace testing;
-
+#if defined(__arm__) || defined(__aarch64__)
 INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI,
                         Combine(Values(CV_8UC1, CV_8UC3),
                                 Values(cv::INTER_LINEAR, cv::INTER_AREA),
                                 Values(TEST_RESIZE_PAIRS),
-                                Values(1))); // error not more than 1 unit
+                                Values(4))); // error not more than 4 units
 
-INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestGAPI,
-                        Combine(Values(CV_32FC1, CV_32FC3),
+INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI,
+                        Combine(Values(CV_8UC3, CV_8UC4),
+                                Values(cv::INTER_LINEAR),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(4))); // error not more than 4 units
+#else
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI,
+                        Combine(Values(CV_8UC1, CV_8UC3),
                                 Values(cv::INTER_LINEAR, cv::INTER_AREA),
                                 Values(TEST_RESIZE_PAIRS),
-                                Values(0.015))); // accuracy like ~1.5%
+                                Values(1))); // error not more than 1 unit
 
 INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI,
                         Combine(Values(CV_8UC3, CV_8UC4),
                                 Values(cv::INTER_LINEAR),
                                 Values(TEST_RESIZE_PAIRS),
                                 Values(1))); // error not more than 1 unit
+#endif
+
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestGAPI,
+                        Combine(Values(CV_32FC1, CV_32FC3),
+                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(0.015))); // accuracy like ~1.5%
+
 
 INSTANTIATE_TEST_CASE_P(SplitTestFluid, SplitTestGAPI,
                         Combine(Values(2, 3, 4),
@@ -179,11 +193,19 @@ INSTANTIATE_TEST_CASE_P(ResizeRGB8URoiTestFluid, ResizeRGB8URoiTestGAPI,
 
 //----------------------------------------------------------------------
 
+#if defined(__arm__) || defined(__aarch64__)
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestIE,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(4))); // error not more than 4 units
+#else
 INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestIE,
                         Combine(Values(CV_8UC1, CV_8UC3),
                                 Values(cv::INTER_LINEAR, cv::INTER_AREA),
                                 Values(TEST_RESIZE_PAIRS),
                                 Values(1))); // error not more than 1 unit
+#endif
 
 INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestIE,
                         Combine(Values(CV_32FC1, CV_32FC3),
index 13bd390..c5b843f 100644 (file)
@@ -268,7 +268,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #   undef CV_MSA
 #endif*/
 
-#if CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD
+#if CV_SSE2 || CV_NEON
 #define CV__SIMD_FORWARD 128
 #include "opencv_hal_intrin_forward.hpp"
 #endif
index a4df685..3ee2d3f 100644 (file)
@@ -511,7 +511,7 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
 }
 #endif
 
-inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+static inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                                const v_float32x4& m1, const v_float32x4& m2,
                                const v_float32x4& a)
 {
@@ -2224,7 +2224,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
 #endif
 
 ////// FP16 support ///////
-// Currently disabled
+// Unsupported. Currently disabled.
 #if 0
 #if CV_FP16
 inline v_float32x4 v_load_expand(const float16_t* ptr)