Merge pull request #16093 from alalek:core_itt_thread_name_16072
author Alexander Alekhin <alexander.a.alekhin@gmail.com>
Mon, 9 Dec 2019 18:29:53 +0000 (18:29 +0000)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
Mon, 9 Dec 2019 18:29:53 +0000 (18:29 +0000)
LICENSE
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
modules/dnn/src/layers/detection_output_layer.cpp
modules/imgproc/perf/perf_resize.cpp
modules/imgproc/src/resize.cpp
modules/imgproc/src/thresh.cpp
modules/imgproc/test/test_thresh.cpp

diff --git a/LICENSE b/LICENSE
index be57671..aeb19a4 100644
--- a/LICENSE
+++ b/LICENSE
@@ -13,6 +13,7 @@ Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
 Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
 Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved.
 Copyright (C) 2015-2016, Itseez Inc., all rights reserved.
+Copyright (C) 2019, Xperience AI, all rights reserved.
 Third party copyrights are property of their respective owners.
 
 Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index e4d13af..bda1d85 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -346,11 +346,37 @@ OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh
 OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
 OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
 
+/* Load and zero-expand a 4-byte value into the second doubleword; the first doubleword is don't-care. */
+#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
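+    /* A single lxsiwzx loads the word straight into a VSR, already zero-extended. */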
+    #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
+#else
+    /* This is compiler-agnostic, but will introduce an unneeded splat on the critical path. */
+    #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
+#endif
+
 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
-{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
+{
+    // Zero-extend each byte to 32 bits with one permute instead of a chain of
+    // unpacks; usually faster in a small kernel like this. Either _LXSIWZX
+    // variant zero-extends the loaded value, so byte 12 of the register is
+    // known to be zero and supplies the zero fill in the permute map below.
+    vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
+    vec_uchar16 out;
+
+    _LXSIWZX(out, ptr, vec_uchar16);
+    out = vec_perm(out, out, pmu);
+    return v_uint32x4((vec_uint4)out);
+}
 
 inline v_int32x4 v_load_expand_q(const schar* ptr)
-{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
+{
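+    // The four bytes land in one doubleword (see _LXSIWZX); two sign-extending
+    // unpacks then widen them: chars -> shorts -> ints.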
+    vec_char16 out;
+    vec_short8 outs;
+    vec_int4 outw;
+
+    _LXSIWZX(out, ptr, vec_char16);
+    outs = vec_unpackl(out);
+    outw = vec_unpackh(outs);
+    return v_int32x4(outw);
+}
 
 /* pack */
 #define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack)    \
diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp
index d629a71..d391e01 100644
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
 #ifdef HAVE_DNN_NGRAPH
 #include "../ie_ngraph.hpp"
 #include <ngraph/op/experimental/layers/detection_output.hpp>
-
-namespace ngraph {
-namespace op {
-
-class Dummy : public Op {
-public:
-    Dummy() : Op("Dummy", {}) {
-        constructor_validate_and_infer_types();
-    }
-
-    void validate_and_infer_types() override {
-        set_output_type(0, ngraph::element::Type(), {});
-    }
-
-    std::shared_ptr<Node> copy_with_new_args(const NodeVector& new_args) const override {
-        if (!new_args.empty())
-            throw ngraph_error("Incorrect number of new arguments");
-        return std::make_shared<Dummy>();
-    }
-
-    static constexpr NodeTypeInfo type_info{"Dummy", 1};
-    const NodeTypeInfo& get_type_info() const override {
-        return type_info;
-    }
-};
-
-constexpr NodeTypeInfo Dummy::type_info;
-
-}  // namespace op
-}  // namespace ngraph
 #endif
 
 namespace cv
@@ -1000,10 +970,8 @@ public:
         attrs.code_type                  = std::string{"caffe.PriorBoxParameter." + _codeType};
         attrs.normalized                 = true;
 
-        auto aux_class_preds = std::make_shared<ngraph::op::Dummy>();
-        auto aux_box_preds   = std::make_shared<ngraph::op::Dummy>();
         auto det_out = std::make_shared<ngraph::op::DetectionOutput>(box_logits, class_preds,
-                       proposals, aux_class_preds, aux_box_preds, attrs);
+                       proposals, attrs);
         return Ptr<BackendNode>(new InfEngineNgraphNode(det_out));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp
index 0705108..236955b 100644
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@@ -7,6 +7,31 @@ namespace opencv_test {
 
 typedef tuple<MatType, Size, Size> MatInfo_Size_Size_t;
 typedef TestBaseWithParam<MatInfo_Size_Size_t> MatInfo_Size_Size;
+typedef tuple<Size,Size> Size_Size_t;
+typedef tuple<MatType, Size_Size_t> MatInfo_SizePair_t;
+typedef TestBaseWithParam<MatInfo_SizePair_t> MatInfo_SizePair;
+
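+// Mat types exercised by the resizeUp/DownLinearNonExact benchmarks below.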
+#define MATTYPE_NE_VALUES CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4,     \
+                          CV_16UC1, CV_16UC2, CV_16UC3, CV_16UC4, \
+                          CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4
+
+// Gradient-style fill for the non-CV_8U mat depths used by these benchmarks
+template<typename T>
+static void fillFPGradient(Mat& img)
+{
+    const int ch = img.channels();
+
+    int r, c, i;
+    for(r=0; r<img.rows; r++)
+    {
+        for(c=0; c<img.cols; c++)
+        {
+            T vals[] = {(T)r, (T)c, (T)(r*c), (T)(r*c/(r+c+1))};
+            T *p = (T*)img.ptr(r, c);
+            for(i=0; i<ch; i++) p[i] = (T)vals[i];
+        }
+    }
+}
 
 PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
             testing::Values(
@@ -38,6 +63,33 @@ PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
 #endif
 }
 
+PERF_TEST_P(MatInfo_SizePair, resizeUpLinearNonExact,
+            testing::Combine
+                (
+                testing::Values( MATTYPE_NE_VALUES ),
+                testing::Values( Size_Size_t(szVGA, szqHD), Size_Size_t(szVGA, sz720p) )
+                )
+             )
+{
+    int matType = get<0>(GetParam());
+    Size_Size_t sizes = get<1>(GetParam());
+    Size from = get<0>(sizes);
+    Size to = get<1>(sizes);
+
+    cv::Mat src(from, matType), dst(to, matType);
+    switch(src.depth())
+    {
+        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_32F: fillFPGradient<float>(src); break;
+    }
+    declare.in(src).out(dst);
+
+    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR);
+
+    SANITY_CHECK_NOTHING();
+}
+
 PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
             testing::Values(
                 MatInfo_Size_Size_t(CV_8UC1, szVGA, szQVGA),
@@ -80,6 +132,40 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
 #endif
 }
 
+PERF_TEST_P(MatInfo_SizePair, resizeDownLinearNonExact,
+            testing::Combine
+                (
+                testing::Values( MATTYPE_NE_VALUES ),
+                testing::Values
+                    (
+                    Size_Size_t(szVGA, szQVGA),
+                    Size_Size_t(szqHD, szVGA),
+                    Size_Size_t(sz720p, Size(120 * sz720p.width / sz720p.height, 120)),
+                    Size_Size_t(sz720p, szVGA),
+                    Size_Size_t(sz720p, szQVGA)
+                    )
+                )
+            )
+{
+    int matType = get<0>(GetParam());
+    Size_Size_t sizes = get<1>(GetParam());
+    Size from = get<0>(sizes);
+    Size to = get<1>(sizes);
+
+    cv::Mat src(from, matType), dst(to, matType);
+    switch(src.depth())
+    {
+        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_32F: fillFPGradient<float>(src); break;
+    }
+    declare.in(src).out(dst);
+
+    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR);
+
+    SANITY_CHECK_NOTHING();
+}
+
 
 typedef tuple<MatType, Size, int> MatInfo_Size_Scale_t;
 typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index b0283e5..56f06ac 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -1481,10 +1481,320 @@ typedef VResizeNoVec VResizeLanczos4Vec_32f;
 
 #endif
 
+#if CV_SIMD128
+
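+// Horizontal linear interpolation for float-accumulating types: each iteration
+// gathers four (S[sx], S[sx+cn]) source pairs through xofs and weights them
+// with the deinterleaved even/odd alpha coefficients; two rows are processed
+// per outer step to reuse the coefficient loads.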
+template<typename ST, typename DT, typename AT, typename DVT>
+struct HResizeLinearVec_X4
+{
+    int operator()(const uchar** _src, uchar** _dst, int count, const int* xofs,
+        const uchar* _alpha, int, int, int cn, int, int xmax) const
+    {
+        const ST **src = (const ST**)_src;
+        const AT *alpha = (const AT*)_alpha;
+        DT **dst = (DT**)_dst;
+        const int nlanes = 4;
+        const int len0 = xmax & -nlanes;
+        int dx = 0, k = 0;
+
+        for( ; k <= (count - 2); k+=2 )
+        {
+            const ST *S0 = src[k];
+            DT *D0 = dst[k];
+            const ST *S1 = src[k+1];
+            DT *D1 = dst[k+1];
+
+            for( dx = 0; dx < len0; dx += nlanes )
+            {
+                int sx0 = xofs[dx+0];
+                int sx1 = xofs[dx+1];
+                int sx2 = xofs[dx+2];
+                int sx3 = xofs[dx+3];
+                DVT a_even;
+                DVT a_odd;
+
+                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
+                DVT s0(S0[sx0], S0[sx1], S0[sx2], S0[sx3]);
+                DVT s1(S0[sx0+cn], S0[sx1+cn], S0[sx2+cn], S0[sx3+cn]);
+                DVT s0_u(S1[sx0], S1[sx1], S1[sx2], S1[sx3]);
+                DVT s1_u(S1[sx0+cn], S1[sx1+cn], S1[sx2+cn], S1[sx3+cn]);
+                v_store(&D1[dx], s0_u * a_even + s1_u * a_odd);
+                v_store(&D0[dx], s0 * a_even + s1 * a_odd);
+            }
+        }
+        for( ; k < count; k++ )
+        {
+            const ST *S = src[k];
+            DT *D = dst[k];
+            for( dx = 0; dx < len0; dx += nlanes )
+            {
+                int sx0 = xofs[dx+0];
+                int sx1 = xofs[dx+1];
+                int sx2 = xofs[dx+2];
+                int sx3 = xofs[dx+3];
+                DVT a_even;
+                DVT a_odd;
+
+                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
+                DVT s0(S[sx0], S[sx1], S[sx2], S[sx3]);
+                DVT s1(S[sx0+cn], S[sx1+cn], S[sx2+cn], S[sx3+cn]);
+                v_store(&D[dx], s0 * a_even + s1 * a_odd);
+            }
+        }
+        return dx;
+    }
+};
+
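+// 8-bit specialization: source pairs are widened to 16 bits so v_dotprod can
+// fuse the two multiplies and the add into one step; the gather strategy below
+// is chosen per channel count, since xofs entries are channel-interleaved.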
+struct HResizeLinearVecU8_X4
+{
+    int operator()(const uchar** src, uchar** _dst, int count, const int* xofs,
+        const uchar* _alpha, int, int, int cn, int, int xmax) const
+    {
+        const short *alpha = (const short*)_alpha;
+        int **dst = (int**)_dst;
+        int dx = 0, k = 0;
+
+        if(cn == 1)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
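+            // cn == 1: the two taps are adjacent bytes, so v_lut_pairs fetches
+            // each (S[sx], S[sx+1]) pair in one gather.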
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_lut_pairs(S0, xofs+dx), sl, sh);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                    v_expand(v_lut_pairs(S1, xofs+dx), sl, sh);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_lut_pairs(S, xofs+dx), sl, sh);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+        }
+        else if(cn == 2)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
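+            // cn == 2: a quad at xofs[dx] holds both channels of both taps;
+            // v_interleave_pairs reorders it into tap-interleaved order for
+            // the dot product.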
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_interleave_pairs(v_lut_quads(S0, xofs+dx)), sl, sh);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                    v_expand(v_interleave_pairs(v_lut_quads(S1, xofs+dx)), sl, sh);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_interleave_pairs(v_lut_quads(S, xofs+dx)), sl, sh);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+        }
+        else if(cn == 3)
+        {
+            const int step = 4;
+            const int len0 = xmax - step;
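+            // Each iteration stores 4 lanes but advances dx by only 3 (one
+            // 3-channel pixel), so keep a one-step margin instead of rounding
+            // xmax down to a multiple of the step.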
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += 3*step/4 )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a));
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S1+xofs[dx]) | (v_load_expand_q(S1+xofs[dx]+cn)<<16)), a));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += 3*step/4 )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a));
+                }
+            }
+        }
+        else if(cn == 4)
+        {
+            const int step = 4;
+            const int len0 = xmax & -step;
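+            // cn == 4: v_load_expand reads both taps (8 bytes) at once and
+            // v_interleave_quads puts the lanes into tap-interleaved order.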
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S0+xofs[dx]))), a));
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S1+xofs[dx]))), a));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S+xofs[dx]))), a));
+                }
+            }
+        }
+        else if(cn < 9)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
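+            // 5..8 channels: one iteration covers all channels of one output
+            // pixel, so dx advances by cn rather than by the vector step.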
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += cn )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_uint16x8 s0, s1;
+                    v_zip(v_load_expand(S0+xofs[dx]), v_load_expand(S0+xofs[dx]+cn), s0, s1);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                    v_zip(v_load_expand(S1+xofs[dx]), v_load_expand(S1+xofs[dx]+cn), s0, s1);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += cn )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_uint16x8 s0, s1;
+                    v_zip(v_load_expand(S+xofs[dx]), v_load_expand(S+xofs[dx]+cn), s0, s1);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                }
+            }
+        }
+        else
+        {
+            const int step = 16;
+            const int len0 = (xmax - cn) & -step;
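+            // cn >= 9: gather the 16 bytes of each tap one by one with v_lut;
+            // len0 backs off by cn so the second tap (base S + cn) stays in range.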
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
+                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
+                    v_uint8x16 s01, s23;
+                    v_zip(v_lut(S0, xofs+dx), v_lut(S0+cn, xofs+dx), s01, s23);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D0[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D0[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                    v_zip(v_lut(S1, xofs+dx), v_lut(S1+cn, xofs+dx), s01, s23);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D1[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D1[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
+                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
+                    v_uint8x16 s01, s23;
+                    v_zip(v_lut(S, xofs+dx), v_lut(S+cn, xofs+dx), s01, s23);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                }
+            }
+        }
+        return dx;
+    }
+};
+
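+// Bind the vector kernels to the element types HResizeLinear dispatches on;
+// everything else (e.g. 64f below) stays on the scalar HResizeNoVec path.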
+typedef HResizeLinearVec_X4<float,float,float,v_float32x4> HResizeLinearVec_32f;
+typedef HResizeLinearVec_X4<ushort,float,float,v_float32x4> HResizeLinearVec_16u32f;
+typedef HResizeLinearVec_X4<short,float,float,v_float32x4> HResizeLinearVec_16s32f;
+typedef HResizeLinearVecU8_X4 HResizeLinearVec_8u32s;
+
+#else
+
 typedef HResizeNoVec HResizeLinearVec_8u32s;
 typedef HResizeNoVec HResizeLinearVec_16u32f;
 typedef HResizeNoVec HResizeLinearVec_16s32f;
 typedef HResizeNoVec HResizeLinearVec_32f;
+
+#endif
+
 typedef HResizeNoVec HResizeLinearVec_64f;
 
 
@@ -1505,7 +1815,7 @@ struct HResizeLinear
         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
 
-        for( k = 0; k <= count - 2; k++ )
+        for( k = 0; k <= count - 2; k+=2 )
         {
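+            // the body consumes two rows (S0/S1, D0/D1), hence the step of 2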
             const T *S0 = src[k], *S1 = src[k+1];
             WT *D0 = dst[k], *D1 = dst[k+1];
@@ -1529,7 +1839,7 @@ struct HResizeLinear
         {
             const T *S = src[k];
             WT *D = dst[k];
-            for( dx = 0; dx < xmax; dx++ )
+            for( dx = dx0; dx < xmax; dx++ )
             {
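+                // dx0 is where the vectorized pass (vecOp) stopped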
                 int sx = xofs[dx];
                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp
index 466b0a8..2e6690e 100644
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -774,6 +774,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
             }
             setIppErrorStatus();
             break;
+#if 0  // details: https://github.com/opencv/opencv/pull/16085
         case THRESH_TOZERO:
             if (0 <= CV_INSTRUMENT_FUN_IPP(ippiThreshold_LTVal_32f_C1R, src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh + FLT_EPSILON, 0))
             {
@@ -782,6 +783,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
             }
             setIppErrorStatus();
             break;
+#endif
         case THRESH_TOZERO_INV:
             if (0 <= CV_INSTRUMENT_FUN_IPP(ippiThreshold_GTVal_32f_C1R, src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
             {
diff --git a/modules/imgproc/test/test_thresh.cpp b/modules/imgproc/test/test_thresh.cpp
index e9bed8c..a61095d 100644
--- a/modules/imgproc/test/test_thresh.cpp
+++ b/modules/imgproc/test/test_thresh.cpp
@@ -434,4 +434,13 @@ BIGDATA_TEST(Imgproc_Threshold, huge)
     ASSERT_EQ((uint64)nz, n / 2);
 }
 
+TEST(Imgproc_Threshold, regression_THRESH_TOZERO_IPP_16085)
+{
+    Size sz(16, 16);
+    Mat input(sz, CV_32F, Scalar::all(2));
+    Mat result;
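+    // src values equal to the threshold must become zero under THRESH_TOZERO;
+    // this boundary case is what the IPP path disabled via #16085 got wrong.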
+    cv::threshold(input, result, 2.0, 0.0, THRESH_TOZERO);
+    EXPECT_EQ(0, cv::norm(result, NORM_INF));
+}
+
 }} // namespace