Merge pull request #15257 from pmur:resize
author Paul Murphy <12972156+pmur@users.noreply.github.com>
Mon, 9 Dec 2019 11:54:06 +0000 (05:54 -0600)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
Mon, 9 Dec 2019 11:54:06 +0000 (14:54 +0300)
* resize: HResizeLinear reduce duplicate work

HResizeLinear's main loop is unrolled 2x against k, but k is only
incremented by 1 per iteration, so the unrolled body revisits the row
it has already handled. This results in k - 1 duplicate passes when
k > 1.
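
A minimal standalone sketch of the duplicate work (hypothetical C++,
not the OpenCV source; the real loop lives in HResizeLinear):

    #include <cstdio>

    int main()
    {
        const int count = 4;
        int passes[count] = {0};

        // Buggy stride: the body consumes rows k and k+1, but k only
        // advances by 1, so interior rows are visited twice.
        for( int k = 0; k <= count - 2; k++ )
        {
            passes[k]++;        // row k
            passes[k+1]++;      // row k+1, re-visited next iteration
        }
        for( int k = 0; k < count; k++ )
            printf("row %d processed %d time(s)\n", k, passes[k]);
        // Changing the stride to k += 2, as this patch does, gives
        // each row exactly one pass.
        return 0;
    }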

Likewise, the final scalar pass may redo work already done by the
vector loop. Start it at the offset returned by the vector op, if one
is implemented; see the sketch below. Note, no vector ops are
implemented today.
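
A sketch of the intended handoff, assuming the vector op reports how
many outputs it produced (vecOpStub is a hypothetical stand-in for the
HResizeLinearVec_* functors):

    #include <vector>
    #include <cstdio>

    // Hypothetical stand-in: handles full groups of 4 lanes and
    // returns the offset where the scalar tail should resume.
    static int vecOpStub(const float* S, float* D, int xmax)
    {
        int dx, len0 = xmax & -4;
        for( dx = 0; dx < len0; dx++ )
            D[dx] = S[dx] * 2.0f;       // pretend-vectorized body
        return dx;
    }

    int main()
    {
        const int xmax = 10;
        std::vector<float> S(xmax), D(xmax);
        for( int i = 0; i < xmax; i++ ) S[i] = (float)i;

        int dx0 = vecOpStub(S.data(), D.data(), xmax);
        for( int dx = dx0; dx < xmax; dx++ )  // was dx = 0: redoing
            D[dx] = S[dx] * 2.0f;             // the vector loop's work
        for( int dx = 0; dx < xmax; dx++ ) printf("%g ", D[dx]);
        printf("\n");
        return 0;
    }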

The gain is most noticeable on a linear downscale. A set of
performance tests is added to characterize this. The improvement is
10-50% depending on the scaling factor.

* imgproc: vectorize HResizeLinear

Performance is mostly gated by the gather operations for the x
inputs.

Likewise, provide a 2x unroll against k; this halves the number of
alpha gathers for larger k, as sketched below.
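
Condensed from the cn == 1 path added in this patch (a fragment; the
variables are as in the patch), showing where the halving comes from:
the alpha vectors are gathered once per dx and reused for both
unrolled rows.

    for( ; k <= count - 2; k += 2 )
    {
        for( dx = 0; dx < len0; dx += 8 )
        {
            v_int16x8 al = v_load(alpha+dx*2);      // gathered once
            v_int16x8 ah = v_load(alpha+dx*2+8);
            v_uint16x8 sl, sh;
            // row k
            v_expand(v_lut_pairs(S0, xofs+dx), sl, sh);
            v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
            v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
            // row k+1 reuses al/ah: no second alpha gather
            v_expand(v_lut_pairs(S1, xofs+dx), sl, sh);
            v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
            v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
        }
    }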

While not a 4x improvement, it still performs substantially better on
P9, about 1.4x. The P8 baseline is 1.05-1.10x due to its reduced VSX
instruction set.

For float types, this results in a more modest
1.2x improvement.

* Update U8 processing for non-bitexact linear resize

* core: hal: vsx: improve v_load_expand_q

With a little help, we can do this quickly without GPRs on all
VSX-enabled targets.
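
A usage sketch (assumes the OpenCV universal-intrinsics headers; the
semantics of v_load_expand_q are unchanged, only the codegen improves):

    // Widen 4 uchars to 4 uint32 lanes. With this patch the bytes are
    // loaded straight into a vector register via lxsiwzx and
    // zero-extended with a single vec_perm, with no GPR staging.
    uchar buf[4] = {1, 2, 3, 250};
    v_uint32x4 v = v_load_expand_q(buf);    // lanes: 1, 2, 3, 250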

* resize: Fix cn == 3 step per feedback

Per review feedback, ensure we don't overrun in the cn == 3 path; see
the check sketched below. This was caught via the failure observed in
Test_TensorFlow.inception_accuracy.
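
A small standalone check of the fixed bound (hypothetical values; in
the cn == 3 path step = 4 with a stride of 3*step/4): each vector
store writes D[dx .. dx+3], so capping the loop at len0 = xmax - step
keeps every lane in range.

    #include <cassert>

    int main()
    {
        const int step = 4;
        for( int xmax = step + 1; xmax <= 64; xmax++ )
        {
            const int len0 = xmax - step;           // the fixed bound
            for( int dx = 0; dx < len0; dx += 3*step/4 )
                assert(dx + (step - 1) < xmax);     // last lane written
        }
        return 0;
    }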

modules/core/include/opencv2/core/hal/intrin_vsx.hpp
modules/imgproc/perf/perf_resize.cpp
modules/imgproc/src/resize.cpp

modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index e4d13af..bda1d85 100644
@@ -346,11 +346,37 @@ OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh
 OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
 OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
 
+/* Load a 4-byte value and zero-extend it into the second dword; the first dword is don't-care. */
+#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
+#else
+    /* This is compiler-agnostic, but will introduce an unneeded splat on the critical path. */
+    #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
+#endif
+
 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
-{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
+{
+    // Zero-extend each byte by the extra 24 bits instead of unpacking; usually faster in a small kernel.
+    // Likewise note, the value is zero-extended and the upper 4 bytes are zeroed.
+    vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
+    vec_uchar16 out;
+
+    _LXSIWZX(out, ptr, vec_uchar16);
+    out = vec_perm(out, out, pmu);
+    return v_uint32x4((vec_uint4)out);
+}
 
 inline v_int32x4 v_load_expand_q(const schar* ptr)
-{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
+{
+    vec_char16 out;
+    vec_short8 outs;
+    vec_int4 outw;
+
+    _LXSIWZX(out, ptr, vec_char16);
+    outs = vec_unpackl(out);
+    outw = vec_unpackh(outs);
+    return v_int32x4(outw);
+}
 
 /* pack */
 #define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack)    \
modules/imgproc/perf/perf_resize.cpp
index 0705108..236955b 100644
@@ -7,6 +7,31 @@ namespace opencv_test {
 
 typedef tuple<MatType, Size, Size> MatInfo_Size_Size_t;
 typedef TestBaseWithParam<MatInfo_Size_Size_t> MatInfo_Size_Size;
+typedef tuple<Size,Size> Size_Size_t;
+typedef tuple<MatType, Size_Size_t> MatInfo_SizePair_t;
+typedef TestBaseWithParam<MatInfo_SizePair_t> MatInfo_SizePair;
+
+#define MATTYPE_NE_VALUES CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4,     \
+                          CV_16UC1, CV_16UC2, CV_16UC3, CV_16UC4, \
+                          CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4
+
+// For gradient-ish testing of the other matrix formats
+template<typename T>
+static void fillFPGradient(Mat& img)
+{
+    const int ch = img.channels();
+
+    int r, c, i;
+    for(r=0; r<img.rows; r++)
+    {
+        for(c=0; c<img.cols; c++)
+        {
+            T vals[] = {(T)r, (T)c, (T)(r*c), (T)(r*c/(r+c+1))};
+            T *p = (T*)img.ptr(r, c);
+            for(i=0; i<ch; i++) p[i] = (T)vals[i];
+        }
+    }
+}
 
 PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
             testing::Values(
@@ -38,6 +63,33 @@ PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
 #endif
 }
 
+PERF_TEST_P(MatInfo_SizePair, resizeUpLinearNonExact,
+            testing::Combine
+                (
+                testing::Values( MATTYPE_NE_VALUES ),
+                testing::Values( Size_Size_t(szVGA, szqHD), Size_Size_t(szVGA, sz720p) )
+                )
+             )
+{
+    int matType = get<0>(GetParam());
+    Size_Size_t sizes = get<1>(GetParam());
+    Size from = get<0>(sizes);
+    Size to = get<1>(sizes);
+
+    cv::Mat src(from, matType), dst(to, matType);
+    switch(src.depth())
+    {
+        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_32F: fillFPGradient<float>(src); break;
+    }
+    declare.in(src).out(dst);
+
+    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR);
+
+    SANITY_CHECK_NOTHING();
+}
+
 PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
             testing::Values(
                 MatInfo_Size_Size_t(CV_8UC1, szVGA, szQVGA),
@@ -80,6 +132,40 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
 #endif
 }
 
+PERF_TEST_P(MatInfo_SizePair, resizeDownLinearNonExact,
+            testing::Combine
+                (
+                testing::Values( MATTYPE_NE_VALUES ),
+                testing::Values
+                    (
+                    Size_Size_t(szVGA, szQVGA),
+                    Size_Size_t(szqHD, szVGA),
+                    Size_Size_t(sz720p, Size(120 * sz720p.width / sz720p.height, 120)),
+                    Size_Size_t(sz720p, szVGA),
+                    Size_Size_t(sz720p, szQVGA)
+                    )
+                )
+            )
+{
+    int matType = get<0>(GetParam());
+    Size_Size_t sizes = get<1>(GetParam());
+    Size from = get<0>(sizes);
+    Size to = get<1>(sizes);
+
+    cv::Mat src(from, matType), dst(to, matType);
+    switch(src.depth())
+    {
+        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_32F: fillFPGradient<float>(src); break;
+    }
+    declare.in(src).out(dst);
+
+    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR);
+
+    SANITY_CHECK_NOTHING();
+}
+
 
 typedef tuple<MatType, Size, int> MatInfo_Size_Scale_t;
 typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;
modules/imgproc/src/resize.cpp
index b0283e5..56f06ac 100644
@@ -1481,10 +1481,320 @@ typedef VResizeNoVec VResizeLanczos4Vec_32f;
 
 #endif
 
+#if CV_SIMD128
+
+template<typename ST, typename DT, typename AT, typename DVT>
+struct HResizeLinearVec_X4
+{
+    int operator()(const uchar** _src, uchar** _dst, int count, const int* xofs,
+        const uchar* _alpha, int, int, int cn, int, int xmax) const
+    {
+        const ST **src = (const ST**)_src;
+        const AT *alpha = (const AT*)_alpha;
+        DT **dst = (DT**)_dst;
+        const int nlanes = 4;
+        const int len0 = xmax & -nlanes;
+        int dx = 0, k = 0;
+
+        for( ; k <= (count - 2); k+=2 )
+        {
+            const ST *S0 = src[k];
+            DT *D0 = dst[k];
+            const ST *S1 = src[k+1];
+            DT *D1 = dst[k+1];
+
+            for( dx = 0; dx < len0; dx += nlanes )
+            {
+                int sx0 = xofs[dx+0];
+                int sx1 = xofs[dx+1];
+                int sx2 = xofs[dx+2];
+                int sx3 = xofs[dx+3];
+                DVT a_even;
+                DVT a_odd;
+
+                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
+                DVT s0(S0[sx0], S0[sx1], S0[sx2], S0[sx3]);
+                DVT s1(S0[sx0+cn], S0[sx1+cn], S0[sx2+cn], S0[sx3+cn]);
+                DVT s0_u(S1[sx0], S1[sx1], S1[sx2], S1[sx3]);
+                DVT s1_u(S1[sx0+cn], S1[sx1+cn], S1[sx2+cn], S1[sx3+cn]);
+                v_store(&D1[dx], s0_u * a_even + s1_u * a_odd);
+                v_store(&D0[dx], s0 * a_even + s1 * a_odd);
+            }
+        }
+        for( ; k < count; k++ )
+        {
+            const ST *S = src[k];
+            DT *D = dst[k];
+            for( dx = 0; dx < len0; dx += nlanes )
+            {
+                int sx0 = xofs[dx+0];
+                int sx1 = xofs[dx+1];
+                int sx2 = xofs[dx+2];
+                int sx3 = xofs[dx+3];
+                DVT a_even;
+                DVT a_odd;
+
+                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
+                DVT s0(S[sx0], S[sx1], S[sx2], S[sx3]);
+                DVT s1(S[sx0+cn], S[sx1+cn], S[sx2+cn], S[sx3+cn]);
+                v_store(&D[dx], s0 * a_even + s1 * a_odd);
+            }
+        }
+        return dx;
+    }
+};
+
+struct HResizeLinearVecU8_X4
+{
+    int operator()(const uchar** src, uchar** _dst, int count, const int* xofs,
+        const uchar* _alpha, int, int, int cn, int, int xmax) const
+    {
+        const short *alpha = (const short*)_alpha;
+        int **dst = (int**)_dst;
+        int dx = 0, k = 0;
+
+        if(cn == 1)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_lut_pairs(S0, xofs+dx), sl, sh);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                    v_expand(v_lut_pairs(S1, xofs+dx), sl, sh);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_lut_pairs(S, xofs+dx), sl, sh);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+        }
+        else if(cn == 2)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_interleave_pairs(v_lut_quads(S0, xofs+dx)), sl, sh);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                    v_expand(v_interleave_pairs(v_lut_quads(S1, xofs+dx)), sl, sh);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_interleave_pairs(v_lut_quads(S, xofs+dx)), sl, sh);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+        }
+        else if(cn == 3)
+        {
+            const int step = 4;
+            const int len0 = xmax - step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += 3*step/4 )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a));
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S1+xofs[dx]) | (v_load_expand_q(S1+xofs[dx]+cn)<<16)), a));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += 3*step/4 )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a));
+                }
+            }
+        }
+        else if(cn == 4)
+        {
+            const int step = 4;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S0+xofs[dx]))), a));
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S1+xofs[dx]))), a));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S+xofs[dx]))), a));
+                }
+            }
+        }
+        else if(cn < 9)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += cn )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_uint16x8 s0, s1;
+                    v_zip(v_load_expand(S0+xofs[dx]), v_load_expand(S0+xofs[dx]+cn), s0, s1);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                    v_zip(v_load_expand(S1+xofs[dx]), v_load_expand(S1+xofs[dx]+cn), s0, s1);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += cn )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_uint16x8 s0, s1;
+                    v_zip(v_load_expand(S+xofs[dx]), v_load_expand(S+xofs[dx]+cn), s0, s1);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                }
+            }
+        }
+        else
+        {
+            const int step = 16;
+            const int len0 = (xmax - cn) & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
+                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
+                    v_uint8x16 s01, s23;
+                    v_zip(v_lut(S0, xofs+dx), v_lut(S0+cn, xofs+dx), s01, s23);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D0[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D0[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                    v_zip(v_lut(S1, xofs+dx), v_lut(S1+cn, xofs+dx), s01, s23);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D1[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D1[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
+                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
+                    v_uint8x16 s01, s23;
+                    v_zip(v_lut(S, xofs+dx), v_lut(S+cn, xofs+dx), s01, s23);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                }
+            }
+        }
+        return dx;
+    }
+};
+
+typedef HResizeLinearVec_X4<float,float,float,v_float32x4> HResizeLinearVec_32f;
+typedef HResizeLinearVec_X4<ushort,float,float,v_float32x4> HResizeLinearVec_16u32f;
+typedef HResizeLinearVec_X4<short,float,float,v_float32x4> HResizeLinearVec_16s32f;
+typedef HResizeLinearVecU8_X4 HResizeLinearVec_8u32s;
+
+#else
+
 typedef HResizeNoVec HResizeLinearVec_8u32s;
 typedef HResizeNoVec HResizeLinearVec_16u32f;
 typedef HResizeNoVec HResizeLinearVec_16s32f;
 typedef HResizeNoVec HResizeLinearVec_32f;
+
+#endif
+
 typedef HResizeNoVec HResizeLinearVec_64f;
 
 
@@ -1505,7 +1815,7 @@ struct HResizeLinear
         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
 
-        for( k = 0; k <= count - 2; k++ )
+        for( k = 0; k <= count - 2; k+=2 )
         {
             const T *S0 = src[k], *S1 = src[k+1];
             WT *D0 = dst[k], *D1 = dst[k+1];
@@ -1529,7 +1839,7 @@ struct HResizeLinear
         {
             const T *S = src[k];
             WT *D = dst[k];
-            for( dx = 0; dx < xmax; dx++ )
+            for( dx = dx0; dx < xmax; dx++ )
             {
                 int sx = xofs[dx];
                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];