/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

namespace cv { namespace hal {

#if CV_NEON
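// On ARM NEON the interleaving itself is free: the vst2q/vst3q/vst4q store
// instructions write the lanes of a multi-vector register in interleaved
// order. Each VMergeN specialization below therefore just loads one full
// vector per source plane and issues a single interleaving store. No runtime
// capability check is needed, because CV_NEON is a compile-time property of
// the target.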
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;

#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>{                                                       \
        void operator()(const data_type* src0, const data_type* src1,             \
                        data_type* dst){                                          \
            reg_type r;                                                           \
            r.val[0] = load_func(src0);                                           \
            r.val[1] = load_func(src1);                                           \
            store_func(dst, r);                                                   \
        }                                                                         \
    }

#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>{                                                       \
        void operator()(const data_type* src0, const data_type* src1,             \
                        const data_type* src2, data_type* dst){                   \
            reg_type r;                                                           \
            r.val[0] = load_func(src0);                                           \
            r.val[1] = load_func(src1);                                           \
            r.val[2] = load_func(src2);                                           \
            store_func(dst, r);                                                   \
        }                                                                         \
    }

#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>{                                                       \
        void operator()(const data_type* src0, const data_type* src1,             \
                        const data_type* src2, const data_type* src3,             \
                        data_type* dst){                                          \
            reg_type r;                                                           \
            r.val[0] = load_func(src0);                                           \
            r.val[1] = load_func(src1);                                           \
            r.val[2] = load_func(src2);                                           \
            r.val[3] = load_func(src3);                                           \
            store_func(dst, r);                                                   \
        }                                                                         \
    }

MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );

MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );

MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );
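
// For reference, the first instantiation above expands (mechanically) to:
//
//     template<>
//     struct VMerge2<uchar>{
//         void operator()(const uchar* src0, const uchar* src1, uchar* dst){
//             uint8x16x2_t r;
//             r.val[0] = vld1q_u8(src0);  // 16 values of the first channel
//             r.val[1] = vld1q_u8(src1);  // 16 values of the second channel
//             vst2q_u8(dst, r);           // 32 bytes stored interleaved
//         }
//     };
//
// The int64 variants use the 64-bit d-register forms (vld1_s64/vst2_s64)
// and move only one element per plane per call.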

#elif CV_SSE2
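// On x86 there is no single interleaving store, so the strategy differs:
// the generic VMergeN<T> below is a no-op whose 'support' flag stays false,
// and per-type specializations (generated by the macros further down) load
// two 128-bit vectors per source plane, interleave them in registers with
// the _mm_interleave_* helpers, and store the shuffled vectors back out.
// merge_() consults the 'support' flag at runtime before taking this path.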

template <typename T>
struct VMerge2
{
    VMerge2() : support(false) { }
    void operator()(const T *, const T *, T *) const { }

    bool support;
};

template <typename T>
struct VMerge3
{
    VMerge3() : support(false) { }
    void operator()(const T *, const T *, const T *, T *) const { }

    bool support;
};

template <typename T>
struct VMerge4
{
    VMerge4() : support(false) { }
    void operator()(const T *, const T *, const T *, const T *, T *) const { }

    bool support;
};

#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge2<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge2()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(se);                                                \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1,                        \
                    data_type * dst) const                                                 \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
                                                                                           \
        _mm_interleave(v_src0, v_src1, v_src2, v_src3);                                    \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge3<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge3()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(se);                                                \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
                    data_type * dst) const                                                 \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
        reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                   \
        reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));    \
                                                                                           \
        _mm_interleave(v_src0, v_src1, v_src2,                                             \
                       v_src3, v_src4, v_src5);                                            \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);                \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge4<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge4()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(se);                                                \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1,                        \
                    const data_type * src2, const data_type * src3,                        \
                    data_type * dst) const                                                 \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
        reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                   \
        reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));    \
        reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3));                   \
        reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC));    \
                                                                                           \
        _mm_interleave(v_src0, v_src1, v_src2, v_src3,                                     \
                       v_src4, v_src5, v_src6, v_src7);                                    \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7);                \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}
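
// Each specialized operator() consumes ELEMS_IN_VEC * 2 = 32 / sizeof(T)
// elements from every source plane per call; this is the same quantity the
// merge_() loop below advances by on this path (inc_i = 32/sizeof(T)).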

MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);

#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif
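
// The 16-bit kernels are gated twice: compiled only when the build enables
// SSE4.1 (CV_SSE4_1), and taken at runtime only when checkHardwareSupport
// reports CV_CPU_SSE4_1 -- presumably because _mm_interleave_epi16 relies
// on SSE4.1 instructions internally. Without both, ushort merging falls
// back to the scalar loop.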

MERGE2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
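
// Note that the 32-bit integer kernels reuse the float loads and stores
// (_mm_loadu_ps/_mm_storeu_ps) through a reinterpreting cast: no arithmetic
// is performed on the lanes, so only the bit patterns need to move.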

#endif

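// Generic scalar merge with optional SIMD fast paths. The channel count is
// split as k = ((cn - 1) % 4) + 1: the first k channels (1..4) are handled
// by a dedicated branch below, and any remaining channels are copied in
// groups of exactly four by the trailing loop. For example, cn == 7 gives
// k == 3, so channels 0..2 go through the 3-channel branch and channels
// 3..6 through one iteration of the group-of-four loop. The SIMD kernels
// are used only when cn matches the branch width exactly (cn == 2, 3, 4),
// i.e. when the interleaved vector stores line up with the destination
// layout.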
template<typename T> static void
merge_( const T** src, T* dst, int len, int cn )
{
    int k = cn % 4 ? cn % 4 : 4;
    int i, j;
    if( k == 1 )
    {
        const T* src0 = src[0];
        for( i = j = 0; i < len; i++, j += cn )
            dst[j] = src0[i];
    }
    else if( k == 2 )
    {
        const T *src0 = src[0], *src1 = src[1];
        i = j = 0;
#if CV_NEON
        if(cn == 2)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 2)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
        }
    }
    else if( k == 3 )
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
        i = j = 0;
#if CV_NEON
        if(cn == 3)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 3)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
            dst[j+2] = src2[i];
        }
    }
    else
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
        i = j = 0;
#if CV_NEON
        if(cn == 4)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 4)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }

    for( ; k < cn; k += 4 )
    {
        const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
        for( i = 0, j = k; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }
}

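/* A minimal usage sketch (an illustration with assumed sizes, not code from
   this file): these kernels are normally reached through the public
   cv::merge API rather than called directly.

       std::vector<cv::Mat> planes(3, cv::Mat::zeros(480, 640, CV_8UC1));
       cv::Mat bgr;
       cv::merge(planes, bgr);   // bgr becomes CV_8UC3; the pixel data is
                                 // interleaved by merge8u below

   cv::merge checks that all inputs share the same size and depth, allocates
   the multi-channel destination, and dispatches to the matching
   depth-specific wrapper below. */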
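
// Each exported wrapper first offers the job to an external HAL replacement
// via CALL_HAL (e.g. cv_hal_merge8u); if the plugged-in implementation
// reports success, the wrapper returns immediately, otherwise the generic
// merge_() template above is used.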
void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
    CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
    merge_(src, dst, len, cn);
}

void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
    CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
    merge_(src, dst, len, cn);
}

void merge32s(const int** src, int* dst, int len, int cn )
{
    CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
    merge_(src, dst, len, cn);
}

void merge64s(const int64** src, int64* dst, int len, int cn )
{
    CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
    merge_(src, dst, len, cn);
}

}}