modules/core/src/merge.cpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html
   4
   5
   6 #include "precomp.hpp"
   7 #include "opencl_kernels_core.hpp"
   8
   9 namespace cv { namespace hal {
  10
  11 #if CV_SIMD
  12 /*
  13   The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
  14   on IA there are instructions movntps and such to which
  15   v_store_interleave(...., STORE_ALIGNED_NOCACHE) is mapped.
  16   Those instructions write directly into memory w/o touching cache
  17   that results in dramatic speed improvements, especially on
  18   large arrays (FullHD, 4K etc.).
  19
  20   Those intrinsics require the destination address to be aligned
  21   by 16/32 bits (with SSE2 and AVX2, respectively).
  22   So we potentially split the processing into 3 stages:
  23   1) the optional prefix part [0:i0), where we use simple unaligned stores.
  24   2) the optional main part [i0:len - VECSZ], where we use "nocache" mode.
  25      But in some cases we have to use unaligned stores in this part.
  26   3) the optional suffix part (the tail) (len - VECSZ:len) where we switch back to "unaligned" mode
  27      to process the remaining len - VECSZ elements.
  28   In principle there can be very poorly aligned data where there is no main part.
  29   For that we set i0=0 and use unaligned stores for the whole array.
  30 */
  31 template<typename T, typename VecT> static void
  32 vecmerge_( const T** src, T* dst, int len, int cn )
  33 {
  34     const int VECSZ = VecT::nlanes;
  35     int i, i0 = 0;
  36     const T* src0 = src[0];
  37     const T* src1 = src[1];
  38
  39     int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T)));
  40     hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
  41     if( r != 0 )
  42     {
  43         mode = hal::STORE_UNALIGNED;
  44         if( r % cn == 0 && len > VECSZ )
  45             i0 = VECSZ - (r / cn);
  46     }
  47
  48     if( cn == 2 )
  49     {
  50         for( i = 0; i < len; i += VECSZ )
  51         {
  52             if( i > len - VECSZ )
  53             {
  54                 i = len - VECSZ;
  55                 mode = hal::STORE_UNALIGNED;
  56             }
  57             VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
  58             v_store_interleave(dst + i*cn, a, b, mode);
  59             if( i < i0 )
  60             {
  61                 i = i0 - VECSZ;
  62                 mode = hal::STORE_ALIGNED_NOCACHE;
  63             }
  64         }
  65     }
  66     else if( cn == 3 )
  67     {
  68         const T* src2 = src[2];
  69         for( i = 0; i < len; i += VECSZ )
  70         {
  71             if( i > len - VECSZ )
  72             {
  73                 i = len - VECSZ;
  74                 mode = hal::STORE_UNALIGNED;
  75             }
  76             VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
  77             v_store_interleave(dst + i*cn, a, b, c, mode);
  78             if( i < i0 )
  79             {
  80                 i = i0 - VECSZ;
  81                 mode = hal::STORE_ALIGNED_NOCACHE;
  82             }
  83         }
  84     }
  85     else
  86     {
  87         CV_Assert( cn == 4 );
  88         const T* src2 = src[2];
  89         const T* src3 = src[3];
  90         for( i = 0; i < len; i += VECSZ )
  91         {
  92             if( i > len - VECSZ )
  93             {
  94                 i = len - VECSZ;
  95                 mode = hal::STORE_UNALIGNED;
  96             }
  97             VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
  98             VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
  99             v_store_interleave(dst + i*cn, a, b, c, d, mode);
 100             if( i < i0 )
 101             {
 102                 i = i0 - VECSZ;
 103                 mode = hal::STORE_ALIGNED_NOCACHE;
 104             }
 105         }
 106     }
 107     vx_cleanup();
 108 }
 109 #endif
 110
 111 template<typename T> static void
 112 merge_( const T** src, T* dst, int len, int cn )
 113 {
 114     int k = cn % 4 ? cn % 4 : 4;
 115     int i, j;
 116     if( k == 1 )
 117     {
 118         const T* src0 = src[0];
 119         for( i = j = 0; i < len; i++, j += cn )
 120             dst[j] = src0[i];
 121     }
 122     else if( k == 2 )
 123     {
 124         const T *src0 = src[0], *src1 = src[1];
 125         i = j = 0;
 126         for( ; i < len; i++, j += cn )
 127         {
 128             dst[j] = src0[i];
 129             dst[j+1] = src1[i];
 130         }
 131     }
 132     else if( k == 3 )
 133     {
 134         const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
 135         i = j = 0;
 136         for( ; i < len; i++, j += cn )
 137         {
 138             dst[j] = src0[i];
 139             dst[j+1] = src1[i];
 140             dst[j+2] = src2[i];
 141         }
 142     }
 143     else
 144     {
 145         const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
 146         i = j = 0;
 147         for( ; i < len; i++, j += cn )
 148         {
 149             dst[j] = src0[i]; dst[j+1] = src1[i];
 150             dst[j+2] = src2[i]; dst[j+3] = src3[i];
 151         }
 152     }
 153
 154     for( ; k < cn; k += 4 )
 155     {
 156         const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
 157         for( i = 0, j = k; i < len; i++, j += cn )
 158         {
 159             dst[j] = src0[i]; dst[j+1] = src1[i];
 160             dst[j+2] = src2[i]; dst[j+3] = src3[i];
 161         }
 162     }
 163 }
 164
 165 void merge8u(const uchar** src, uchar* dst, int len, int cn )
 166 {
 167     CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
 168 #if CV_SIMD
 169     if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
 170         vecmerge_<uchar, v_uint8>(src, dst, len, cn);
 171     else
 172 #endif
 173         merge_(src, dst, len, cn);
 174 }
 175
 176 void merge16u(const ushort** src, ushort* dst, int len, int cn )
 177 {
 178     CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
 179 #if CV_SIMD
 180     if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
 181         vecmerge_<ushort, v_uint16>(src, dst, len, cn);
 182     else
 183 #endif
 184         merge_(src, dst, len, cn);
 185 }
 186
 187 void merge32s(const int** src, int* dst, int len, int cn )
 188 {
 189     CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
 190 #if CV_SIMD
 191     if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
 192         vecmerge_<int, v_int32>(src, dst, len, cn);
 193     else
 194 #endif
 195         merge_(src, dst, len, cn);
 196 }
 197
 198 void merge64s(const int64** src, int64* dst, int len, int cn )
 199 {
 200     CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
 201 #if CV_SIMD
 202     if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
 203         vecmerge_<int64, v_int64>(src, dst, len, cn);
 204     else
 205 #endif
 206         merge_(src, dst, len, cn);
 207 }
 208
 209 }} // cv::hal::
 210
 211
 212 typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);
 213
 214 static MergeFunc getMergeFunc(int depth)
 215 {
 216     static MergeFunc mergeTab[] =
 217     {
 218         (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
 219         (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0
 220     };
 221
 222     return mergeTab[depth];
 223 }
 224
 225 #ifdef HAVE_IPP
 226
 227 namespace cv {
 228 static bool ipp_merge(const Mat* mv, Mat& dst, int channels)
 229 {
 230 #ifdef HAVE_IPP_IW
 231     CV_INSTRUMENT_REGION_IPP()
 232
 233     if(channels != 3 && channels != 4)
 234         return false;
 235
 236     if(mv[0].dims <= 2)
 237     {
 238         IppiSize    size       = ippiSize(mv[0].size());
 239         const void *srcPtrs[4] = {NULL};
 240         size_t      srcStep    = mv[0].step;
 241         for(int i = 0; i < channels; i++)
 242         {
 243             srcPtrs[i] = mv[i].ptr();
 244             if(srcStep != mv[i].step)
 245                 return false;
 246         }
 247
 248         return CV_INSTRUMENT_FUN_IPP(llwiCopyMerge, srcPtrs, (int)srcStep, dst.ptr(), (int)dst.step, size, (int)mv[0].elemSize1(), channels, 0) >= 0;
 249     }
 250     else
 251     {
 252         const Mat *arrays[5] = {NULL};
 253         uchar     *ptrs[5]   = {NULL};
 254         arrays[0] = &dst;
 255
 256         for(int i = 1; i < channels; i++)
 257         {
 258             arrays[i] = &mv[i-1];
 259         }
 260
 261         NAryMatIterator it(arrays, ptrs);
 262         IppiSize size = { (int)it.size, 1 };
 263
 264         for( size_t i = 0; i < it.nplanes; i++, ++it )
 265         {
 266             if(CV_INSTRUMENT_FUN_IPP(llwiCopyMerge, (const void**)&ptrs[1], 0, ptrs[0], 0, size, (int)mv[0].elemSize1(), channels, 0) < 0)
 267                 return false;
 268         }
 269         return true;
 270     }
 271 #else
 272     CV_UNUSED(dst); CV_UNUSED(mv); CV_UNUSED(channels);
 273     return false;
 274 #endif
 275 }
 276 }
 277 #endif
 278
 279 void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
 280 {
 281     CV_INSTRUMENT_REGION()
 282
 283     CV_Assert( mv && n > 0 );
 284
 285     int depth = mv[0].depth();
 286     bool allch1 = true;
 287     int k, cn = 0;
 288     size_t i;
 289
 290     for( i = 0; i < n; i++ )
 291     {
 292         CV_Assert(mv[i].size == mv[0].size && mv[i].depth() == depth);
 293         allch1 = allch1 && mv[i].channels() == 1;
 294         cn += mv[i].channels();
 295     }
 296
 297     CV_Assert( 0 < cn && cn <= CV_CN_MAX );
 298     _dst.create(mv[0].dims, mv[0].size, CV_MAKETYPE(depth, cn));
 299     Mat dst = _dst.getMat();
 300
 301     if( n == 1 )
 302     {
 303         mv[0].copyTo(dst);
 304         return;
 305     }
 306
 307     CV_IPP_RUN_FAST(ipp_merge(mv, dst, (int)n));
 308
 309     if( !allch1 )
 310     {
 311         AutoBuffer<int> pairs(cn*2);
 312         int j, ni=0;
 313
 314         for( i = 0, j = 0; i < n; i++, j += ni )
 315         {
 316             ni = mv[i].channels();
 317             for( k = 0; k < ni; k++ )
 318             {
 319                 pairs[(j+k)*2] = j + k;
 320                 pairs[(j+k)*2+1] = j + k;
 321             }
 322         }
 323         mixChannels( mv, n, &dst, 1, &pairs[0], cn );
 324         return;
 325     }
 326
 327     MergeFunc func = getMergeFunc(depth);
 328     CV_Assert( func != 0 );
 329
 330     size_t esz = dst.elemSize(), esz1 = dst.elemSize1();
 331     size_t blocksize0 = (int)((BLOCK_SIZE + esz-1)/esz);
 332     AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
 333     const Mat** arrays = (const Mat**)_buf.data();
 334     uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
 335
 336     arrays[0] = &dst;
 337     for( k = 0; k < cn; k++ )
 338         arrays[k+1] = &mv[k];
 339
 340     NAryMatIterator it(arrays, ptrs, cn+1);
 341     size_t total = (int)it.size;
 342     size_t blocksize = std::min((size_t)CV_SPLIT_MERGE_MAX_BLOCK_SIZE(cn), cn <= 4 ? total : std::min(total, blocksize0));
 343
 344     for( i = 0; i < it.nplanes; i++, ++it )
 345     {
 346         for( size_t j = 0; j < total; j += blocksize )
 347         {
 348             size_t bsz = std::min(total - j, blocksize);
 349             func( (const uchar**)&ptrs[1], ptrs[0], (int)bsz, cn );
 350
 351             if( j + blocksize < total )
 352             {
 353                 ptrs[0] += bsz*esz;
 354                 for( int t = 0; t < cn; t++ )
 355                     ptrs[t+1] += bsz*esz1;
 356             }
 357         }
 358     }
 359 }
 360
 361 #ifdef HAVE_OPENCL
 362
 363 namespace cv {
 364
 365 static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
 366 {
 367     std::vector<UMat> src, ksrc;
 368     _mv.getUMatVector(src);
 369     CV_Assert(!src.empty());
 370
 371     int type = src[0].type(), depth = CV_MAT_DEPTH(type),
 372             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
 373     Size size = src[0].size();
 374
 375     for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
 376     {
 377         int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
 378                 esz1 = CV_ELEM_SIZE1(idepth);
 379         if (src[i].dims > 2)
 380             return false;
 381
 382         CV_Assert(size == src[i].size() && depth == idepth);
 383
 384         for (int cn = 0; cn < icn; ++cn)
 385         {
 386             UMat tsrc = src[i];
 387             tsrc.offset += cn * esz1;
 388             ksrc.push_back(tsrc);
 389         }
 390     }
 391     int dcn = (int)ksrc.size();
 392
 393     String srcargs, processelem, cndecl, indexdecl;
 394     for (int i = 0; i < dcn; ++i)
 395     {
 396         srcargs += format("DECLARE_SRC_PARAM(%d)", i);
 397         processelem += format("PROCESS_ELEM(%d)", i);
 398         indexdecl += format("DECLARE_INDEX(%d)", i);
 399         cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
 400     }
 401
 402     ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
 403                   format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
 404                          " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
 405                          dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
 406                          indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
 407     if (k.empty())
 408         return false;
 409
 410     _dst.create(size, CV_MAKE_TYPE(depth, dcn));
 411     UMat dst = _dst.getUMat();
 412
 413     int argidx = 0;
 414     for (int i = 0; i < dcn; ++i)
 415         argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
 416     argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
 417     k.set(argidx, rowsPerWI);
 418
 419     size_t globalsize[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
 420     return k.run(2, globalsize, NULL, false);
 421 }
 422
 423 }
 424
 425 #endif
 426
 427 void cv::merge(InputArrayOfArrays _mv, OutputArray _dst)
 428 {
 429     CV_INSTRUMENT_REGION()
 430
 431     CV_OCL_RUN(_mv.isUMatVector() && _dst.isUMat(),
 432                ocl_merge(_mv, _dst))
 433
 434     std::vector<Mat> mv;
 435     _mv.getMatVector(mv);
 436     merge(!mv.empty() ? &mv[0] : 0, mv.size(), _dst);
 437 }