modules/calib3d/src/stereosgbm.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16 // Third party copyrights are property of their respective owners.
  17 //
  18 // Redistribution and use in source and binary forms, with or without modification,
  19 // are permitted provided that the following conditions are met:
  20 //
  21 //   * Redistribution's of source code must retain the above copyright notice,
  22 //     this list of conditions and the following disclaimer.
  23 //
  24 //   * Redistribution's in binary form must reproduce the above copyright notice,
  25 //     this list of conditions and the following disclaimer in the documentation
  26 //     and/or other materials provided with the distribution.
  27 //
  28 //   * The name of the copyright holders may not be used to endorse or promote products
  29 //     derived from this software without specific prior written permission.
  30 //
  31 // This software is provided by the copyright holders and contributors "as is" and
  32 // any express or implied warranties, including, but not limited to, the implied
  33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  34 // In no event shall the Intel Corporation or contributors be liable for any direct,
  35 // indirect, incidental, special, exemplary, or consequential damages
  36 // (including, but not limited to, procurement of substitute goods or services;
  37 // loss of use, data, or profits; or business interruption) however caused
  38 // and on any theory of liability, whether in contract, strict liability,
  39 // or tort (including negligence or otherwise) arising in any way out of
  40 // the use of this software, even if advised of the possibility of such damage.
  41 //
  42 //M*/
  43
  44 /*
  45  This is a variation of
  46  "Stereo Processing by Semiglobal Matching and Mutual Information"
  47  by Heiko Hirschmuller.
  48
  49  We match blocks rather than individual pixels, thus the algorithm is called
  50  SGBM (Semi-global block matching)
  51  */
  52
  53 #include "precomp.hpp"
  54 #include <limits.h>
  55 #include "opencv2/hal/intrin.hpp"
  56
  57 namespace cv
  58 {
  59
  60 typedef uchar PixType;
  61 typedef short CostType;
  62 typedef short DispType;
  63
  64 enum { NR = 16, NR2 = NR/2 };
  65
  66
  67 struct StereoSGBMParams
  68 {
  69     StereoSGBMParams()
  70     {
  71         minDisparity = numDisparities = 0;
  72         SADWindowSize = 0;
  73         P1 = P2 = 0;
  74         disp12MaxDiff = 0;
  75         preFilterCap = 0;
  76         uniquenessRatio = 0;
  77         speckleWindowSize = 0;
  78         speckleRange = 0;
  79         mode = StereoSGBM::MODE_SGBM;
  80     }
  81
  82     StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
  83                       int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
  84                       int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
  85                       int _mode )
  86     {
  87         minDisparity = _minDisparity;
  88         numDisparities = _numDisparities;
  89         SADWindowSize = _SADWindowSize;
  90         P1 = _P1;
  91         P2 = _P2;
  92         disp12MaxDiff = _disp12MaxDiff;
  93         preFilterCap = _preFilterCap;
  94         uniquenessRatio = _uniquenessRatio;
  95         speckleWindowSize = _speckleWindowSize;
  96         speckleRange = _speckleRange;
  97         mode = _mode;
  98     }
  99
 100     int minDisparity;
 101     int numDisparities;
 102     int SADWindowSize;
 103     int preFilterCap;
 104     int uniquenessRatio;
 105     int P1;
 106     int P2;
 107     int speckleWindowSize;
 108     int speckleRange;
 109     int disp12MaxDiff;
 110     int mode;
 111 };
 112
 113 /*
 114  For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
 115  and for each disparity minD<=d<maxD the function
 116  computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
 117  row1[x] and row2[x-d]. The subpixel algorithm from
 118  "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
 119  is used, hence the suffix BT.
 120
 121  the temporary buffer should contain width2*2 elements
 122  */
 123 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
 124                             int minD, int maxD, CostType* cost,
 125                             PixType* buffer, const PixType* tab,
 126                             int tabOfs, int )
 127 {
 128     int x, c, width = img1.cols, cn = img1.channels();
 129     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
 130     int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
 131     int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
 132     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
 133     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
 134
 135     tab += tabOfs;
 136
 137     for( c = 0; c < cn*2; c++ )
 138     {
 139         prow1[width*c] = prow1[width*c + width-1] =
 140         prow2[width*c] = prow2[width*c + width-1] = tab[0];
 141     }
 142
 143     int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
 144     int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
 145
 146     if( cn == 1 )
 147     {
 148         for( x = 1; x < width-1; x++ )
 149         {
 150             prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
 151             prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
 152
 153             prow1[x+width] = row1[x];
 154             prow2[width-1-x+width] = row2[x];
 155         }
 156     }
 157     else
 158     {
 159         for( x = 1; x < width-1; x++ )
 160         {
 161             prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
 162             prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
 163             prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
 164
 165             prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
 166             prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
 167             prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
 168
 169             prow1[x+width*3] = row1[x*3];
 170             prow1[x+width*4] = row1[x*3+1];
 171             prow1[x+width*5] = row1[x*3+2];
 172
 173             prow2[width-1-x+width*3] = row2[x*3];
 174             prow2[width-1-x+width*4] = row2[x*3+1];
 175             prow2[width-1-x+width*5] = row2[x*3+2];
 176         }
 177     }
 178
 179     memset( cost, 0, width1*D*sizeof(cost[0]) );
 180
 181     buffer -= minX2;
 182     cost -= minX1*D + minD; // simplify the cost indices inside the loop
 183
 184 #if 1
 185     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
 186     {
 187         int diff_scale = c < cn ? 0 : 2;
 188
 189         // precompute
 190         //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
 191         //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
 192         for( x = minX2; x < maxX2; x++ )
 193         {
 194             int v = prow2[x];
 195             int vl = x > 0 ? (v + prow2[x-1])/2 : v;
 196             int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
 197             int v0 = std::min(vl, vr); v0 = std::min(v0, v);
 198             int v1 = std::max(vl, vr); v1 = std::max(v1, v);
 199             buffer[x] = (PixType)v0;
 200             buffer[x + width2] = (PixType)v1;
 201         }
 202
 203         for( x = minX1; x < maxX1; x++ )
 204         {
 205             int u = prow1[x];
 206             int ul = x > 0 ? (u + prow1[x-1])/2 : u;
 207             int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
 208             int u0 = std::min(ul, ur); u0 = std::min(u0, u);
 209             int u1 = std::max(ul, ur); u1 = std::max(u1, u);
 210
 211         #if CV_SIMD128
 212             v_uint8x16 _u  = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
 213             v_uint8x16 _u1 = v_setall_u8((uchar)u1);
 214
 215             for( int d = minD; d < maxD; d += 16 )
 216             {
 217                 v_uint8x16 _v  = v_load(prow2  + width-x-1 + d);
 218                 v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
 219                 v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
 220                 v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
 221                 v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
 222                 v_uint8x16 diff = v_min(c0, c1);
 223
 224                 v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
 225                 v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);
 226
 227                 v_uint16x8 diff1,diff2;
 228                 v_expand(diff,diff1,diff2);
 229                 v_store_aligned(cost + x*D + d,     _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
 230                 v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
 231             }
 232         #else
 233             for( int d = minD; d < maxD; d++ )
 234             {
 235                 int v = prow2[width-x-1 + d];
 236                 int v0 = buffer[width-x-1 + d];
 237                 int v1 = buffer[width-x-1 + d + width2];
 238                 int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
 239                 int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
 240
 241                 cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
 242             }
 243         #endif
 244         }
 245     }
 246 #else
 247     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
 248     {
 249         for( x = minX1; x < maxX1; x++ )
 250         {
 251             int u = prow1[x];
 252         #if CV_SSE2
 253             if( useSIMD )
 254             {
 255                 __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
 256
 257                 for( int d = minD; d < maxD; d += 16 )
 258                 {
 259                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
 260                     __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
 261                     __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
 262                     __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
 263
 264                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
 265                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
 266                 }
 267             }
 268             else
 269         #endif
 270             {
 271                 for( int d = minD; d < maxD; d++ )
 272                 {
 273                     int v = prow2[width-1-x + d];
 274                     cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
 275                 }
 276             }
 277         }
 278     }
 279 #endif
 280 }
 281
 282
 283 /*
 284  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
 285  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
 286  minD <= d < maxD.
 287  disp2full is the reverse disparity map, that is:
 288  disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
 289
 290  note that disp1buf will have the same size as the roi and
 291  disp2full will have the same size as img1 (or img2).
 292  On exit disp2buf is not the final disparity, it is an intermediate result that becomes
 293  final after all the tiles are processed.
 294
 295  the disparity in disp1buf is written with sub-pixel accuracy
 296  (4 fractional bits, see StereoSGBM::DISP_SCALE),
 297  using quadratic interpolation, while the disparity in disp2buf
 298  is written as is, without interpolation.
 299
 300  disp2cost also has the same size as img1 (or img2).
 301  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
 302  */
 303 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 304                                  Mat& disp1, const StereoSGBMParams& params,
 305                                  Mat& buffer )
 306 {
 307 #if CV_SSE2
 308     static const uchar LSBTab[] =
 309     {
 310         0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 311         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 312         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 313         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 314         7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 315         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 316         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 317         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
 318     };
 319
 320     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 321 #endif
 322
 323     const int ALIGN = 16;
 324     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
 325     const int DISP_SCALE = (1 << DISP_SHIFT);
 326     const CostType MAX_COST = SHRT_MAX;
 327
 328     int minD = params.minDisparity, maxD = minD + params.numDisparities;
 329     Size SADWindowSize;
 330     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
 331     int ftzero = std::max(params.preFilterCap, 15) | 1;
 332     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
 333     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
 334     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
 335     int k, width = disp1.cols, height = disp1.rows;
 336     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
 337     int D = maxD - minD, width1 = maxX1 - minX1;
 338     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
 339     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
 340     bool fullDP = params.mode == StereoSGBM::MODE_HH;
 341     int npasses = fullDP ? 2 : 1;
 342     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
 343     PixType clipTab[TAB_SIZE];
 344
 345     for( k = 0; k < TAB_SIZE; k++ )
 346         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 347
 348     if( minX1 >= maxX1 )
 349     {
 350         disp1 = Scalar::all(INVALID_DISP_SCALED);
 351         return;
 352     }
 353
 354     CV_Assert( D % 16 == 0 );
 355
 356     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
 357     // if you change NR, please, modify the loop as well.
 358     int D2 = D+16, NRD2 = NR2*D2;
 359
 360     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
 361     // for 8-way dynamic programming we need the current row and
 362     // the previous row, i.e. 2 rows in total
 363     const int NLR = 2;
 364     const int LrBorder = NLR - 1;
 365
 366     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
 367     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
 368     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
 369     size_t costBufSize = width1*D;
 370     size_t CSBufSize = costBufSize*(fullDP ? height : 1);
 371     size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
 372     int hsumBufNRows = SH2*2 + 2;
 373     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
 374     costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
 375     CSBufSize*2*sizeof(CostType) + // C, S
 376     width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
 377     width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
 378
 379     if( buffer.empty() || !buffer.isContinuous() ||
 380         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
 381         buffer.create(1, (int)totalBufSize, CV_8U);
 382
 383     // summary cost over different (nDirs) directions
 384     CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
 385     CostType* Sbuf = Cbuf + CSBufSize;
 386     CostType* hsumBuf = Sbuf + CSBufSize;
 387     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
 388
 389     CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
 390     DispType* disp2ptr = (DispType*)(disp2cost + width);
 391     PixType* tempBuf = (PixType*)(disp2ptr + width);
 392
 393     // add P2 to every C(x,y). it saves a few operations in the inner loops
 394     for( k = 0; k < width1*D; k++ )
 395         Cbuf[k] = (CostType)P2;
 396
 397     for( int pass = 1; pass <= npasses; pass++ )
 398     {
 399         int x1, y1, x2, y2, dx, dy;
 400
 401         if( pass == 1 )
 402         {
 403             y1 = 0; y2 = height; dy = 1;
 404             x1 = 0; x2 = width1; dx = 1;
 405         }
 406         else
 407         {
 408             y1 = height-1; y2 = -1; dy = -1;
 409             x1 = width1-1; x2 = -1; dx = -1;
 410         }
 411
 412         CostType *Lr[NLR]={0}, *minLr[NLR]={0};
 413
 414         for( k = 0; k < NLR; k++ )
 415         {
 416             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
 417             // and will occasionally use negative indices with the arrays
 418             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
 419             // however, then the alignment will be imperfect, i.e. bad for SSE,
 420             // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
 421             Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
 422             memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
 423             minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
 424             memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
 425         }
 426
 427         for( int y = y1; y != y2; y += dy )
 428         {
 429             int x, d;
 430             DispType* disp1ptr = disp1.ptr<DispType>(y);
 431             CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
 432             CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
 433
 434             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
 435             {
 436                 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
 437
 438                 for( k = dy1; k <= dy2; k++ )
 439                 {
 440                     CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
 441
 442                     if( k < height )
 443                     {
 444                         calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
 445
 446                         memset(hsumAdd, 0, D*sizeof(CostType));
 447                         for( x = 0; x <= SW2*D; x += D )
 448                         {
 449                             int scale = x == 0 ? SW2 + 1 : 1;
 450                             for( d = 0; d < D; d++ )
 451                                 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
 452                         }
 453
 454                         if( y > 0 )
 455                         {
 456                             const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
 457                             const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
 458
 459                             for( x = D; x < width1*D; x += D )
 460                             {
 461                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
 462                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
 463
 464                             #if CV_SSE2
 465                                 if( useSIMD )
 466                                 {
 467                                     for( d = 0; d < D; d += 8 )
 468                                     {
 469                                         __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
 470                                         __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
 471                                         hv = _mm_adds_epi16(_mm_subs_epi16(hv,
 472                                                                            _mm_load_si128((const __m128i*)(pixSub + d))),
 473                                                             _mm_load_si128((const __m128i*)(pixAdd + d)));
 474                                         Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
 475                                                                            _mm_load_si128((const __m128i*)(hsumSub + x + d))),
 476                                                             hv);
 477                                         _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
 478                                         _mm_store_si128((__m128i*)(C + x + d), Cx);
 479                                     }
 480                                 }
 481                                 else
 482                             #endif
 483                                 {
 484                                     for( d = 0; d < D; d++ )
 485                                     {
 486                                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
 487                                         C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
 488                                     }
 489                                 }
 490                             }
 491                         }
 492                         else
 493                         {
 494                             for( x = D; x < width1*D; x += D )
 495                             {
 496                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
 497                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
 498
 499                                 for( d = 0; d < D; d++ )
 500                                     hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
 501                             }
 502                         }
 503                     }
 504
 505                     if( y == 0 )
 506                     {
 507                         int scale = k == 0 ? SH2 + 1 : 1;
 508                         for( x = 0; x < width1*D; x++ )
 509                             C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
 510                     }
 511                 }
 512
 513                 // also, clear the S buffer
 514                 for( k = 0; k < width1*D; k++ )
 515                     S[k] = 0;
 516             }
 517
 518             // clear the left and the right borders
 519             memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
 520             memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
 521             memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
 522             memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
 523
 524             /*
 525              [formula 13 in the paper]
 526              compute L_r(p, d) = C(p, d) +
 527              min(L_r(p-r, d),
 528              L_r(p-r, d-1) + P1,
 529              L_r(p-r, d+1) + P1,
 530              min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
 531              where p = (x,y), r is one of the directions.
 532              we process all the directions at once:
 533              0: r=(-dx, 0)
 534              1: r=(-1, -dy)
 535              2: r=(0, -dy)
 536              3: r=(1, -dy)
 537              4: r=(-2, -dy)
 538              5: r=(-1, -dy*2)
 539              6: r=(1, -dy*2)
 540              7: r=(2, -dy)
 541              */
 542             for( x = x1; x != x2; x += dx )
 543             {
 544                 int xm = x*NR2, xd = xm*D2;
 545
 546                 int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
 547                 int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
 548
 549                 CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
 550                 CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
 551                 CostType* Lr_p2 = Lr[1] + xd + D2*2;
 552                 CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
 553
 554                 Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
 555                 Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
 556
 557                 CostType* Lr_p = Lr[0] + xd;
 558                 const CostType* Cp = C + x*D;
 559                 CostType* Sp = S + x*D;
 560
 561             #if CV_SSE2
 562                 if( useSIMD )
 563                 {
 564                     __m128i _P1 = _mm_set1_epi16((short)P1);
 565
 566                     __m128i _delta0 = _mm_set1_epi16((short)delta0);
 567                     __m128i _delta1 = _mm_set1_epi16((short)delta1);
 568                     __m128i _delta2 = _mm_set1_epi16((short)delta2);
 569                     __m128i _delta3 = _mm_set1_epi16((short)delta3);
 570                     __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
 571
 572                     for( d = 0; d < D; d += 8 )
 573                     {
 574                         __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
 575                         __m128i L0, L1, L2, L3;
 576
 577                         L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
 578                         L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
 579                         L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
 580                         L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
 581
 582                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
 583                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
 584
 585                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
 586                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
 587
 588                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
 589                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
 590
 591                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
 592                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
 593
 594                         L0 = _mm_min_epi16(L0, _delta0);
 595                         L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
 596
 597                         L1 = _mm_min_epi16(L1, _delta1);
 598                         L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
 599
 600                         L2 = _mm_min_epi16(L2, _delta2);
 601                         L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
 602
 603                         L3 = _mm_min_epi16(L3, _delta3);
 604                         L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
 605
 606                         _mm_store_si128( (__m128i*)(Lr_p + d), L0);
 607                         _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
 608                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
 609                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
 610
 611                         __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
 612                         __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
 613                         t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
 614                         _minL0 = _mm_min_epi16(_minL0, t0);
 615
 616                         __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
 617
 618                         L0 = _mm_adds_epi16(L0, L1);
 619                         L2 = _mm_adds_epi16(L2, L3);
 620                         Sval = _mm_adds_epi16(Sval, L0);
 621                         Sval = _mm_adds_epi16(Sval, L2);
 622
 623                         _mm_store_si128((__m128i*)(Sp + d), Sval);
 624                     }
 625
 626                     _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
 627                     _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
 628                 }
 629                 else
 630             #endif
 631                 {
 632                     int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
 633
 634                     for( d = 0; d < D; d++ )
 635                     {
 636                         int Cpd = Cp[d], L0, L1, L2, L3;
 637
 638                         L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
 639                         L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
 640                         L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
 641                         L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;
 642
 643                         Lr_p[d] = (CostType)L0;
 644                         minL0 = std::min(minL0, L0);
 645
 646                         Lr_p[d + D2] = (CostType)L1;
 647                         minL1 = std::min(minL1, L1);
 648
 649                         Lr_p[d + D2*2] = (CostType)L2;
 650                         minL2 = std::min(minL2, L2);
 651
 652                         Lr_p[d + D2*3] = (CostType)L3;
 653                         minL3 = std::min(minL3, L3);
 654
 655                         Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
 656                     }
 657                     minLr[0][xm] = (CostType)minL0;
 658                     minLr[0][xm+1] = (CostType)minL1;
 659                     minLr[0][xm+2] = (CostType)minL2;
 660                     minLr[0][xm+3] = (CostType)minL3;
 661                 }
 662             }
 663
 664             if( pass == npasses )
 665             {
 666                 for( x = 0; x < width; x++ )
 667                 {
 668                     disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
 669                     disp2cost[x] = MAX_COST;
 670                 }
 671
 672                 for( x = width1 - 1; x >= 0; x-- )
 673                 {
 674                     CostType* Sp = S + x*D;
 675                     int minS = MAX_COST, bestDisp = -1;
 676
 677                     if( npasses == 1 )
 678                     {
 679                         int xm = x*NR2, xd = xm*D2;
 680
 681                         int minL0 = MAX_COST;
 682                         int delta0 = minLr[0][xm + NR2] + P2;
 683                         CostType* Lr_p0 = Lr[0] + xd + NRD2;
 684                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
 685                         CostType* Lr_p = Lr[0] + xd;
 686
 687                         const CostType* Cp = C + x*D;
 688
 689                     #if CV_SSE2
 690                         if( useSIMD )
 691                         {
 692                             __m128i _P1 = _mm_set1_epi16((short)P1);
 693                             __m128i _delta0 = _mm_set1_epi16((short)delta0);
 694
 695                             __m128i _minL0 = _mm_set1_epi16((short)minL0);
 696                             __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
 697                             __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
 698
 699                             for( d = 0; d < D; d += 8 )
 700                             {
 701                                 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
 702
 703                                 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
 704                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
 705                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
 706                                 L0 = _mm_min_epi16(L0, _delta0);
 707                                 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
 708
 709                                 _mm_store_si128((__m128i*)(Lr_p + d), L0);
 710                                 _minL0 = _mm_min_epi16(_minL0, L0);
 711                                 L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
 712                                 _mm_store_si128((__m128i*)(Sp + d), L0);
 713
 714                                 __m128i mask = _mm_cmpgt_epi16(_minS, L0);
 715                                 _minS = _mm_min_epi16(_minS, L0);
 716                                 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
 717                                 _d8 = _mm_adds_epi16(_d8, _8);
 718                             }
 719
 720                             short CV_DECL_ALIGNED(16) bestDispBuf[8];
 721                             _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
 722
 723                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
 724                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
 725                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
 726
 727                             __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
 728                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
 729                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
 730
 731                             minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
 732                             minS = (CostType)_mm_cvtsi128_si32(qS);
 733
 734                             qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
 735                             qS = _mm_cmpeq_epi16(_minS, qS);
 736                             int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
 737
 738                             bestDisp = bestDispBuf[LSBTab[idx]];
 739                         }
 740                         else
 741                     #endif
 742                         {
 743                             for( d = 0; d < D; d++ )
 744                             {
 745                                 int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
 746
 747                                 Lr_p[d] = (CostType)L0;
 748                                 minL0 = std::min(minL0, L0);
 749
 750                                 int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
 751                                 if( Sval < minS )
 752                                 {
 753                                     minS = Sval;
 754                                     bestDisp = d;
 755                                 }
 756                             }
 757                             minLr[0][xm] = (CostType)minL0;
 758                         }
 759                     }
 760                     else
 761                     {
 762                         for( d = 0; d < D; d++ )
 763                         {
 764                             int Sval = Sp[d];
 765                             if( Sval < minS )
 766                             {
 767                                 minS = Sval;
 768                                 bestDisp = d;
 769                             }
 770                         }
 771                     }
 772
 773                     for( d = 0; d < D; d++ )
 774                     {
 775                         if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
 776                             break;
 777                     }
 778                     if( d < D )
 779                         continue;
 780                     d = bestDisp;
 781                     int _x2 = x + minX1 - d - minD;
 782                     if( disp2cost[_x2] > minS )
 783                     {
 784                         disp2cost[_x2] = (CostType)minS;
 785                         disp2ptr[_x2] = (DispType)(d + minD);
 786                     }
 787
 788                     if( 0 < d && d < D-1 )
 789                     {
 790                         // do subpixel quadratic interpolation:
 791                         //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
 792                         //   then find minimum of the parabola.
 793                         int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
 794                         d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
 795                     }
 796                     else
 797                         d *= DISP_SCALE;
 798                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
 799                 }
 800
 801                 for( x = minX1; x < maxX1; x++ )
 802                 {
 803                     // we round the computed disparity both towards -inf and +inf and check
 804                     // if either of the corresponding disparities in disp2 is consistent.
 805                     // This is to give the computed disparity a chance to look valid if it is.
 806                     int d1 = disp1ptr[x];
 807                     if( d1 == INVALID_DISP_SCALED )
 808                         continue;
 809                     int _d = d1 >> DISP_SHIFT;
 810                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
 811                     int _x = x - _d, x_ = x - d_;
 812                     if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
 813                        0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
 814                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
 815                 }
 816             }
 817
 818             // now shift the cyclic buffers
 819             std::swap( Lr[0], Lr[1] );
 820             std::swap( minLr[0], minLr[1] );
 821         }
 822     }
 823 }
 824
 825 //////////////////////////////////////////////////////////////////////////////////////////////////////
 826
 827 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
 828                        CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
 829                        PixType*& tmpBuf, CostType*& horPassCostVolume,
 830                        CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
 831                        CostType*& disp2CostBuf, short*& disp2Buf);
 832
 833 struct SGBM3WayMainLoop : public ParallelLoopBody
 834 {
 835     Mat* buffers;
 836     const Mat *img1, *img2;
 837     Mat* dst_disp;
 838
 839     int nstripes, stripe_sz;
 840     int stripe_overlap;
 841
 842     int width,height;
 843     int minD, maxD, D;
 844     int minX1, maxX1, width1;
 845
 846     int SW2, SH2;
 847     int P1, P2;
 848     int uniquenessRatio, disp12MaxDiff;
 849
 850     int costBufSize, hsumBufNRows;
 851     int TAB_OFS, ftzero;
 852
 853     PixType* clipTab;
 854
 855     SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
 856     void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
 857     void operator () (const Range& range) const;
 858 };
 859
 860 SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
 861 buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
 862 {
 863     nstripes = _nstripes;
 864     stripe_overlap = _stripe_overlap;
 865     stripe_sz = (int)ceil(img1->rows/(double)nstripes);
 866
 867     width = img1->cols; height = img1->rows;
 868     minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
 869     minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
 870     CV_Assert( D % 16 == 0 );
 871
 872     SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
 873
 874     P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
 875     uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
 876     disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
 877
 878     costBufSize = width1*D;
 879     hsumBufNRows = SH2*2 + 2;
 880     TAB_OFS = 256*4;
 881     ftzero = std::max(params.preFilterCap, 15) | 1;
 882 }
 883
 884 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
 885                        CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
 886                        PixType*& tmpBuf, CostType*& horPassCostVolume,
 887                        CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
 888                        CostType*& disp2CostBuf, short*& disp2Buf)
 889 {
 890     // allocating all the required memory:
 891     int costVolumeLineSize = width1*D;
 892     int width1_ext = width1+2;
 893     int costVolumeLineSize_ext = width1_ext*D;
 894     int hsumBufNRows = SH2*2 + 2;
 895
 896     // main buffer to store matching costs for the current line:
 897     int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
 898
 899     // auxiliary buffers for the raw matching cost computation:
 900     int hsumBufSize  = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
 901     int pixDiffSize  = costVolumeLineSize*sizeof(CostType);
 902     int tmpBufSize   = width*16*num_ch*sizeof(PixType);
 903
 904     // auxiliary buffers for the matching cost aggregation:
 905     int horPassCostVolumeSize  = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
 906     int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
 907     int vertPassMinSize        = width1_ext*sizeof(CostType);             // buffer for storing minimum costs from the previous line
 908     int rightPassBufSize       = D*sizeof(CostType);                      // additional small buffer for the right-to-left pass
 909
 910     // buffers for the pseudo-LRC check:
 911     int disp2CostBufSize = width*sizeof(CostType);
 912     int disp2BufSize     = width*sizeof(short);
 913
 914     // sum up the sizes of all the buffers:
 915     size_t totalBufSize = curCostVolumeLineSize +
 916                           hsumBufSize +
 917                           pixDiffSize +
 918                           tmpBufSize  +
 919                           horPassCostVolumeSize +
 920                           vertPassCostVolumeSize +
 921                           vertPassMinSize +
 922                           rightPassBufSize +
 923                           disp2CostBufSize +
 924                           disp2BufSize +
 925                           16;  //to compensate for the alignPtr shifts
 926
 927     if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
 928         buffer.create(1, (int)totalBufSize, CV_8U);
 929
 930     // set up all the pointers:
 931     curCostVolumeLine  = (CostType*)alignPtr(buffer.ptr(), 16);
 932     hsumBuf            = curCostVolumeLine + costVolumeLineSize;
 933     pixDiff            = hsumBuf + costVolumeLineSize*hsumBufNRows;
 934     tmpBuf             = (PixType*)(pixDiff + costVolumeLineSize);
 935     horPassCostVolume  = (CostType*)(tmpBuf + width*16*num_ch);
 936     vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
 937     rightPassBuf       = vertPassCostVolume + costVolumeLineSize_ext;
 938     vertPassMin        = rightPassBuf + D;
 939     disp2CostBuf       = vertPassMin + width1_ext;
 940     disp2Buf           = disp2CostBuf + width;
 941
 942     // initialize memory:
 943     memset(buffer.ptr(),0,totalBufSize);
 944     for(int i=0;i<costVolumeLineSize;i++)
 945         curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
 946 }
 947
 948 // performing block matching and building raw cost-volume for the current row
 949 void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
 950                                           CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
 951                                           int y, int src_start_idx) const
 952 {
 953     int x, d;
 954     int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
 955
 956     for(int k = dy1; k <= dy2; k++ )
 957     {
 958         CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
 959         if( k < height )
 960         {
 961             calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );
 962
 963             memset(hsumAdd, 0, D*sizeof(CostType));
 964             for(x = 0; x <= SW2*D; x += D )
 965             {
 966                 int scale = x == 0 ? SW2 + 1 : 1;
 967
 968                 for( d = 0; d < D; d++ )
 969                     hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
 970             }
 971
 972             if( y > src_start_idx )
 973             {
 974                 const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
 975
 976                 for( x = D; x < width1*D; x += D )
 977                 {
 978                     const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
 979                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
 980
 981 #if CV_SIMD128
 982                     v_int16x8 hv_reg;
 983                     for( d = 0; d < D; d+=8 )
 984                     {
 985                         hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d));
 986                         v_store_aligned(hsumAdd+x+d,hv_reg);
 987                         v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d)));
 988                     }
 989 #else
 990                     for( d = 0; d < D; d++ )
 991                     {
 992                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
 993                         C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
 994                     }
 995 #endif
 996                 }
 997             }
 998             else
 999             {
1000                 for( x = D; x < width1*D; x += D )
1001                 {
1002                     const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
1003                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
1004
1005                     for( d = 0; d < D; d++ )
1006                         hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
1007                 }
1008             }
1009         }
1010
1011         if( y == src_start_idx )
1012         {
1013             int scale = k == src_start_idx ? SH2 + 1 : 1;
1014             for( x = 0; x < width1*D; x++ )
1015                 C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
1016         }
1017     }
1018 }
1019
1020 #if CV_SIMD128
1021 // define some additional reduce operations:
1022 inline short min(const v_int16x8& a)
1023 {
1024     short CV_DECL_ALIGNED(16) buf[8];
1025     v_store_aligned(buf, a);
1026     short s0 = std::min(buf[0], buf[1]);
1027     short s1 = std::min(buf[2], buf[3]);
1028     short s2 = std::min(buf[4], buf[5]);
1029     short s3 = std::min(buf[6], buf[7]);
1030     return std::min(std::min(s0, s1),std::min(s2, s3));
1031 }
1032
1033 inline short min_pos(const v_int16x8& val,const v_int16x8& pos)
1034 {
1035     short CV_DECL_ALIGNED(16) val_buf[8];
1036     v_store_aligned(val_buf, val);
1037     short CV_DECL_ALIGNED(16) pos_buf[8];
1038     v_store_aligned(pos_buf, pos);
1039     short res_pos = 0;
1040     short min_val = SHRT_MAX;
1041     if(val_buf[0]<min_val) {min_val=val_buf[0]; res_pos=pos_buf[0];}
1042     if(val_buf[1]<min_val) {min_val=val_buf[1]; res_pos=pos_buf[1];}
1043     if(val_buf[2]<min_val) {min_val=val_buf[2]; res_pos=pos_buf[2];}
1044     if(val_buf[3]<min_val) {min_val=val_buf[3]; res_pos=pos_buf[3];}
1045     if(val_buf[4]<min_val) {min_val=val_buf[4]; res_pos=pos_buf[4];}
1046     if(val_buf[5]<min_val) {min_val=val_buf[5]; res_pos=pos_buf[5];}
1047     if(val_buf[6]<min_val) {min_val=val_buf[6]; res_pos=pos_buf[6];}
1048     if(val_buf[7]<min_val) {min_val=val_buf[7]; res_pos=pos_buf[7];}
1049     return res_pos;
1050 }
1051 #endif
1052
1053 // performing SGM cost accumulation from left to right (result is stored in leftBuf) and
1054 // in-place cost accumulation from top to bottom (result is stored in topBuf)
1055 inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
1056                                    CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
1057 {
1058 #if CV_SIMD128
1059     v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
1060
1061     v_int16x8 leftMinCostP2_reg   = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
1062     v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);
1063     v_int16x8 src0_leftBuf        = v_setall_s16(SHRT_MAX);
1064     v_int16x8 src1_leftBuf        = v_load_aligned(leftBuf_prev);
1065
1066     v_int16x8 topMinCostP2_reg   = v_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));
1067     v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);
1068     v_int16x8 src0_topBuf        = v_setall_s16(SHRT_MAX);
1069     v_int16x8 src1_topBuf        = v_load_aligned(topBuf);
1070
1071     v_int16x8 src2;
1072     v_int16x8 src_shifted_left,src_shifted_right;
1073     v_int16x8 res;
1074
1075     for(int i=0;i<D-8;i+=8)
1076     {
1077         //process leftBuf:
1078         //lookahead load:
1079         src2 = v_load_aligned(leftBuf_prev+i+8);
1080
1081         //get shifted versions of the current block and add P1:
1082         src_shifted_left  = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
1083         src_shifted_right = v_extract<1> (src1_leftBuf,src2        ) + P1_reg;
1084
1085         // process and save current block:
1086         res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1087         leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
1088         v_store_aligned(leftBuf+i, res);
1089
1090         //update src buffers:
1091         src0_leftBuf = src1_leftBuf;
1092         src1_leftBuf = src2;
1093
1094         //process topBuf:
1095         //lookahead load:
1096         src2 = v_load_aligned(topBuf+i+8);
1097
1098         //get shifted versions of the current block and add P1:
1099         src_shifted_left  = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
1100         src_shifted_right = v_extract<1> (src1_topBuf,src2       ) + P1_reg;
1101
1102         // process and save current block:
1103         res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1104         topMinCost_new_reg = v_min(topMinCost_new_reg,res);
1105         v_store_aligned(topBuf+i, res);
1106
1107         //update src buffers:
1108         src0_topBuf = src1_topBuf;
1109         src1_topBuf = src2;
1110     }
1111
1112     // a bit different processing for the last cycle of the loop:
1113     //process leftBuf:
1114     src2 = v_setall_s16(SHRT_MAX);
1115     src_shifted_left  = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
1116     src_shifted_right = v_extract<1> (src1_leftBuf,src2        ) + P1_reg;
1117
1118     res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1119     leftMinCost = min(v_min(leftMinCost_new_reg,res));
1120     v_store_aligned(leftBuf+D-8, res);
1121
1122     //process topBuf:
1123     src2 = v_setall_s16(SHRT_MAX);
1124     src_shifted_left  = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
1125     src_shifted_right = v_extract<1> (src1_topBuf,src2       ) + P1_reg;
1126
1127     res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1128     topMinCost = min(v_min(topMinCost_new_reg,res));
1129     v_store_aligned(topBuf+D-8, res);
1130 #else
1131     CostType leftMinCost_new = SHRT_MAX;
1132     CostType topMinCost_new  = SHRT_MAX;
1133     int leftMinCost_P2  = leftMinCost + P2;
1134     int topMinCost_P2   = topMinCost  + P2;
1135     CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
1136     CostType topBuf_i_minus_1       = SHRT_MAX;
1137     CostType tmp;
1138
1139     for(int i=0;i<D-1;i++)
1140     {
1141         leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);
1142         leftBuf_prev_i_minus_1 = leftBuf_prev[i];
1143         leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);
1144
1145         tmp = topBuf[i];
1146         topBuf[i]  = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);
1147         topBuf_i_minus_1 = tmp;
1148         topMinCost_new  = std::min(topMinCost_new,topBuf[i]);
1149     }
1150
1151     leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);
1152     leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);
1153
1154     topBuf[D-1]  = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);
1155     topMinCost  = std::min(topMinCost_new,topBuf[D-1]);
1156 #endif
1157 }
1158
1159 // performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
1160 // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
1161 // optimal disparity value with minimum accumulated cost
1162 inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
1163                                  CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)
1164 {
1165 #if CV_SIMD128
1166     v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
1167
1168     v_int16x8 rightMinCostP2_reg   = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
1169     v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);
1170     v_int16x8 src0_rightBuf        = v_setall_s16(SHRT_MAX);
1171     v_int16x8 src1_rightBuf        = v_load(rightBuf);
1172
1173     v_int16x8 src2;
1174     v_int16x8 src_shifted_left,src_shifted_right;
1175     v_int16x8 res;
1176
1177     v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);
1178     v_int16x8 min_sum_pos_reg  = v_setall_s16(0);
1179     v_int16x8 loop_idx(0,1,2,3,4,5,6,7);
1180     v_int16x8 eight_reg = v_setall_s16(8);
1181
1182     for(int i=0;i<D-8;i+=8)
1183     {
1184         //lookahead load:
1185         src2 = v_load_aligned(rightBuf+i+8);
1186
1187         //get shifted versions of the current block and add P1:
1188         src_shifted_left  = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
1189         src_shifted_right = v_extract<1> (src1_rightBuf,src2         ) + P1_reg;
1190
1191         // process and save current block:
1192         res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
1193         rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
1194         v_store_aligned(rightBuf+i, res);
1195
1196         // compute and save total cost:
1197         res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i);
1198         v_store_aligned(leftBuf+i, res);
1199
1200         // track disparity value with the minimum cost:
1201         min_sum_cost_reg = v_min(min_sum_cost_reg,res);
1202         min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
1203         loop_idx = loop_idx+eight_reg;
1204
1205         //update src:
1206         src0_rightBuf    = src1_rightBuf;
1207         src1_rightBuf    = src2;
1208     }
1209
1210     // a bit different processing for the last cycle of the loop:
1211     src2 = v_setall_s16(SHRT_MAX);
1212     src_shifted_left  = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
1213     src_shifted_right = v_extract<1> (src1_rightBuf,src2         ) + P1_reg;
1214
1215     res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
1216     rightMinCost = min(v_min(rightMinCost_new_reg,res));
1217     v_store_aligned(rightBuf+D-8, res);
1218
1219     res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8);
1220     v_store_aligned(leftBuf+D-8, res);
1221
1222     min_sum_cost_reg = v_min(min_sum_cost_reg,res);
1223     min_cost = min(min_sum_cost_reg);
1224     min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
1225     optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg);
1226 #else
1227     CostType rightMinCost_new = SHRT_MAX;
1228     int rightMinCost_P2  = rightMinCost + P2;
1229     CostType rightBuf_i_minus_1 = SHRT_MAX;
1230     CostType tmp;
1231     min_cost = SHRT_MAX;
1232
1233     for(int i=0;i<D-1;i++)
1234     {
1235         tmp = rightBuf[i];
1236         rightBuf[i]  = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);
1237         rightBuf_i_minus_1 = tmp;
1238         rightMinCost_new  = std::min(rightMinCost_new,rightBuf[i]);
1239         leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);
1240         if(leftBuf[i]<min_cost)
1241         {
1242             optimal_disp = i;
1243             min_cost = leftBuf[i];
1244         }
1245     }
1246
1247     rightBuf[D-1]  = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);
1248     rightMinCost  = std::min(rightMinCost_new,rightBuf[D-1]);
1249     leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);
1250     if(leftBuf[D-1]<min_cost)
1251     {
1252         optimal_disp = D-1;
1253         min_cost = leftBuf[D-1];
1254     }
1255 #endif
1256 }
1257
1258 void SGBM3WayMainLoop::operator () (const Range& range) const
1259 {
1260     // force separate processing of stripes:
1261     if(range.end>range.start+1)
1262     {
1263         for(int n=range.start;n<range.end;n++)
1264             (*this)(Range(n,n+1));
1265         return;
1266     }
1267
1268     const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
1269     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1270
1271     // setting up the ranges:
1272     int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);
1273     int src_end_idx   = std::min(range.end   * stripe_sz, height);
1274
1275     int dst_offset;
1276     if(range.start==0)
1277         dst_offset=stripe_overlap;
1278     else
1279         dst_offset=0;
1280
1281     Mat cur_buffer = buffers [range.start];
1282     Mat cur_disp   = dst_disp[range.start];
1283     cur_disp = Scalar(INVALID_DISP_SCALED);
1284
1285     // prepare buffers:
1286     CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
1287     PixType* tmpBuf;
1288     CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
1289     short* disp2Buf;
1290     getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2,
1291                       curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
1292                       vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);
1293
1294     // start real processing:
1295     for(int y=src_start_idx;y<src_end_idx;y++)
1296     {
1297         getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);
1298
1299         short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
1300
1301         // initialize the auxiliary buffers for the pseudo left-right consistency check:
1302         for(int x=0;x<width;x++)
1303         {
1304             disp2Buf[x] = (short)INVALID_DISP_SCALED;
1305             disp2CostBuf[x] = SHRT_MAX;
1306         }
1307         CostType* C = curCostVolumeLine - D;
1308         CostType prev_min, min_cost;
1309         int d, best_d;
1310         d = best_d = 0;
1311
1312         // forward pass
1313         prev_min=0;
1314         for (int x=D;x<(1+width1)*D;x+=D)
1315             accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2);
1316
1317         //backward pass
1318         memset(rightPassBuf,0,D*sizeof(CostType));
1319         prev_min=0;
1320         for (int x=width1*D;x>=D;x-=D)
1321         {
1322             accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);
1323
1324             if(uniquenessRatio>0)
1325             {
1326 #if CV_SIMD128
1327                 horPassCostVolume+=x;
1328                 int thresh = (100*min_cost)/(100-uniquenessRatio);
1329                 v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1));
1330                 v_int16x8 d1 = v_setall_s16((short)(best_d-1));
1331                 v_int16x8 d2 = v_setall_s16((short)(best_d+1));
1332                 v_int16x8 eight_reg = v_setall_s16(8);
1333                 v_int16x8 cur_d(0,1,2,3,4,5,6,7);
1334                 v_int16x8 mask,cost1,cost2;
1335
1336                 for( d = 0; d < D; d+=16 )
1337                 {
1338                     cost1 = v_load_aligned(horPassCostVolume+d);
1339                     cost2 = v_load_aligned(horPassCostVolume+d+8);
1340
1341                     mask = cost1 < thresh_reg;
1342                     mask = mask & ( (cur_d<d1) | (cur_d>d2) );
1343                     if( v_signmask(mask) )
1344                         break;
1345
1346                     cur_d = cur_d+eight_reg;
1347
1348                     mask = cost2 < thresh_reg;
1349                     mask = mask & ( (cur_d<d1) | (cur_d>d2) );
1350                     if( v_signmask(mask) )
1351                         break;
1352
1353                     cur_d = cur_d+eight_reg;
1354                 }
1355                 horPassCostVolume-=x;
1356 #else
1357                 for( d = 0; d < D; d++ )
1358                 {
1359                     if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
1360                         break;
1361                 }
1362 #endif
1363                 if( d < D )
1364                     continue;
1365             }
1366             d = best_d;
1367
1368             int _x2 = x/D - 1 + minX1 - d - minD;
1369             if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
1370             {
1371                 disp2CostBuf[_x2] = min_cost;
1372                 disp2Buf[_x2] = (short)(d + minD);
1373             }
1374
1375             if( 0 < d && d < D-1 )
1376             {
1377                 // do subpixel quadratic interpolation:
1378                 //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
1379                 //   then find minimum of the parabola.
1380                 int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);
1381                 d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);
1382             }
1383             else
1384                 d *= DISP_SCALE;
1385
1386             disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);
1387         }
1388
1389         for(int x = minX1; x < maxX1; x++ )
1390         {
1391             // pseudo LRC consistency check using only one disparity map;
1392             // pixels with difference more than disp12MaxDiff are invalidated
1393             int d1 = disp_row[x];
1394             if( d1 == INVALID_DISP_SCALED )
1395                 continue;
1396             int _d = d1 >> StereoMatcher::DISP_SHIFT;
1397             int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
1398             int _x = x - _d, x_ = x - d_;
1399             if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
1400                 0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
1401                 disp_row[x] = (short)INVALID_DISP_SCALED;
1402         }
1403     }
1404 }
1405
1406 static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
1407                                       Mat& disp1, const StereoSGBMParams& params,
1408                                       Mat* buffers, int nstripes )
1409 {
1410     // precompute a lookup table for the raw matching cost computation:
1411     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
1412     PixType* clipTab = new PixType[TAB_SIZE];
1413     int ftzero = std::max(params.preFilterCap, 15) | 1;
1414     for(int k = 0; k < TAB_SIZE; k++ )
1415         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
1416
1417     // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
1418     int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
1419     int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
1420     Mat* dst_disp = new Mat[nstripes];
1421     for(int i=0;i<nstripes;i++)
1422         dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
1423
1424     parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
1425
1426     //assemble disp1 from dst_disp:
1427     short* dst_row;
1428     short* src_row;
1429     for(int i=0;i<disp1.rows;i++)
1430     {
1431         dst_row = (short*)disp1.ptr(i);
1432         src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
1433         memcpy(dst_row,src_row,disp1.cols*sizeof(short));
1434     }
1435
1436     delete[] clipTab;
1437     delete[] dst_disp;
1438 }
1439
1440 class StereoSGBMImpl : public StereoSGBM
1441 {
1442 public:
1443     StereoSGBMImpl()
1444     {
1445         params = StereoSGBMParams();
1446     }
1447
1448     StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
1449                     int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
1450                     int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
1451                     int _mode )
1452     {
1453         params = StereoSGBMParams( _minDisparity, _numDisparities, _SADWindowSize,
1454                                    _P1, _P2, _disp12MaxDiff, _preFilterCap,
1455                                    _uniquenessRatio, _speckleWindowSize, _speckleRange,
1456                                    _mode );
1457     }
1458
1459     void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
1460     {
1461         Mat left = leftarr.getMat(), right = rightarr.getMat();
1462         CV_Assert( left.size() == right.size() && left.type() == right.type() &&
1463                    left.depth() == CV_8U );
1464
1465         disparr.create( left.size(), CV_16S );
1466         Mat disp = disparr.getMat();
1467
1468         if(params.mode==MODE_SGBM_3WAY)
1469             computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
1470         else
1471             computeDisparitySGBM( left, right, disp, params, buffer );
1472
1473         medianBlur(disp, disp, 3);
1474
1475         if( params.speckleWindowSize > 0 )
1476             filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
1477                            StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
1478     }
1479
1480     int getMinDisparity() const { return params.minDisparity; }
1481     void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
1482
1483     int getNumDisparities() const { return params.numDisparities; }
1484     void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
1485
1486     int getBlockSize() const { return params.SADWindowSize; }
1487     void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
1488
1489     int getSpeckleWindowSize() const { return params.speckleWindowSize; }
1490     void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
1491
1492     int getSpeckleRange() const { return params.speckleRange; }
1493     void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
1494
1495     int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
1496     void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
1497
1498     int getPreFilterCap() const { return params.preFilterCap; }
1499     void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
1500
1501     int getUniquenessRatio() const { return params.uniquenessRatio; }
1502     void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
1503
1504     int getP1() const { return params.P1; }
1505     void setP1(int P1) { params.P1 = P1; }
1506
1507     int getP2() const { return params.P2; }
1508     void setP2(int P2) { params.P2 = P2; }
1509
1510     int getMode() const { return params.mode; }
1511     void setMode(int mode) { params.mode = mode; }
1512
1513     void write(FileStorage& fs) const
1514     {
1515         fs << "name" << name_
1516         << "minDisparity" << params.minDisparity
1517         << "numDisparities" << params.numDisparities
1518         << "blockSize" << params.SADWindowSize
1519         << "speckleWindowSize" << params.speckleWindowSize
1520         << "speckleRange" << params.speckleRange
1521         << "disp12MaxDiff" << params.disp12MaxDiff
1522         << "preFilterCap" << params.preFilterCap
1523         << "uniquenessRatio" << params.uniquenessRatio
1524         << "P1" << params.P1
1525         << "P2" << params.P2
1526         << "mode" << params.mode;
1527     }
1528
1529     void read(const FileNode& fn)
1530     {
1531         FileNode n = fn["name"];
1532         CV_Assert( n.isString() && String(n) == name_ );
1533         params.minDisparity = (int)fn["minDisparity"];
1534         params.numDisparities = (int)fn["numDisparities"];
1535         params.SADWindowSize = (int)fn["blockSize"];
1536         params.speckleWindowSize = (int)fn["speckleWindowSize"];
1537         params.speckleRange = (int)fn["speckleRange"];
1538         params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
1539         params.preFilterCap = (int)fn["preFilterCap"];
1540         params.uniquenessRatio = (int)fn["uniquenessRatio"];
1541         params.P1 = (int)fn["P1"];
1542         params.P2 = (int)fn["P2"];
1543         params.mode = (int)fn["mode"];
1544     }
1545
1546     StereoSGBMParams params;
1547     Mat buffer;
1548
1549     // the number of stripes is fixed, disregarding the number of threads/processors
1550     // to make the results fully reproducible:
1551     static const int num_stripes = 4;
1552     Mat buffers[num_stripes];
1553
1554     static const char* name_;
1555 };
1556
1557 const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
1558
1559
1560 Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
1561                                  int P1, int P2, int disp12MaxDiff,
1562                                  int preFilterCap, int uniquenessRatio,
1563                                  int speckleWindowSize, int speckleRange,
1564                                  int mode)
1565 {
1566     return Ptr<StereoSGBM>(
1567         new StereoSGBMImpl(minDisparity, numDisparities, SADWindowSize,
1568                            P1, P2, disp12MaxDiff,
1569                            preFilterCap, uniquenessRatio,
1570                            speckleWindowSize, speckleRange,
1571                            mode));
1572 }
1573
1574 Rect getValidDisparityROI( Rect roi1, Rect roi2,
1575                           int minDisparity,
1576                           int numberOfDisparities,
1577                           int SADWindowSize )
1578 {
1579     int SW2 = SADWindowSize/2;
1580     int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
1581
1582     int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
1583     int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
1584     int ymin = std::max(roi1.y, roi2.y) + SW2;
1585     int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
1586
1587     Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
1588
1589     return r.width > 0 && r.height > 0 ? r : Rect();
1590 }
1591
1592 typedef cv::Point_<short> Point2s;
1593
1594 template <typename T>
1595 void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
1596 {
1597     using namespace cv;
1598
1599     int width = img.cols, height = img.rows, npixels = width*height;
1600     size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
1601     if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
1602         _buf.create(1, (int)bufSize, CV_8U);
1603
1604     uchar* buf = _buf.ptr();
1605     int i, j, dstep = (int)(img.step/sizeof(T));
1606     int* labels = (int*)buf;
1607     buf += npixels*sizeof(labels[0]);
1608     Point2s* wbuf = (Point2s*)buf;
1609     buf += npixels*sizeof(wbuf[0]);
1610     uchar* rtype = (uchar*)buf;
1611     int curlabel = 0;
1612
1613     // clear out label assignments
1614     memset(labels, 0, npixels*sizeof(labels[0]));
1615
1616     for( i = 0; i < height; i++ )
1617     {
1618         T* ds = img.ptr<T>(i);
1619         int* ls = labels + width*i;
1620
1621         for( j = 0; j < width; j++ )
1622         {
1623             if( ds[j] != newVal )   // not a bad disparity
1624             {
1625                 if( ls[j] )     // has a label, check for bad label
1626                 {
1627                     if( rtype[ls[j]] ) // small region, zero out disparity
1628                         ds[j] = (T)newVal;
1629                 }
1630                 // no label, assign and propagate
1631                 else
1632                 {
1633                     Point2s* ws = wbuf; // initialize wavefront
1634                     Point2s p((short)j, (short)i);  // current pixel
1635                     curlabel++; // next label
1636                     int count = 0;  // current region size
1637                     ls[j] = curlabel;
1638
1639                     // wavefront propagation
1640                     while( ws >= wbuf ) // wavefront not empty
1641                     {
1642                         count++;
1643                         // put neighbors onto wavefront
1644                         T* dpp = &img.at<T>(p.y, p.x);
1645                         T dp = *dpp;
1646                         int* lpp = labels + width*p.y + p.x;
1647
1648                         if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
1649                         {
1650                             lpp[+width] = curlabel;
1651                             *ws++ = Point2s(p.x, p.y+1);
1652                         }
1653
1654                         if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
1655                         {
1656                             lpp[-width] = curlabel;
1657                             *ws++ = Point2s(p.x, p.y-1);
1658                         }
1659
1660                         if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
1661                         {
1662                             lpp[+1] = curlabel;
1663                             *ws++ = Point2s(p.x+1, p.y);
1664                         }
1665
1666                         if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
1667                         {
1668                             lpp[-1] = curlabel;
1669                             *ws++ = Point2s(p.x-1, p.y);
1670                         }
1671
1672                         // pop most recent and propagate
1673                         // NB: could try least recent, maybe better convergence
1674                         p = *--ws;
1675                     }
1676
1677                     // assign label type
1678                     if( count <= maxSpeckleSize )   // speckle region
1679                     {
1680                         rtype[ls[j]] = 1;   // small region label
1681                         ds[j] = (T)newVal;
1682                     }
1683                     else
1684                         rtype[ls[j]] = 0;   // large region label
1685                 }
1686             }
1687         }
1688     }
1689 }
1690
1691 #ifdef HAVE_IPP
1692 static bool ipp_filterSpeckles(Mat &img, int maxSpeckleSize, int newVal, int maxDiff)
1693 {
1694 #if IPP_VERSION_X100 >= 810
1695     int type = img.type();
1696     Ipp32s bufsize = 0;
1697     IppiSize roisize = { img.cols, img.rows };
1698     IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;
1699     Ipp8u *pBuffer = NULL;
1700     IppStatus status = ippStsNoErr;
1701
1702     if(ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize) < 0)
1703         return false;
1704
1705     pBuffer = (Ipp8u*)ippMalloc(bufsize);
1706     if(!pBuffer && bufsize)
1707         return false;
1708
1709     if (type == CV_8UC1)
1710     {
1711         status = ippiMarkSpeckles_8u_C1IR(img.ptr<Ipp8u>(), (int)img.step, roisize,
1712                                             (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, pBuffer);
1713     }
1714     else
1715     {
1716         status = ippiMarkSpeckles_16s_C1IR(img.ptr<Ipp16s>(), (int)img.step, roisize,
1717                                             (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, pBuffer);
1718     }
1719     if(pBuffer) ippFree(pBuffer);
1720
1721     if (status >= 0)
1722         return true;
1723 #else
1724     CV_UNUSED(img); CV_UNUSED(maxSpeckleSize); CV_UNUSED(newVal); CV_UNUSED(maxDiff);
1725 #endif
1726     return false;
1727 }
1728 #endif
1729
1730 }
1731
1732 void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
1733                          double _maxDiff, InputOutputArray __buf )
1734 {
1735     Mat img = _img.getMat();
1736     int type = img.type();
1737     Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
1738     CV_Assert( type == CV_8UC1 || type == CV_16SC1 );
1739
1740     int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);
1741
1742     CV_IPP_RUN(IPP_VERSION_X100 >= 810 && !__buf.needed() && (type == CV_8UC1 || type == CV_16SC1), ipp_filterSpeckles(img, maxSpeckleSize, newVal, maxDiff));
1743
1744     if (type == CV_8UC1)
1745         filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1746     else
1747         filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1748 }
1749
1750 void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
1751                             int numberOfDisparities, int disp12MaxDiff )
1752 {
1753     Mat disp = _disp.getMat(), cost = _cost.getMat();
1754     int cols = disp.cols, rows = disp.rows;
1755     int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
1756     int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
1757     AutoBuffer<int> _disp2buf(cols*2);
1758     int* disp2buf = _disp2buf;
1759     int* disp2cost = disp2buf + cols;
1760     const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
1761     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1762     int costType = cost.type();
1763
1764     disp12MaxDiff *= DISP_SCALE;
1765
1766     CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
1767               (costType == CV_16S || costType == CV_32S) &&
1768               disp.size() == cost.size() );
1769
1770     for( int y = 0; y < rows; y++ )
1771     {
1772         short* dptr = disp.ptr<short>(y);
1773
1774         for( x = 0; x < cols; x++ )
1775         {
1776             disp2buf[x] = INVALID_DISP_SCALED;
1777             disp2cost[x] = INT_MAX;
1778         }
1779
1780         if( costType == CV_16S )
1781         {
1782             const short* cptr = cost.ptr<short>(y);
1783
1784             for( x = minX1; x < maxX1; x++ )
1785             {
1786                 int d = dptr[x], c = cptr[x];
1787
1788                 if( d == INVALID_DISP_SCALED )
1789                     continue;
1790
1791                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1792
1793                 if( disp2cost[x2] > c )
1794                 {
1795                     disp2cost[x2] = c;
1796                     disp2buf[x2] = d;
1797                 }
1798             }
1799         }
1800         else
1801         {
1802             const int* cptr = cost.ptr<int>(y);
1803
1804             for( x = minX1; x < maxX1; x++ )
1805             {
1806                 int d = dptr[x], c = cptr[x];
1807
1808                 if( d == INVALID_DISP_SCALED )
1809                     continue;
1810
1811                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1812
1813                 if( disp2cost[x2] > c )
1814                 {
1815                     disp2cost[x2] = c;
1816                     disp2buf[x2] = d;
1817                 }
1818             }
1819         }
1820
1821         for( x = minX1; x < maxX1; x++ )
1822         {
1823             // we round the computed disparity both towards -inf and +inf and check
1824             // if either of the corresponding disparities in disp2 is consistent.
1825             // This is to give the computed disparity a chance to look valid if it is.
1826             int d = dptr[x];
1827             if( d == INVALID_DISP_SCALED )
1828                 continue;
1829             int d0 = d >> DISP_SHIFT;
1830             int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
1831             int x0 = x - d0, x1 = x - d1;
1832             if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
1833                 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
1834                 dptr[x] = (short)INVALID_DISP_SCALED;
1835         }
1836     }
1837 }