modules/calib3d/src/stereosgbm.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 /*
  44  This is a variation of
  45  "Stereo Processing by Semiglobal Matching and Mutual Information"
  46  by Heiko Hirschmuller.
  47
  48  We match blocks rather than individual pixels, thus the algorithm is called
  49  SGBM (Semi-global block matching)
  50  */
  51
  52 #include "precomp.hpp"
  53 #include <limits.h>
  54
  55 namespace cv
  56 {
  57
  58 typedef uchar PixType;
  59 typedef short CostType;
  60 typedef short DispType;
  61
  62 enum { NR = 16, NR2 = NR/2 };
  63
  64 StereoSGBM::StereoSGBM()
  65 {
  66     minDisparity = numberOfDisparities = 0;
  67     SADWindowSize = 0;
  68     P1 = P2 = 0;
  69     disp12MaxDiff = 0;
  70     preFilterCap = 0;
  71     uniquenessRatio = 0;
  72     speckleWindowSize = 0;
  73     speckleRange = 0;
  74     fullDP = false;
  75 }
  76
  77
  78 StereoSGBM::StereoSGBM( int _minDisparity, int _numDisparities, int _SADWindowSize,
  79                    int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
  80                    int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
  81                    bool _fullDP )
  82 {
  83     minDisparity = _minDisparity;
  84     numberOfDisparities = _numDisparities;
  85     SADWindowSize = _SADWindowSize;
  86     P1 = _P1;
  87     P2 = _P2;
  88     disp12MaxDiff = _disp12MaxDiff;
  89     preFilterCap = _preFilterCap;
  90     uniquenessRatio = _uniquenessRatio;
  91     speckleWindowSize = _speckleWindowSize;
  92     speckleRange = _speckleRange;
  93     fullDP = _fullDP;
  94 }
  95
  96
  97 StereoSGBM::~StereoSGBM()
  98 {
  99 }
 100
 101 /*
 102  For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
 103  and for each disparity minD<=d<maxD the function
 104  computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
 105  row1[x] and row2[x-d]. The subpixel algorithm from
 106  "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
 107  is used, hence the suffix BT.
 108
 109  the temporary buffer should contain width2*2 elements
 110  */
 111 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
 112                             int minD, int maxD, CostType* cost,
 113                             PixType* buffer, const PixType* tab,
 114                             int tabOfs, int )
 115 {
 116     int x, c, width = img1.cols, cn = img1.channels();
 117     int minX1 = max(maxD, 0), maxX1 = width + min(minD, 0);
 118     int minX2 = max(minX1 - maxD, 0), maxX2 = min(maxX1 - minD, width);
 119     int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
 120     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
 121     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
 122
 123     tab += tabOfs;
 124
 125     for( c = 0; c < cn*2; c++ )
 126     {
 127         prow1[width*c] = prow1[width*c + width-1] =
 128         prow2[width*c] = prow2[width*c + width-1] = tab[0];
 129     }
 130
 131     int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
 132     int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
 133
 134     if( cn == 1 )
 135     {
 136         for( x = 1; x < width-1; x++ )
 137         {
 138             prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
 139             prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
 140
 141             prow1[x+width] = row1[x];
 142             prow2[width-1-x+width] = row2[x];
 143         }
 144     }
 145     else
 146     {
 147         for( x = 1; x < width-1; x++ )
 148         {
 149             prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
 150             prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
 151             prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
 152
 153             prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
 154             prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
 155             prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
 156
 157             prow1[x+width*3] = row1[x*3];
 158             prow1[x+width*4] = row1[x*3+1];
 159             prow1[x+width*5] = row1[x*3+2];
 160
 161             prow2[width-1-x+width*3] = row2[x*3];
 162             prow2[width-1-x+width*4] = row2[x*3+1];
 163             prow2[width-1-x+width*5] = row2[x*3+2];
 164         }
 165     }
 166
 167     memset( cost, 0, width1*D*sizeof(cost[0]) );
 168
 169     buffer -= minX2;
 170     cost -= minX1*D + minD; // simplify the cost indices inside the loop
 171
 172 #if CV_SSE2
 173     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 174 #endif
 175
 176 #if 1
 177     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
 178     {
 179         int diff_scale = c < cn ? 0 : 2;
 180
 181         // precompute
 182         //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
 183         //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
 184         for( x = minX2; x < maxX2; x++ )
 185         {
 186             int v = prow2[x];
 187             int vl = x > 0 ? (v + prow2[x-1])/2 : v;
 188             int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
 189             int v0 = min(vl, vr); v0 = min(v0, v);
 190             int v1 = max(vl, vr); v1 = max(v1, v);
 191             buffer[x] = (PixType)v0;
 192             buffer[x + width2] = (PixType)v1;
 193         }
 194
 195         for( x = minX1; x < maxX1; x++ )
 196         {
 197             int u = prow1[x];
 198             int ul = x > 0 ? (u + prow1[x-1])/2 : u;
 199             int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
 200             int u0 = min(ul, ur); u0 = min(u0, u);
 201             int u1 = max(ul, ur); u1 = max(u1, u);
 202
 203         #if CV_SSE2
 204             if( useSIMD )
 205             {
 206                 __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
 207                 __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
 208                 __m128i ds = _mm_cvtsi32_si128(diff_scale);
 209
 210                 for( int d = minD; d < maxD; d += 16 )
 211                 {
 212                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
 213                     __m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
 214                     __m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
 215                     __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
 216                     __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
 217                     __m128i diff = _mm_min_epu8(c0, c1);
 218
 219                     c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
 220                     c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
 221
 222                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
 223                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
 224                 }
 225             }
 226             else
 227         #endif
 228             {
 229                 for( int d = minD; d < maxD; d++ )
 230                 {
 231                     int v = prow2[width-x-1 + d];
 232                     int v0 = buffer[width-x-1 + d];
 233                     int v1 = buffer[width-x-1 + d + width2];
 234                     int c0 = max(0, u - v1); c0 = max(c0, v0 - u);
 235                     int c1 = max(0, v - u1); c1 = max(c1, u0 - v);
 236
 237                     cost[x*D + d] = (CostType)(cost[x*D+d] + (min(c0, c1) >> diff_scale));
 238                 }
 239             }
 240         }
 241     }
 242 #else
 243     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
 244     {
 245         for( x = minX1; x < maxX1; x++ )
 246         {
 247             int u = prow1[x];
 248         #if CV_SSE2
 249             if( useSIMD )
 250             {
 251                 __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
 252
 253                 for( int d = minD; d < maxD; d += 16 )
 254                 {
 255                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
 256                     __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
 257                     __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
 258                     __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
 259
 260                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
 261                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
 262                 }
 263             }
 264             else
 265         #endif
 266             {
 267                 for( int d = minD; d < maxD; d++ )
 268                 {
 269                     int v = prow2[width-1-x + d];
 270                     cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
 271                 }
 272             }
 273         }
 274     }
 275 #endif
 276 }
 277
 278
 279 /*
 280  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
 281  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
 282  minD <= d < maxD.
 283  disp2full is the reverse disparity map, that is:
 284  disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
 285
 286  note that disp1buf will have the same size as the roi and
 287  disp2full will have the same size as img1 (or img2).
 288  On exit disp2buf is not the final disparity, it is an intermediate result that becomes
 289  final after all the tiles are processed.
 290
 291  the disparity in disp1buf is written with sub-pixel accuracy
 292  (4 fractional bits, see CvStereoSGBM::DISP_SCALE),
 293  using quadratic interpolation, while the disparity in disp2buf
 294  is written as is, without interpolation.
 295
 296  disp2cost also has the same size as img1 (or img2).
 297  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
 298  */
 299 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 300                                  Mat& disp1, const StereoSGBM& params,
 301                                  Mat& buffer )
 302 {
 303 #if CV_SSE2
 304     static const uchar LSBTab[] =
 305     {
 306         0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 307         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 308         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 309         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 310         7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 311         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 312         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 313         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
 314     };
 315
 316     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 317 #endif
 318
 319     const int ALIGN = 16;
 320     const int DISP_SHIFT = StereoSGBM::DISP_SHIFT;
 321     const int DISP_SCALE = StereoSGBM::DISP_SCALE;
 322     const CostType MAX_COST = SHRT_MAX;
 323
 324     int minD = params.minDisparity, maxD = minD + params.numberOfDisparities;
 325     Size SADWindowSize;
 326     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
 327     int ftzero = max(params.preFilterCap, 15) | 1;
 328     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
 329     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
 330     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = max(params.P2 > 0 ? params.P2 : 5, P1+1);
 331     int k, width = disp1.cols, height = disp1.rows;
 332     int minX1 = max(maxD, 0), maxX1 = width + min(minD, 0);
 333     int D = maxD - minD, width1 = maxX1 - minX1;
 334     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
 335     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
 336     int npasses = params.fullDP ? 2 : 1;
 337     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
 338     PixType clipTab[TAB_SIZE];
 339
 340     for( k = 0; k < TAB_SIZE; k++ )
 341         clipTab[k] = (PixType)(min(max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 342
 343     if( minX1 >= maxX1 )
 344     {
 345         disp1 = Scalar::all(INVALID_DISP_SCALED);
 346         return;
 347     }
 348
 349     CV_Assert( D % 16 == 0 );
 350
 351     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
 352     // if you change NR, please, modify the loop as well.
 353     int D2 = D+16, NRD2 = NR2*D2;
 354
 355     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
 356     // for 8-way dynamic programming we need the current row and
 357     // the previous row, i.e. 2 rows in total
 358     const int NLR = 2;
 359     const int LrBorder = NLR - 1;
 360
 361     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
 362     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
 363     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
 364     size_t costBufSize = width1*D;
 365     size_t CSBufSize = costBufSize*(params.fullDP ? height : 1);
 366     size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
 367     int hsumBufNRows = SH2*2 + 2;
 368     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
 369     costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
 370     CSBufSize*2*sizeof(CostType) + // C, S
 371     width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
 372     width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
 373
 374     if( !buffer.data || !buffer.isContinuous() ||
 375         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
 376         buffer.create(1, (int)totalBufSize, CV_8U);
 377
 378     // summary cost over different (nDirs) directions
 379     CostType* Cbuf = (CostType*)alignPtr(buffer.data, ALIGN);
 380     CostType* Sbuf = Cbuf + CSBufSize;
 381     CostType* hsumBuf = Sbuf + CSBufSize;
 382     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
 383
 384     CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
 385     DispType* disp2ptr = (DispType*)(disp2cost + width);
 386     PixType* tempBuf = (PixType*)(disp2ptr + width);
 387
 388     // add P2 to every C(x,y). it saves a few operations in the inner loops
 389     for( k = 0; k < width1*D; k++ )
 390         Cbuf[k] = (CostType)P2;
 391
 392     for( int pass = 1; pass <= npasses; pass++ )
 393     {
 394         int x1, y1, x2, y2, dx, dy;
 395
 396         if( pass == 1 )
 397         {
 398             y1 = 0; y2 = height; dy = 1;
 399             x1 = 0; x2 = width1; dx = 1;
 400         }
 401         else
 402         {
 403             y1 = height-1; y2 = -1; dy = -1;
 404             x1 = width1-1; x2 = -1; dx = -1;
 405         }
 406
 407         CostType *Lr[NLR]={0}, *minLr[NLR]={0};
 408
 409         for( k = 0; k < NLR; k++ )
 410         {
 411             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
 412             // and will occasionally use negative indices with the arrays
 413             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
 414             // however, then the alignment will be imperfect, i.e. bad for SSE,
 415             // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
 416             Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
 417             memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
 418             minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
 419             memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
 420         }
 421
 422         for( int y = y1; y != y2; y += dy )
 423         {
 424             int x, d;
 425             DispType* disp1ptr = disp1.ptr<DispType>(y);
 426             CostType* C = Cbuf + (!params.fullDP ? 0 : y*costBufSize);
 427             CostType* S = Sbuf + (!params.fullDP ? 0 : y*costBufSize);
 428
 429             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
 430             {
 431                 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
 432
 433                 for( k = dy1; k <= dy2; k++ )
 434                 {
 435                     CostType* hsumAdd = hsumBuf + (min(k, height-1) % hsumBufNRows)*costBufSize;
 436
 437                     if( k < height )
 438                     {
 439                         calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
 440
 441                         memset(hsumAdd, 0, D*sizeof(CostType));
 442                         for( x = 0; x <= SW2*D; x += D )
 443                         {
 444                             int scale = x == 0 ? SW2 + 1 : 1;
 445                             for( d = 0; d < D; d++ )
 446                                 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
 447                         }
 448
 449                         if( y > 0 )
 450                         {
 451                             const CostType* hsumSub = hsumBuf + (max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
 452                             const CostType* Cprev = !params.fullDP || y == 0 ? C : C - costBufSize;
 453
 454                             for( x = D; x < width1*D; x += D )
 455                             {
 456                                 const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
 457                                 const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
 458
 459                             #if CV_SSE2
 460                                 if( useSIMD )
 461                                 {
 462                                     for( d = 0; d < D; d += 8 )
 463                                     {
 464                                         __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
 465                                         __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
 466                                         hv = _mm_adds_epi16(_mm_subs_epi16(hv,
 467                                                                            _mm_load_si128((const __m128i*)(pixSub + d))),
 468                                                             _mm_load_si128((const __m128i*)(pixAdd + d)));
 469                                         Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
 470                                                                            _mm_load_si128((const __m128i*)(hsumSub + x + d))),
 471                                                             hv);
 472                                         _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
 473                                         _mm_store_si128((__m128i*)(C + x + d), Cx);
 474                                     }
 475                                 }
 476                                 else
 477                             #endif
 478                                 {
 479                                     for( d = 0; d < D; d++ )
 480                                     {
 481                                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
 482                                         C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
 483                                     }
 484                                 }
 485                             }
 486                         }
 487                         else
 488                         {
 489                             for( x = D; x < width1*D; x += D )
 490                             {
 491                                 const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
 492                                 const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
 493
 494                                 for( d = 0; d < D; d++ )
 495                                     hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
 496                             }
 497                         }
 498                     }
 499
 500                     if( y == 0 )
 501                     {
 502                         int scale = k == 0 ? SH2 + 1 : 1;
 503                         for( x = 0; x < width1*D; x++ )
 504                             C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
 505                     }
 506                 }
 507
 508                 // also, clear the S buffer
 509                 for( k = 0; k < width1*D; k++ )
 510                     S[k] = 0;
 511             }
 512
 513             // clear the left and the right borders
 514             memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
 515             memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
 516             memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
 517             memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
 518
 519             /*
 520              [formula 13 in the paper]
 521              compute L_r(p, d) = C(p, d) +
 522              min(L_r(p-r, d),
 523              L_r(p-r, d-1) + P1,
 524              L_r(p-r, d+1) + P1,
 525              min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
 526              where p = (x,y), r is one of the directions.
 527              we process all the directions at once:
 528              0: r=(-dx, 0)
 529              1: r=(-1, -dy)
 530              2: r=(0, -dy)
 531              3: r=(1, -dy)
 532              4: r=(-2, -dy)
 533              5: r=(-1, -dy*2)
 534              6: r=(1, -dy*2)
 535              7: r=(2, -dy)
 536              */
 537             for( x = x1; x != x2; x += dx )
 538             {
 539                 int xm = x*NR2, xd = xm*D2;
 540
 541                 int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
 542                 int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
 543
 544                 CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
 545                 CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
 546                 CostType* Lr_p2 = Lr[1] + xd + D2*2;
 547                 CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
 548
 549                 Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
 550                 Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
 551
 552                 CostType* Lr_p = Lr[0] + xd;
 553                 const CostType* Cp = C + x*D;
 554                 CostType* Sp = S + x*D;
 555
 556             #if CV_SSE2
 557                 if( useSIMD )
 558                 {
 559                     __m128i _P1 = _mm_set1_epi16((short)P1);
 560
 561                     __m128i _delta0 = _mm_set1_epi16((short)delta0);
 562                     __m128i _delta1 = _mm_set1_epi16((short)delta1);
 563                     __m128i _delta2 = _mm_set1_epi16((short)delta2);
 564                     __m128i _delta3 = _mm_set1_epi16((short)delta3);
 565                     __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
 566
 567                     for( d = 0; d < D; d += 8 )
 568                     {
 569                         __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
 570                         __m128i L0, L1, L2, L3;
 571
 572                         L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
 573                         L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
 574                         L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
 575                         L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
 576
 577                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
 578                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
 579
 580                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
 581                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
 582
 583                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
 584                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
 585
 586                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
 587                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
 588
 589                         L0 = _mm_min_epi16(L0, _delta0);
 590                         L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
 591
 592                         L1 = _mm_min_epi16(L1, _delta1);
 593                         L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
 594
 595                         L2 = _mm_min_epi16(L2, _delta2);
 596                         L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
 597
 598                         L3 = _mm_min_epi16(L3, _delta3);
 599                         L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
 600
 601                         _mm_store_si128( (__m128i*)(Lr_p + d), L0);
 602                         _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
 603                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
 604                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
 605
 606                         __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
 607                         __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
 608                         t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
 609                         _minL0 = _mm_min_epi16(_minL0, t0);
 610
 611                         __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
 612
 613                         L0 = _mm_adds_epi16(L0, L1);
 614                         L2 = _mm_adds_epi16(L2, L3);
 615                         Sval = _mm_adds_epi16(Sval, L0);
 616                         Sval = _mm_adds_epi16(Sval, L2);
 617
 618                         _mm_store_si128((__m128i*)(Sp + d), Sval);
 619                     }
 620
 621                     _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
 622                     _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
 623                 }
 624                 else
 625             #endif
 626                 {
 627                     int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
 628
 629                     for( d = 0; d < D; d++ )
 630                     {
 631                         int Cpd = Cp[d], L0, L1, L2, L3;
 632
 633                         L0 = Cpd + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
 634                         L1 = Cpd + min((int)Lr_p1[d], min(Lr_p1[d-1] + P1, min(Lr_p1[d+1] + P1, delta1))) - delta1;
 635                         L2 = Cpd + min((int)Lr_p2[d], min(Lr_p2[d-1] + P1, min(Lr_p2[d+1] + P1, delta2))) - delta2;
 636                         L3 = Cpd + min((int)Lr_p3[d], min(Lr_p3[d-1] + P1, min(Lr_p3[d+1] + P1, delta3))) - delta3;
 637
 638                         Lr_p[d] = (CostType)L0;
 639                         minL0 = min(minL0, L0);
 640
 641                         Lr_p[d + D2] = (CostType)L1;
 642                         minL1 = min(minL1, L1);
 643
 644                         Lr_p[d + D2*2] = (CostType)L2;
 645                         minL2 = min(minL2, L2);
 646
 647                         Lr_p[d + D2*3] = (CostType)L3;
 648                         minL3 = min(minL3, L3);
 649
 650                         Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
 651                     }
 652                     minLr[0][xm] = (CostType)minL0;
 653                     minLr[0][xm+1] = (CostType)minL1;
 654                     minLr[0][xm+2] = (CostType)minL2;
 655                     minLr[0][xm+3] = (CostType)minL3;
 656                 }
 657             }
 658
 659             if( pass == npasses )
 660             {
 661                 for( x = 0; x < width; x++ )
 662                 {
 663                     disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
 664                     disp2cost[x] = MAX_COST;
 665                 }
 666
 667                 for( x = width1 - 1; x >= 0; x-- )
 668                 {
 669                     CostType* Sp = S + x*D;
 670                     int minS = MAX_COST, bestDisp = -1;
 671
 672                     if( npasses == 1 )
 673                     {
 674                         int xm = x*NR2, xd = xm*D2;
 675
 676                         int minL0 = MAX_COST;
 677                         int delta0 = minLr[0][xm + NR2] + P2;
 678                         CostType* Lr_p0 = Lr[0] + xd + NRD2;
 679                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
 680                         CostType* Lr_p = Lr[0] + xd;
 681
 682                         const CostType* Cp = C + x*D;
 683
 684                     #if CV_SSE2
 685                         if( useSIMD )
 686                         {
 687                             __m128i _P1 = _mm_set1_epi16((short)P1);
 688                             __m128i _delta0 = _mm_set1_epi16((short)delta0);
 689
 690                             __m128i _minL0 = _mm_set1_epi16((short)minL0);
 691                             __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
 692                             __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
 693
 694                             for( d = 0; d < D; d += 8 )
 695                             {
 696                                 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
 697
 698                                 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
 699                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
 700                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
 701                                 L0 = _mm_min_epi16(L0, _delta0);
 702                                 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
 703
 704                                 _mm_store_si128((__m128i*)(Lr_p + d), L0);
 705                                 _minL0 = _mm_min_epi16(_minL0, L0);
 706                                 L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
 707                                 _mm_store_si128((__m128i*)(Sp + d), L0);
 708
 709                                 __m128i mask = _mm_cmpgt_epi16(_minS, L0);
 710                                 _minS = _mm_min_epi16(_minS, L0);
 711                                 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
 712                                 _d8 = _mm_adds_epi16(_d8, _8);
 713                             }
 714
 715                             short CV_DECL_ALIGNED(16) bestDispBuf[8];
 716                             _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
 717
 718                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
 719                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
 720                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
 721
 722                             __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
 723                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
 724                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
 725
 726                             minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
 727                             minS = (CostType)_mm_cvtsi128_si32(qS);
 728
 729                             qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
 730                             qS = _mm_cmpeq_epi16(_minS, qS);
 731                             int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
 732
 733                             bestDisp = bestDispBuf[LSBTab[idx]];
 734                         }
 735                         else
 736                     #endif
 737                         {
 738                             for( d = 0; d < D; d++ )
 739                             {
 740                                 int L0 = Cp[d] + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
 741
 742                                 Lr_p[d] = (CostType)L0;
 743                                 minL0 = min(minL0, L0);
 744
 745                                 int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
 746                                 if( Sval < minS )
 747                                 {
 748                                     minS = Sval;
 749                                     bestDisp = d;
 750                                 }
 751                             }
 752                             minLr[0][xm] = (CostType)minL0;
 753                         }
 754                     }
 755                     else
 756                     {
 757                         for( d = 0; d < D; d++ )
 758                         {
 759                             int Sval = Sp[d];
 760                             if( Sval < minS )
 761                             {
 762                                 minS = Sval;
 763                                 bestDisp = d;
 764                             }
 765                         }
 766                     }
 767
 768                     for( d = 0; d < D; d++ )
 769                     {
 770                         if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
 771                             break;
 772                     }
 773                     if( d < D )
 774                         continue;
 775                     d = bestDisp;
 776                     int _x2 = x + minX1 - d - minD;
 777                     if( disp2cost[_x2] > minS )
 778                     {
 779                         disp2cost[_x2] = (CostType)minS;
 780                         disp2ptr[_x2] = (DispType)(d + minD);
 781                     }
 782
 783                     if( 0 < d && d < D-1 )
 784                     {
 785                         // do subpixel quadratic interpolation:
 786                         //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
 787                         //   then find minimum of the parabola.
 788                         int denom2 = max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
 789                         d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
 790                     }
 791                     else
 792                         d *= DISP_SCALE;
 793                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
 794                 }
 795
 796                 for( x = minX1; x < maxX1; x++ )
 797                 {
 798                     // we round the computed disparity both towards -inf and +inf and check
 799                     // if either of the corresponding disparities in disp2 is consistent.
 800                     // This is to give the computed disparity a chance to look valid if it is.
 801                     int d1 = disp1ptr[x];
 802                     if( d1 == INVALID_DISP_SCALED )
 803                         continue;
 804                     int _d = d1 >> DISP_SHIFT;
 805                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
 806                     int _x = x - _d, x_ = x - d_;
 807                     if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
 808                        0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
 809                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
 810                 }
 811             }
 812
 813             // now shift the cyclic buffers
 814             std::swap( Lr[0], Lr[1] );
 815             std::swap( minLr[0], minLr[1] );
 816         }
 817     }
 818 }
 819
 820 typedef cv::Point_<short> Point2s;
 821
 822 void StereoSGBM::operator ()( InputArray _left, InputArray _right,
 823                              OutputArray _disp )
 824 {
 825     Mat left = _left.getMat(), right = _right.getMat();
 826     CV_Assert( left.size() == right.size() && left.type() == right.type() &&
 827               left.depth() == DataType<PixType>::depth );
 828
 829     _disp.create( left.size(), CV_16S );
 830     Mat disp = _disp.getMat();
 831
 832     computeDisparitySGBM( left, right, disp, *this, buffer );
 833     medianBlur(disp, disp, 3);
 834
 835     if( speckleWindowSize > 0 )
 836         filterSpeckles(disp, (minDisparity - 1)*DISP_SCALE, speckleWindowSize, DISP_SCALE*speckleRange, buffer);
 837 }
 838
 839
 840 Rect getValidDisparityROI( Rect roi1, Rect roi2,
 841                           int minDisparity,
 842                           int numberOfDisparities,
 843                           int SADWindowSize )
 844 {
 845     int SW2 = SADWindowSize/2;
 846     int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
 847
 848     int xmin = max(roi1.x, roi2.x + maxD) + SW2;
 849     int xmax = min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
 850     int ymin = max(roi1.y, roi2.y) + SW2;
 851     int ymax = min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
 852
 853     Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
 854
 855     return r.width > 0 && r.height > 0 ? r : Rect();
 856 }
 857
 858 }
 859
 860 namespace
 861 {
 862     template <typename T>
 863     void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
 864     {
 865         using namespace cv;
 866
 867         int width = img.cols, height = img.rows, npixels = width*height;
 868         size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
 869         if( !_buf.isContinuous() || !_buf.data || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
 870             _buf.create(1, (int)bufSize, CV_8U);
 871
 872         uchar* buf = _buf.data;
 873         int i, j, dstep = (int)(img.step/sizeof(T));
 874         int* labels = (int*)buf;
 875         buf += npixels*sizeof(labels[0]);
 876         Point2s* wbuf = (Point2s*)buf;
 877         buf += npixels*sizeof(wbuf[0]);
 878         uchar* rtype = (uchar*)buf;
 879         int curlabel = 0;
 880
 881         // clear out label assignments
 882         memset(labels, 0, npixels*sizeof(labels[0]));
 883
 884         for( i = 0; i < height; i++ )
 885         {
 886             T* ds = img.ptr<T>(i);
 887             int* ls = labels + width*i;
 888
 889             for( j = 0; j < width; j++ )
 890             {
 891                 if( ds[j] != newVal )   // not a bad disparity
 892                 {
 893                     if( ls[j] )     // has a label, check for bad label
 894                     {
 895                         if( rtype[ls[j]] ) // small region, zero out disparity
 896                             ds[j] = (T)newVal;
 897                     }
 898                     // no label, assign and propagate
 899                     else
 900                     {
 901                         Point2s* ws = wbuf; // initialize wavefront
 902                         Point2s p((short)j, (short)i);  // current pixel
 903                         curlabel++; // next label
 904                         int count = 0;  // current region size
 905                         ls[j] = curlabel;
 906
 907                         // wavefront propagation
 908                         while( ws >= wbuf ) // wavefront not empty
 909                         {
 910                             count++;
 911                             // put neighbors onto wavefront
 912                             T* dpp = &img.at<T>(p.y, p.x);
 913                             T dp = *dpp;
 914                             int* lpp = labels + width*p.y + p.x;
 915
 916                             if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
 917                             {
 918                                 lpp[+1] = curlabel;
 919                                 *ws++ = Point2s(p.x+1, p.y);
 920                             }
 921
 922                             if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
 923                             {
 924                                 lpp[-1] = curlabel;
 925                                 *ws++ = Point2s(p.x-1, p.y);
 926                             }
 927
 928                             if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
 929                             {
 930                                 lpp[+width] = curlabel;
 931                                 *ws++ = Point2s(p.x, p.y+1);
 932                             }
 933
 934                             if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
 935                             {
 936                                 lpp[-width] = curlabel;
 937                                 *ws++ = Point2s(p.x, p.y-1);
 938                             }
 939
 940                             // pop most recent and propagate
 941                             // NB: could try least recent, maybe better convergence
 942                             p = *--ws;
 943                         }
 944
 945                         // assign label type
 946                         if( count <= maxSpeckleSize )   // speckle region
 947                         {
 948                             rtype[ls[j]] = 1;   // small region label
 949                             ds[j] = (T)newVal;
 950                         }
 951                         else
 952                             rtype[ls[j]] = 0;   // large region label
 953                     }
 954                 }
 955             }
 956         }
 957     }
 958 }
 959
 960 void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
 961                          double _maxDiff, InputOutputArray __buf )
 962 {
 963     Mat img = _img.getMat();
 964     Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
 965     CV_Assert( img.type() == CV_8UC1 || img.type() == CV_16SC1 );
 966
 967     int newVal = cvRound(_newval);
 968     int maxDiff = cvRound(_maxDiff);
 969
 970     if (img.type() == CV_8UC1)
 971         filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
 972     else
 973         filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
 974 }
 975
 976 void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
 977                             int numberOfDisparities, int disp12MaxDiff )
 978 {
 979     Mat disp = _disp.getMat(), cost = _cost.getMat();
 980     int cols = disp.cols, rows = disp.rows;
 981     int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
 982     int x, minX1 = max(maxD, 0), maxX1 = cols + min(minD, 0);
 983     AutoBuffer<int> _disp2buf(cols*2);
 984     int* disp2buf = _disp2buf;
 985     int* disp2cost = disp2buf + cols;
 986     const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
 987     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
 988     int costType = cost.type();
 989
 990     disp12MaxDiff *= DISP_SCALE;
 991
 992     CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
 993               (costType == CV_16S || costType == CV_32S) &&
 994               disp.size() == cost.size() );
 995
 996     for( int y = 0; y < rows; y++ )
 997     {
 998         short* dptr = disp.ptr<short>(y);
 999
1000         for( x = 0; x < cols; x++ )
1001         {
1002             disp2buf[x] = INVALID_DISP_SCALED;
1003             disp2cost[x] = INT_MAX;
1004         }
1005
1006         if( costType == CV_16S )
1007         {
1008             const short* cptr = cost.ptr<short>(y);
1009
1010             for( x = minX1; x < maxX1; x++ )
1011             {
1012                 int d = dptr[x], c = cptr[x];
1013                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1014
1015                 if( disp2cost[x2] > c )
1016                 {
1017                     disp2cost[x2] = c;
1018                     disp2buf[x2] = d;
1019                 }
1020             }
1021         }
1022         else
1023         {
1024             const int* cptr = cost.ptr<int>(y);
1025
1026             for( x = minX1; x < maxX1; x++ )
1027             {
1028                 int d = dptr[x], c = cptr[x];
1029                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1030
1031                 if( disp2cost[x2] < c )
1032                 {
1033                     disp2cost[x2] = c;
1034                     disp2buf[x2] = d;
1035                 }
1036             }
1037         }
1038
1039         for( x = minX1; x < maxX1; x++ )
1040         {
1041             // we round the computed disparity both towards -inf and +inf and check
1042             // if either of the corresponding disparities in disp2 is consistent.
1043             // This is to give the computed disparity a chance to look valid if it is.
1044             int d = dptr[x];
1045             if( d == INVALID_DISP_SCALED )
1046                 continue;
1047             int d0 = d >> DISP_SHIFT;
1048             int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
1049             int x0 = x - d0, x1 = x - d1;
1050             if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
1051                 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
1052                 dptr[x] = (short)INVALID_DISP_SCALED;
1053         }
1054     }
1055 }
1056
1057 CvRect cvGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity,
1058                                int numberOfDisparities, int SADWindowSize )
1059 {
1060     return (CvRect)cv::getValidDisparityROI( roi1, roi2, minDisparity,
1061                                              numberOfDisparities, SADWindowSize );
1062 }
1063
1064 void cvValidateDisparity( CvArr* _disp, const CvArr* _cost, int minDisparity,
1065                           int numberOfDisparities, int disp12MaxDiff )
1066 {
1067     cv::Mat disp = cv::cvarrToMat(_disp), cost = cv::cvarrToMat(_cost);
1068     cv::validateDisparity( disp, cost, minDisparity, numberOfDisparities, disp12MaxDiff );
1069 }