modules/calib3d/src/stereosgbm.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16 // Third party copyrights are property of their respective owners.
  17 //
  18 // Redistribution and use in source and binary forms, with or without modification,
  19 // are permitted provided that the following conditions are met:
  20 //
  21 //   * Redistribution's of source code must retain the above copyright notice,
  22 //     this list of conditions and the following disclaimer.
  23 //
  24 //   * Redistribution's in binary form must reproduce the above copyright notice,
  25 //     this list of conditions and the following disclaimer in the documentation
  26 //     and/or other materials provided with the distribution.
  27 //
  28 //   * The name of the copyright holders may not be used to endorse or promote products
  29 //     derived from this software without specific prior written permission.
  30 //
  31 // This software is provided by the copyright holders and contributors "as is" and
  32 // any express or implied warranties, including, but not limited to, the implied
  33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  34 // In no event shall the Intel Corporation or contributors be liable for any direct,
  35 // indirect, incidental, special, exemplary, or consequential damages
  36 // (including, but not limited to, procurement of substitute goods or services;
  37 // loss of use, data, or profits; or business interruption) however caused
  38 // and on any theory of liability, whether in contract, strict liability,
  39 // or tort (including negligence or otherwise) arising in any way out of
  40 // the use of this software, even if advised of the possibility of such damage.
  41 //
  42 //M*/
  43
  44 /*
  45  This is a variation of
  46  "Stereo Processing by Semiglobal Matching and Mutual Information"
  47  by Heiko Hirschmuller.
  48
  49  We match blocks rather than individual pixels, thus the algorithm is called
  50  SGBM (Semi-global block matching)
  51  */
  52
  53 #include "precomp.hpp"
  54 #include <limits.h>
  55
  56 namespace cv
  57 {
  58
  59 typedef uchar PixType;
  60 typedef short CostType;
  61 typedef short DispType;
  62
  63 enum { NR = 16, NR2 = NR/2 };
  64
  65
  66 struct StereoSGBMParams
  67 {
  68     StereoSGBMParams()
  69     {
  70         minDisparity = numDisparities = 0;
  71         SADWindowSize = 0;
  72         P1 = P2 = 0;
  73         disp12MaxDiff = 0;
  74         preFilterCap = 0;
  75         uniquenessRatio = 0;
  76         speckleWindowSize = 0;
  77         speckleRange = 0;
  78         mode = StereoSGBM::MODE_SGBM;
  79     }
  80
  81     StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
  82                       int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
  83                       int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
  84                       int _mode )
  85     {
  86         minDisparity = _minDisparity;
  87         numDisparities = _numDisparities;
  88         SADWindowSize = _SADWindowSize;
  89         P1 = _P1;
  90         P2 = _P2;
  91         disp12MaxDiff = _disp12MaxDiff;
  92         preFilterCap = _preFilterCap;
  93         uniquenessRatio = _uniquenessRatio;
  94         speckleWindowSize = _speckleWindowSize;
  95         speckleRange = _speckleRange;
  96         mode = _mode;
  97     }
  98
  99     int minDisparity;
 100     int numDisparities;
 101     int SADWindowSize;
 102     int preFilterCap;
 103     int uniquenessRatio;
 104     int P1;
 105     int P2;
 106     int speckleWindowSize;
 107     int speckleRange;
 108     int disp12MaxDiff;
 109     int mode;
 110 };
 111
 112 /*
 113  For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
 114  and for each disparity minD<=d<maxD the function
 115  computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
 116  row1[x] and row2[x-d]. The subpixel algorithm from
 117  "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
 118  is used, hence the suffix BT.
 119
 120  the temporary buffer should contain width2*2 elements
 121  */
 122 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
 123                             int minD, int maxD, CostType* cost,
 124                             PixType* buffer, const PixType* tab,
 125                             int tabOfs, int )
 126 {
 127     int x, c, width = img1.cols, cn = img1.channels();
 128     int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
 129     int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
 130     int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
 131     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
 132     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
 133
 134     tab += tabOfs;
 135
 136     for( c = 0; c < cn*2; c++ )
 137     {
 138         prow1[width*c] = prow1[width*c + width-1] =
 139         prow2[width*c] = prow2[width*c + width-1] = tab[0];
 140     }
 141
 142     int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
 143     int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
 144
 145     if( cn == 1 )
 146     {
 147         for( x = 1; x < width-1; x++ )
 148         {
 149             prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
 150             prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
 151
 152             prow1[x+width] = row1[x];
 153             prow2[width-1-x+width] = row2[x];
 154         }
 155     }
 156     else
 157     {
 158         for( x = 1; x < width-1; x++ )
 159         {
 160             prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
 161             prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
 162             prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
 163
 164             prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
 165             prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
 166             prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
 167
 168             prow1[x+width*3] = row1[x*3];
 169             prow1[x+width*4] = row1[x*3+1];
 170             prow1[x+width*5] = row1[x*3+2];
 171
 172             prow2[width-1-x+width*3] = row2[x*3];
 173             prow2[width-1-x+width*4] = row2[x*3+1];
 174             prow2[width-1-x+width*5] = row2[x*3+2];
 175         }
 176     }
 177
 178     memset( cost, 0, width1*D*sizeof(cost[0]) );
 179
 180     buffer -= minX2;
 181     cost -= minX1*D + minD; // simplify the cost indices inside the loop
 182
 183 #if CV_SSE2
 184     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 185 #endif
 186
 187 #if 1
 188     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
 189     {
 190         int diff_scale = c < cn ? 0 : 2;
 191
 192         // precompute
 193         //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
 194         //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
 195         for( x = minX2; x < maxX2; x++ )
 196         {
 197             int v = prow2[x];
 198             int vl = x > 0 ? (v + prow2[x-1])/2 : v;
 199             int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
 200             int v0 = std::min(vl, vr); v0 = std::min(v0, v);
 201             int v1 = std::max(vl, vr); v1 = std::max(v1, v);
 202             buffer[x] = (PixType)v0;
 203             buffer[x + width2] = (PixType)v1;
 204         }
 205
 206         for( x = minX1; x < maxX1; x++ )
 207         {
 208             int u = prow1[x];
 209             int ul = x > 0 ? (u + prow1[x-1])/2 : u;
 210             int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
 211             int u0 = std::min(ul, ur); u0 = std::min(u0, u);
 212             int u1 = std::max(ul, ur); u1 = std::max(u1, u);
 213
 214         #if CV_SSE2
 215             if( useSIMD )
 216             {
 217                 __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
 218                 __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
 219                 __m128i ds = _mm_cvtsi32_si128(diff_scale);
 220
 221                 for( int d = minD; d < maxD; d += 16 )
 222                 {
 223                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
 224                     __m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
 225                     __m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
 226                     __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
 227                     __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
 228                     __m128i diff = _mm_min_epu8(c0, c1);
 229
 230                     c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
 231                     c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
 232
 233                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
 234                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
 235                 }
 236             }
 237             else
 238         #endif
 239             {
 240                 for( int d = minD; d < maxD; d++ )
 241                 {
 242                     int v = prow2[width-x-1 + d];
 243                     int v0 = buffer[width-x-1 + d];
 244                     int v1 = buffer[width-x-1 + d + width2];
 245                     int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
 246                     int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
 247
 248                     cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
 249                 }
 250             }
 251         }
 252     }
 253 #else
 254     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
 255     {
 256         for( x = minX1; x < maxX1; x++ )
 257         {
 258             int u = prow1[x];
 259         #if CV_SSE2
 260             if( useSIMD )
 261             {
 262                 __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
 263
 264                 for( int d = minD; d < maxD; d += 16 )
 265                 {
 266                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
 267                     __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
 268                     __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
 269                     __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
 270
 271                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
 272                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
 273                 }
 274             }
 275             else
 276         #endif
 277             {
 278                 for( int d = minD; d < maxD; d++ )
 279                 {
 280                     int v = prow2[width-1-x + d];
 281                     cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
 282                 }
 283             }
 284         }
 285     }
 286 #endif
 287 }
 288
 289
 290 /*
 291  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
 292  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
 293  minD <= d < maxD.
 294  disp2full is the reverse disparity map, that is:
 295  disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
 296
 297  note that disp1buf will have the same size as the roi and
 298  disp2full will have the same size as img1 (or img2).
 299  On exit disp2buf is not the final disparity, it is an intermediate result that becomes
 300  final after all the tiles are processed.
 301
 302  the disparity in disp1buf is written with sub-pixel accuracy
 303  (4 fractional bits, see StereoSGBM::DISP_SCALE),
 304  using quadratic interpolation, while the disparity in disp2buf
 305  is written as is, without interpolation.
 306
 307  disp2cost also has the same size as img1 (or img2).
 308  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
 309  */
 310 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 311                                  Mat& disp1, const StereoSGBMParams& params,
 312                                  Mat& buffer )
 313 {
 314 #if CV_SSE2
 315     static const uchar LSBTab[] =
 316     {
 317         0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 318         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 319         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 320         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 321         7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 322         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 323         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 324         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
 325     };
 326
 327     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 328 #endif
 329
 330     const int ALIGN = 16;
 331     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
 332     const int DISP_SCALE = (1 << DISP_SHIFT);
 333     const CostType MAX_COST = SHRT_MAX;
 334
 335     int minD = params.minDisparity, maxD = minD + params.numDisparities;
 336     Size SADWindowSize;
 337     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
 338     int ftzero = std::max(params.preFilterCap, 15) | 1;
 339     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
 340     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
 341     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
 342     int k, width = disp1.cols, height = disp1.rows;
 343     int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
 344     int D = maxD - minD, width1 = maxX1 - minX1;
 345     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
 346     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
 347     bool fullDP = params.mode == StereoSGBM::MODE_HH;
 348     int npasses = fullDP ? 2 : 1;
 349     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
 350     PixType clipTab[TAB_SIZE];
 351
 352     for( k = 0; k < TAB_SIZE; k++ )
 353         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 354
 355     if( minX1 >= maxX1 )
 356     {
 357         disp1 = Scalar::all(INVALID_DISP_SCALED);
 358         return;
 359     }
 360
 361     CV_Assert( D % 16 == 0 );
 362
 363     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
 364     // if you change NR, please, modify the loop as well.
 365     int D2 = D+16, NRD2 = NR2*D2;
 366
 367     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
 368     // for 8-way dynamic programming we need the current row and
 369     // the previous row, i.e. 2 rows in total
 370     const int NLR = 2;
 371     const int LrBorder = NLR - 1;
 372
 373     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
 374     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
 375     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
 376     size_t costBufSize = width1*D;
 377     size_t CSBufSize = costBufSize*(fullDP ? height : 1);
 378     size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
 379     int hsumBufNRows = SH2*2 + 2;
 380     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
 381     costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
 382     CSBufSize*2*sizeof(CostType) + // C, S
 383     width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
 384     width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
 385
 386     if( !buffer.data || !buffer.isContinuous() ||
 387         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
 388         buffer.create(1, (int)totalBufSize, CV_8U);
 389
 390     // summary cost over different (nDirs) directions
 391     CostType* Cbuf = (CostType*)alignPtr(buffer.data, ALIGN);
 392     CostType* Sbuf = Cbuf + CSBufSize;
 393     CostType* hsumBuf = Sbuf + CSBufSize;
 394     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
 395
 396     CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
 397     DispType* disp2ptr = (DispType*)(disp2cost + width);
 398     PixType* tempBuf = (PixType*)(disp2ptr + width);
 399
 400     // add P2 to every C(x,y). it saves a few operations in the inner loops
 401     for( k = 0; k < width1*D; k++ )
 402         Cbuf[k] = (CostType)P2;
 403
 404     for( int pass = 1; pass <= npasses; pass++ )
 405     {
 406         int x1, y1, x2, y2, dx, dy;
 407
 408         if( pass == 1 )
 409         {
 410             y1 = 0; y2 = height; dy = 1;
 411             x1 = 0; x2 = width1; dx = 1;
 412         }
 413         else
 414         {
 415             y1 = height-1; y2 = -1; dy = -1;
 416             x1 = width1-1; x2 = -1; dx = -1;
 417         }
 418
 419         CostType *Lr[NLR]={0}, *minLr[NLR]={0};
 420
 421         for( k = 0; k < NLR; k++ )
 422         {
 423             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
 424             // and will occasionally use negative indices with the arrays
 425             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
 426             // however, then the alignment will be imperfect, i.e. bad for SSE,
 427             // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
 428             Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
 429             memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
 430             minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
 431             memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
 432         }
 433
 434         for( int y = y1; y != y2; y += dy )
 435         {
 436             int x, d;
 437             DispType* disp1ptr = disp1.ptr<DispType>(y);
 438             CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
 439             CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
 440
 441             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
 442             {
 443                 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
 444
 445                 for( k = dy1; k <= dy2; k++ )
 446                 {
 447                     CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
 448
 449                     if( k < height )
 450                     {
 451                         calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
 452
 453                         memset(hsumAdd, 0, D*sizeof(CostType));
 454                         for( x = 0; x <= SW2*D; x += D )
 455                         {
 456                             int scale = x == 0 ? SW2 + 1 : 1;
 457                             for( d = 0; d < D; d++ )
 458                                 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
 459                         }
 460
 461                         if( y > 0 )
 462                         {
 463                             const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
 464                             const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
 465
 466                             for( x = D; x < width1*D; x += D )
 467                             {
 468                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
 469                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
 470
 471                             #if CV_SSE2
 472                                 if( useSIMD )
 473                                 {
 474                                     for( d = 0; d < D; d += 8 )
 475                                     {
 476                                         __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
 477                                         __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
 478                                         hv = _mm_adds_epi16(_mm_subs_epi16(hv,
 479                                                                            _mm_load_si128((const __m128i*)(pixSub + d))),
 480                                                             _mm_load_si128((const __m128i*)(pixAdd + d)));
 481                                         Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
 482                                                                            _mm_load_si128((const __m128i*)(hsumSub + x + d))),
 483                                                             hv);
 484                                         _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
 485                                         _mm_store_si128((__m128i*)(C + x + d), Cx);
 486                                     }
 487                                 }
 488                                 else
 489                             #endif
 490                                 {
 491                                     for( d = 0; d < D; d++ )
 492                                     {
 493                                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
 494                                         C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
 495                                     }
 496                                 }
 497                             }
 498                         }
 499                         else
 500                         {
 501                             for( x = D; x < width1*D; x += D )
 502                             {
 503                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
 504                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
 505
 506                                 for( d = 0; d < D; d++ )
 507                                     hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
 508                             }
 509                         }
 510                     }
 511
 512                     if( y == 0 )
 513                     {
 514                         int scale = k == 0 ? SH2 + 1 : 1;
 515                         for( x = 0; x < width1*D; x++ )
 516                             C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
 517                     }
 518                 }
 519
 520                 // also, clear the S buffer
 521                 for( k = 0; k < width1*D; k++ )
 522                     S[k] = 0;
 523             }
 524
 525             // clear the left and the right borders
 526             memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
 527             memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
 528             memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
 529             memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
 530
 531             /*
 532              [formula 13 in the paper]
 533              compute L_r(p, d) = C(p, d) +
 534              min(L_r(p-r, d),
 535              L_r(p-r, d-1) + P1,
 536              L_r(p-r, d+1) + P1,
 537              min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
 538              where p = (x,y), r is one of the directions.
 539              we process all the directions at once:
 540              0: r=(-dx, 0)
 541              1: r=(-1, -dy)
 542              2: r=(0, -dy)
 543              3: r=(1, -dy)
 544              4: r=(-2, -dy)
 545              5: r=(-1, -dy*2)
 546              6: r=(1, -dy*2)
 547              7: r=(2, -dy)
 548              */
 549             for( x = x1; x != x2; x += dx )
 550             {
 551                 int xm = x*NR2, xd = xm*D2;
 552
 553                 int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
 554                 int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
 555
 556                 CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
 557                 CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
 558                 CostType* Lr_p2 = Lr[1] + xd + D2*2;
 559                 CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
 560
 561                 Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
 562                 Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
 563
 564                 CostType* Lr_p = Lr[0] + xd;
 565                 const CostType* Cp = C + x*D;
 566                 CostType* Sp = S + x*D;
 567
 568             #if CV_SSE2
 569                 if( useSIMD )
 570                 {
 571                     __m128i _P1 = _mm_set1_epi16((short)P1);
 572
 573                     __m128i _delta0 = _mm_set1_epi16((short)delta0);
 574                     __m128i _delta1 = _mm_set1_epi16((short)delta1);
 575                     __m128i _delta2 = _mm_set1_epi16((short)delta2);
 576                     __m128i _delta3 = _mm_set1_epi16((short)delta3);
 577                     __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
 578
 579                     for( d = 0; d < D; d += 8 )
 580                     {
 581                         __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
 582                         __m128i L0, L1, L2, L3;
 583
 584                         L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
 585                         L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
 586                         L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
 587                         L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
 588
 589                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
 590                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
 591
 592                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
 593                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
 594
 595                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
 596                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
 597
 598                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
 599                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
 600
 601                         L0 = _mm_min_epi16(L0, _delta0);
 602                         L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
 603
 604                         L1 = _mm_min_epi16(L1, _delta1);
 605                         L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
 606
 607                         L2 = _mm_min_epi16(L2, _delta2);
 608                         L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
 609
 610                         L3 = _mm_min_epi16(L3, _delta3);
 611                         L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
 612
 613                         _mm_store_si128( (__m128i*)(Lr_p + d), L0);
 614                         _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
 615                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
 616                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
 617
 618                         __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
 619                         __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
 620                         t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
 621                         _minL0 = _mm_min_epi16(_minL0, t0);
 622
 623                         __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
 624
 625                         L0 = _mm_adds_epi16(L0, L1);
 626                         L2 = _mm_adds_epi16(L2, L3);
 627                         Sval = _mm_adds_epi16(Sval, L0);
 628                         Sval = _mm_adds_epi16(Sval, L2);
 629
 630                         _mm_store_si128((__m128i*)(Sp + d), Sval);
 631                     }
 632
 633                     _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
 634                     _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
 635                 }
 636                 else
 637             #endif
 638                 {
 639                     int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
 640
 641                     for( d = 0; d < D; d++ )
 642                     {
 643                         int Cpd = Cp[d], L0, L1, L2, L3;
 644
 645                         L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
 646                         L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
 647                         L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
 648                         L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;
 649
 650                         Lr_p[d] = (CostType)L0;
 651                         minL0 = std::min(minL0, L0);
 652
 653                         Lr_p[d + D2] = (CostType)L1;
 654                         minL1 = std::min(minL1, L1);
 655
 656                         Lr_p[d + D2*2] = (CostType)L2;
 657                         minL2 = std::min(minL2, L2);
 658
 659                         Lr_p[d + D2*3] = (CostType)L3;
 660                         minL3 = std::min(minL3, L3);
 661
 662                         Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
 663                     }
 664                     minLr[0][xm] = (CostType)minL0;
 665                     minLr[0][xm+1] = (CostType)minL1;
 666                     minLr[0][xm+2] = (CostType)minL2;
 667                     minLr[0][xm+3] = (CostType)minL3;
 668                 }
 669             }
 670
 671             if( pass == npasses )
 672             {
 673                 for( x = 0; x < width; x++ )
 674                 {
 675                     disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
 676                     disp2cost[x] = MAX_COST;
 677                 }
 678
 679                 for( x = width1 - 1; x >= 0; x-- )
 680                 {
 681                     CostType* Sp = S + x*D;
 682                     int minS = MAX_COST, bestDisp = -1;
 683
 684                     if( npasses == 1 )
 685                     {
 686                         int xm = x*NR2, xd = xm*D2;
 687
 688                         int minL0 = MAX_COST;
 689                         int delta0 = minLr[0][xm + NR2] + P2;
 690                         CostType* Lr_p0 = Lr[0] + xd + NRD2;
 691                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
 692                         CostType* Lr_p = Lr[0] + xd;
 693
 694                         const CostType* Cp = C + x*D;
 695
 696                     #if CV_SSE2
 697                         if( useSIMD )
 698                         {
 699                             __m128i _P1 = _mm_set1_epi16((short)P1);
 700                             __m128i _delta0 = _mm_set1_epi16((short)delta0);
 701
 702                             __m128i _minL0 = _mm_set1_epi16((short)minL0);
 703                             __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
 704                             __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
 705
 706                             for( d = 0; d < D; d += 8 )
 707                             {
 708                                 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
 709
 710                                 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
 711                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
 712                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
 713                                 L0 = _mm_min_epi16(L0, _delta0);
 714                                 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
 715
 716                                 _mm_store_si128((__m128i*)(Lr_p + d), L0);
 717                                 _minL0 = _mm_min_epi16(_minL0, L0);
 718                                 L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
 719                                 _mm_store_si128((__m128i*)(Sp + d), L0);
 720
 721                                 __m128i mask = _mm_cmpgt_epi16(_minS, L0);
 722                                 _minS = _mm_min_epi16(_minS, L0);
 723                                 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
 724                                 _d8 = _mm_adds_epi16(_d8, _8);
 725                             }
 726
 727                             short CV_DECL_ALIGNED(16) bestDispBuf[8];
 728                             _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
 729
 730                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
 731                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
 732                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
 733
 734                             __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
 735                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
 736                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
 737
 738                             minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
 739                             minS = (CostType)_mm_cvtsi128_si32(qS);
 740
 741                             qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
 742                             qS = _mm_cmpeq_epi16(_minS, qS);
 743                             int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
 744
 745                             bestDisp = bestDispBuf[LSBTab[idx]];
 746                         }
 747                         else
 748                     #endif
 749                         {
 750                             for( d = 0; d < D; d++ )
 751                             {
 752                                 int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
 753
 754                                 Lr_p[d] = (CostType)L0;
 755                                 minL0 = std::min(minL0, L0);
 756
 757                                 int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
 758                                 if( Sval < minS )
 759                                 {
 760                                     minS = Sval;
 761                                     bestDisp = d;
 762                                 }
 763                             }
 764                             minLr[0][xm] = (CostType)minL0;
 765                         }
 766                     }
 767                     else
 768                     {
 769                         for( d = 0; d < D; d++ )
 770                         {
 771                             int Sval = Sp[d];
 772                             if( Sval < minS )
 773                             {
 774                                 minS = Sval;
 775                                 bestDisp = d;
 776                             }
 777                         }
 778                     }
 779
 780                     for( d = 0; d < D; d++ )
 781                     {
 782                         if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
 783                             break;
 784                     }
 785                     if( d < D )
 786                         continue;
 787                     d = bestDisp;
 788                     int _x2 = x + minX1 - d - minD;
 789                     if( disp2cost[_x2] > minS )
 790                     {
 791                         disp2cost[_x2] = (CostType)minS;
 792                         disp2ptr[_x2] = (DispType)(d + minD);
 793                     }
 794
 795                     if( 0 < d && d < D-1 )
 796                     {
 797                         // do subpixel quadratic interpolation:
 798                         //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
 799                         //   then find minimum of the parabola.
 800                         int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
 801                         d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
 802                     }
 803                     else
 804                         d *= DISP_SCALE;
 805                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
 806                 }
 807
 808                 for( x = minX1; x < maxX1; x++ )
 809                 {
 810                     // we round the computed disparity both towards -inf and +inf and check
 811                     // if either of the corresponding disparities in disp2 is consistent.
 812                     // This is to give the computed disparity a chance to look valid if it is.
 813                     int d1 = disp1ptr[x];
 814                     if( d1 == INVALID_DISP_SCALED )
 815                         continue;
 816                     int _d = d1 >> DISP_SHIFT;
 817                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
 818                     int _x = x - _d, x_ = x - d_;
 819                     if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
 820                        0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
 821                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
 822                 }
 823             }
 824
 825             // now shift the cyclic buffers
 826             std::swap( Lr[0], Lr[1] );
 827             std::swap( minLr[0], minLr[1] );
 828         }
 829     }
 830 }
 831
 832 class StereoSGBMImpl : public StereoSGBM
 833 {
 834 public:
 835     StereoSGBMImpl()
 836     {
 837         params = StereoSGBMParams();
 838     }
 839
 840     StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
 841                     int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
 842                     int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
 843                     int _mode )
 844     {
 845         params = StereoSGBMParams( _minDisparity, _numDisparities, _SADWindowSize,
 846                                    _P1, _P2, _disp12MaxDiff, _preFilterCap,
 847                                    _uniquenessRatio, _speckleWindowSize, _speckleRange,
 848                                    _mode );
 849     }
 850
 851     void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
 852     {
 853         Mat left = leftarr.getMat(), right = rightarr.getMat();
 854         CV_Assert( left.size() == right.size() && left.type() == right.type() &&
 855                    left.depth() == CV_8U );
 856
 857         disparr.create( left.size(), CV_16S );
 858         Mat disp = disparr.getMat();
 859
 860         computeDisparitySGBM( left, right, disp, params, buffer );
 861         medianBlur(disp, disp, 3);
 862
 863         if( params.speckleWindowSize > 0 )
 864             filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
 865                            StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
 866     }
 867
 868     AlgorithmInfo* info() const { return 0; }
 869
 870     int getMinDisparity() const { return params.minDisparity; }
 871     void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
 872
 873     int getNumDisparities() const { return params.numDisparities; }
 874     void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
 875
 876     int getBlockSize() const { return params.SADWindowSize; }
 877     void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
 878
 879     int getSpeckleWindowSize() const { return params.speckleWindowSize; }
 880     void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
 881
 882     int getSpeckleRange() const { return params.speckleRange; }
 883     void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
 884
 885     int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
 886     void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
 887
 888     int getPreFilterCap() const { return params.preFilterCap; }
 889     void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
 890
 891     int getUniquenessRatio() const { return params.uniquenessRatio; }
 892     void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
 893
 894     int getP1() const { return params.P1; }
 895     void setP1(int P1) { params.P1 = P1; }
 896
 897     int getP2() const { return params.P2; }
 898     void setP2(int P2) { params.P2 = P2; }
 899
 900     int getMode() const { return params.mode; }
 901     void setMode(int mode) { params.mode = mode; }
 902
 903     void write(FileStorage& fs) const
 904     {
 905         fs << "name" << name_
 906         << "minDisparity" << params.minDisparity
 907         << "numDisparities" << params.numDisparities
 908         << "blockSize" << params.SADWindowSize
 909         << "speckleWindowSize" << params.speckleWindowSize
 910         << "speckleRange" << params.speckleRange
 911         << "disp12MaxDiff" << params.disp12MaxDiff
 912         << "preFilterCap" << params.preFilterCap
 913         << "uniquenessRatio" << params.uniquenessRatio
 914         << "P1" << params.P1
 915         << "P2" << params.P2
 916         << "mode" << params.mode;
 917     }
 918
 919     void read(const FileNode& fn)
 920     {
 921         FileNode n = fn["name"];
 922         CV_Assert( n.isString() && String(n) == name_ );
 923         params.minDisparity = (int)fn["minDisparity"];
 924         params.numDisparities = (int)fn["numDisparities"];
 925         params.SADWindowSize = (int)fn["blockSize"];
 926         params.speckleWindowSize = (int)fn["speckleWindowSize"];
 927         params.speckleRange = (int)fn["speckleRange"];
 928         params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
 929         params.preFilterCap = (int)fn["preFilterCap"];
 930         params.uniquenessRatio = (int)fn["uniquenessRatio"];
 931         params.P1 = (int)fn["P1"];
 932         params.P2 = (int)fn["P2"];
 933         params.mode = (int)fn["mode"];
 934     }
 935
 936     StereoSGBMParams params;
 937     Mat buffer;
 938     static const char* name_;
 939 };
 940
 941 const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
 942
 943
 944 Ptr<StereoSGBM> createStereoSGBM(int minDisparity, int numDisparities, int SADWindowSize,
 945                                  int P1, int P2, int disp12MaxDiff,
 946                                  int preFilterCap, int uniquenessRatio,
 947                                  int speckleWindowSize, int speckleRange,
 948                                  int mode)
 949 {
 950     return Ptr<StereoSGBM>(
 951         new StereoSGBMImpl(minDisparity, numDisparities, SADWindowSize,
 952                            P1, P2, disp12MaxDiff,
 953                            preFilterCap, uniquenessRatio,
 954                            speckleWindowSize, speckleRange,
 955                            mode));
 956 }
 957
 958 Rect getValidDisparityROI( Rect roi1, Rect roi2,
 959                           int minDisparity,
 960                           int numberOfDisparities,
 961                           int SADWindowSize )
 962 {
 963     int SW2 = SADWindowSize/2;
 964     int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
 965
 966     int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
 967     int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
 968     int ymin = std::max(roi1.y, roi2.y) + SW2;
 969     int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
 970
 971     Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
 972
 973     return r.width > 0 && r.height > 0 ? r : Rect();
 974 }
 975
 976 typedef cv::Point_<short> Point2s;
 977
 978 template <typename T>
 979 void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
 980 {
 981     using namespace cv;
 982
 983     int width = img.cols, height = img.rows, npixels = width*height;
 984     size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
 985     if( !_buf.isContinuous() || !_buf.data || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
 986         _buf.create(1, (int)bufSize, CV_8U);
 987
 988     uchar* buf = _buf.data;
 989     int i, j, dstep = (int)(img.step/sizeof(T));
 990     int* labels = (int*)buf;
 991     buf += npixels*sizeof(labels[0]);
 992     Point2s* wbuf = (Point2s*)buf;
 993     buf += npixels*sizeof(wbuf[0]);
 994     uchar* rtype = (uchar*)buf;
 995     int curlabel = 0;
 996
 997     // clear out label assignments
 998     memset(labels, 0, npixels*sizeof(labels[0]));
 999
1000     for( i = 0; i < height; i++ )
1001     {
1002         T* ds = img.ptr<T>(i);
1003         int* ls = labels + width*i;
1004
1005         for( j = 0; j < width; j++ )
1006         {
1007             if( ds[j] != newVal )   // not a bad disparity
1008             {
1009                 if( ls[j] )     // has a label, check for bad label
1010                 {
1011                     if( rtype[ls[j]] ) // small region, zero out disparity
1012                         ds[j] = (T)newVal;
1013                 }
1014                 // no label, assign and propagate
1015                 else
1016                 {
1017                     Point2s* ws = wbuf; // initialize wavefront
1018                     Point2s p((short)j, (short)i);  // current pixel
1019                     curlabel++; // next label
1020                     int count = 0;  // current region size
1021                     ls[j] = curlabel;
1022
1023                     // wavefront propagation
1024                     while( ws >= wbuf ) // wavefront not empty
1025                     {
1026                         count++;
1027                         // put neighbors onto wavefront
1028                         T* dpp = &img.at<T>(p.y, p.x);
1029                         T dp = *dpp;
1030                         int* lpp = labels + width*p.y + p.x;
1031
1032                         if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
1033                         {
1034                             lpp[+1] = curlabel;
1035                             *ws++ = Point2s(p.x+1, p.y);
1036                         }
1037
1038                         if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
1039                         {
1040                             lpp[-1] = curlabel;
1041                             *ws++ = Point2s(p.x-1, p.y);
1042                         }
1043
1044                         if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
1045                         {
1046                             lpp[+width] = curlabel;
1047                             *ws++ = Point2s(p.x, p.y+1);
1048                         }
1049
1050                         if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
1051                         {
1052                             lpp[-width] = curlabel;
1053                             *ws++ = Point2s(p.x, p.y-1);
1054                         }
1055
1056                         // pop most recent and propagate
1057                         // NB: could try least recent, maybe better convergence
1058                         p = *--ws;
1059                     }
1060
1061                     // assign label type
1062                     if( count <= maxSpeckleSize )   // speckle region
1063                     {
1064                         rtype[ls[j]] = 1;   // small region label
1065                         ds[j] = (T)newVal;
1066                     }
1067                     else
1068                         rtype[ls[j]] = 0;   // large region label
1069                 }
1070             }
1071         }
1072     }
1073 }
1074
1075 }
1076
1077 void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
1078                          double _maxDiff, InputOutputArray __buf )
1079 {
1080     Mat img = _img.getMat();
1081     int type = img.type();
1082     Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
1083     CV_Assert( type == CV_8UC1 || type == CV_16SC1 );
1084
1085     int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);
1086
1087 #if IPP_VERSION_X100 >= 801 && !defined HAVE_IPP_ICV_ONLY
1088     Ipp32s bufsize = 0;
1089     IppiSize roisize = { img.cols, img.rows };
1090     IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;
1091
1092     if (!__buf.needed() && (type == CV_8UC1 || type == CV_16SC1))
1093     {
1094         IppStatus status = ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize);
1095         Ipp8u * buffer = ippsMalloc_8u(bufsize);
1096
1097         if ((int)status >= 0)
1098         {
1099             if (type == CV_8UC1)
1100                 status = ippiMarkSpeckles_8u_C1IR((Ipp8u *)img.data, (int)img.step, roisize,
1101                                                   (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer);
1102             else
1103                 status = ippiMarkSpeckles_16s_C1IR((Ipp16s *)img.data, (int)img.step, roisize,
1104                                                    (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer);
1105         }
1106
1107         if (status >= 0)
1108             return;
1109         setIppErrorStatus();
1110     }
1111 #endif
1112
1113     if (type == CV_8UC1)
1114         filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1115     else
1116         filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1117 }
1118
1119 void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
1120                             int numberOfDisparities, int disp12MaxDiff )
1121 {
1122     Mat disp = _disp.getMat(), cost = _cost.getMat();
1123     int cols = disp.cols, rows = disp.rows;
1124     int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
1125     int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
1126     AutoBuffer<int> _disp2buf(cols*2);
1127     int* disp2buf = _disp2buf;
1128     int* disp2cost = disp2buf + cols;
1129     const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
1130     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1131     int costType = cost.type();
1132
1133     disp12MaxDiff *= DISP_SCALE;
1134
1135     CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
1136               (costType == CV_16S || costType == CV_32S) &&
1137               disp.size() == cost.size() );
1138
1139     for( int y = 0; y < rows; y++ )
1140     {
1141         short* dptr = disp.ptr<short>(y);
1142
1143         for( x = 0; x < cols; x++ )
1144         {
1145             disp2buf[x] = INVALID_DISP_SCALED;
1146             disp2cost[x] = INT_MAX;
1147         }
1148
1149         if( costType == CV_16S )
1150         {
1151             const short* cptr = cost.ptr<short>(y);
1152
1153             for( x = minX1; x < maxX1; x++ )
1154             {
1155                 int d = dptr[x], c = cptr[x];
1156                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1157
1158                 if( disp2cost[x2] > c )
1159                 {
1160                     disp2cost[x2] = c;
1161                     disp2buf[x2] = d;
1162                 }
1163             }
1164         }
1165         else
1166         {
1167             const int* cptr = cost.ptr<int>(y);
1168
1169             for( x = minX1; x < maxX1; x++ )
1170             {
1171                 int d = dptr[x], c = cptr[x];
1172                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1173
1174                 if( disp2cost[x2] < c )
1175                 {
1176                     disp2cost[x2] = c;
1177                     disp2buf[x2] = d;
1178                 }
1179             }
1180         }
1181
1182         for( x = minX1; x < maxX1; x++ )
1183         {
1184             // we round the computed disparity both towards -inf and +inf and check
1185             // if either of the corresponding disparities in disp2 is consistent.
1186             // This is to give the computed disparity a chance to look valid if it is.
1187             int d = dptr[x];
1188             if( d == INVALID_DISP_SCALED )
1189                 continue;
1190             int d0 = d >> DISP_SHIFT;
1191             int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
1192             int x0 = x - d0, x1 = x - d1;
1193             if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
1194                 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
1195                 dptr[x] = (short)INVALID_DISP_SCALED;
1196         }
1197     }
1198 }