15b35fcf1716a1899bfd48b0985edfe8e585adc6
[platform/upstream/opencv.git] / modules / calib3d / src / stereosgbm.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43
44 /*
45  This is a variation of
46  "Stereo Processing by Semiglobal Matching and Mutual Information"
47  by Heiko Hirschmuller.
48
49  We match blocks rather than individual pixels, thus the algorithm is called
50  SGBM (Semi-global block matching)
51  */
52
53 #include "precomp.hpp"
54 #include <limits.h>
55 #include "opencv2/hal/intrin.hpp"
56
57 namespace cv
58 {
59
60 typedef uchar PixType;
61 typedef short CostType;
62 typedef short DispType;
63
64 enum { NR = 16, NR2 = NR/2 };
65
66
67 struct StereoSGBMParams
68 {
69     StereoSGBMParams()
70     {
71         minDisparity = numDisparities = 0;
72         SADWindowSize = 0;
73         P1 = P2 = 0;
74         disp12MaxDiff = 0;
75         preFilterCap = 0;
76         uniquenessRatio = 0;
77         speckleWindowSize = 0;
78         speckleRange = 0;
79         mode = StereoSGBM::MODE_SGBM;
80     }
81
82     StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
83                       int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
84                       int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
85                       int _mode )
86     {
87         minDisparity = _minDisparity;
88         numDisparities = _numDisparities;
89         SADWindowSize = _SADWindowSize;
90         P1 = _P1;
91         P2 = _P2;
92         disp12MaxDiff = _disp12MaxDiff;
93         preFilterCap = _preFilterCap;
94         uniquenessRatio = _uniquenessRatio;
95         speckleWindowSize = _speckleWindowSize;
96         speckleRange = _speckleRange;
97         mode = _mode;
98     }
99
100     int minDisparity;
101     int numDisparities;
102     int SADWindowSize;
103     int preFilterCap;
104     int uniquenessRatio;
105     int P1;
106     int P2;
107     int speckleWindowSize;
108     int speckleRange;
109     int disp12MaxDiff;
110     int mode;
111 };
112
113 /*
114  For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
115  and for each disparity minD<=d<maxD the function
116  computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
117  row1[x] and row2[x-d]. The subpixel algorithm from
118  "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
119  is used, hence the suffix BT.
120
121  the temporary buffer should contain width2*2 elements
122  */
123 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
124                             int minD, int maxD, CostType* cost,
125                             PixType* buffer, const PixType* tab,
126                             int tabOfs, int )
127 {
128     int x, c, width = img1.cols, cn = img1.channels();
129     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
130     int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
131     int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
132     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
133     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
134
135     tab += tabOfs;
136
137     for( c = 0; c < cn*2; c++ )
138     {
139         prow1[width*c] = prow1[width*c + width-1] =
140         prow2[width*c] = prow2[width*c + width-1] = tab[0];
141     }
142
143     int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
144     int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
145
146     if( cn == 1 )
147     {
148         for( x = 1; x < width-1; x++ )
149         {
150             prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
151             prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
152
153             prow1[x+width] = row1[x];
154             prow2[width-1-x+width] = row2[x];
155         }
156     }
157     else
158     {
159         for( x = 1; x < width-1; x++ )
160         {
161             prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
162             prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
163             prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
164
165             prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
166             prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
167             prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
168
169             prow1[x+width*3] = row1[x*3];
170             prow1[x+width*4] = row1[x*3+1];
171             prow1[x+width*5] = row1[x*3+2];
172
173             prow2[width-1-x+width*3] = row2[x*3];
174             prow2[width-1-x+width*4] = row2[x*3+1];
175             prow2[width-1-x+width*5] = row2[x*3+2];
176         }
177     }
178
179     memset( cost, 0, width1*D*sizeof(cost[0]) );
180
181     buffer -= minX2;
182     cost -= minX1*D + minD; // simplify the cost indices inside the loop
183
184 #if 1
185     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
186     {
187         int diff_scale = c < cn ? 0 : 2;
188
189         // precompute
190         //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
191         //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
192         for( x = minX2; x < maxX2; x++ )
193         {
194             int v = prow2[x];
195             int vl = x > 0 ? (v + prow2[x-1])/2 : v;
196             int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
197             int v0 = std::min(vl, vr); v0 = std::min(v0, v);
198             int v1 = std::max(vl, vr); v1 = std::max(v1, v);
199             buffer[x] = (PixType)v0;
200             buffer[x + width2] = (PixType)v1;
201         }
202
203         for( x = minX1; x < maxX1; x++ )
204         {
205             int u = prow1[x];
206             int ul = x > 0 ? (u + prow1[x-1])/2 : u;
207             int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
208             int u0 = std::min(ul, ur); u0 = std::min(u0, u);
209             int u1 = std::max(ul, ur); u1 = std::max(u1, u);
210
211         #if CV_SIMD128
212             v_uint8x16 _u  = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
213             v_uint8x16 _u1 = v_setall_u8((uchar)u1);
214
215             for( int d = minD; d < maxD; d += 16 )
216             {
217                 v_uint8x16 _v  = v_load(prow2  + width-x-1 + d);
218                 v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
219                 v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
220                 v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
221                 v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
222                 v_uint8x16 diff = v_min(c0, c1);
223
224                 v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
225                 v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);
226
227                 v_uint16x8 diff1,diff2;
228                 v_expand(diff,diff1,diff2);
229                 v_store_aligned(cost + x*D + d,     _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
230                 v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
231             }
232         #else
233             for( int d = minD; d < maxD; d++ )
234             {
235                 int v = prow2[width-x-1 + d];
236                 int v0 = buffer[width-x-1 + d];
237                 int v1 = buffer[width-x-1 + d + width2];
238                 int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
239                 int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
240
241                 cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
242             }
243         #endif
244         }
245     }
246 #else
247     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
248     {
249         for( x = minX1; x < maxX1; x++ )
250         {
251             int u = prow1[x];
252         #if CV_SSE2
253             if( useSIMD )
254             {
255                 __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
256
257                 for( int d = minD; d < maxD; d += 16 )
258                 {
259                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
260                     __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
261                     __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
262                     __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
263
264                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
265                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
266                 }
267             }
268             else
269         #endif
270             {
271                 for( int d = minD; d < maxD; d++ )
272                 {
273                     int v = prow2[width-1-x + d];
274                     cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
275                 }
276             }
277         }
278     }
279 #endif
280 }
281
282
283 /*
284  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
285  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
286  minD <= d < maxD.
287  disp2full is the reverse disparity map, that is:
288  disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
289
290  note that disp1buf will have the same size as the roi and
291  disp2full will have the same size as img1 (or img2).
292  On exit disp2buf is not the final disparity, it is an intermediate result that becomes
293  final after all the tiles are processed.
294
295  the disparity in disp1buf is written with sub-pixel accuracy
296  (4 fractional bits, see StereoSGBM::DISP_SCALE),
297  using quadratic interpolation, while the disparity in disp2buf
298  is written as is, without interpolation.
299
300  disp2cost also has the same size as img1 (or img2).
301  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
302  */
303 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
304                                  Mat& disp1, const StereoSGBMParams& params,
305                                  Mat& buffer )
306 {
307 #if CV_SSE2
308     static const uchar LSBTab[] =
309     {
310         0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
311         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
312         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
313         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
314         7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
315         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
316         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
317         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
318     };
319
320     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
321 #endif
322
323     const int ALIGN = 16;
324     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
325     const int DISP_SCALE = (1 << DISP_SHIFT);
326     const CostType MAX_COST = SHRT_MAX;
327
328     int minD = params.minDisparity, maxD = minD + params.numDisparities;
329     Size SADWindowSize;
330     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
331     int ftzero = std::max(params.preFilterCap, 15) | 1;
332     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
333     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
334     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
335     int k, width = disp1.cols, height = disp1.rows;
336     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
337     int D = maxD - minD, width1 = maxX1 - minX1;
338     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
339     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
340     bool fullDP = params.mode == StereoSGBM::MODE_HH;
341     int npasses = fullDP ? 2 : 1;
342     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
343     PixType clipTab[TAB_SIZE];
344
345     for( k = 0; k < TAB_SIZE; k++ )
346         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
347
348     if( minX1 >= maxX1 )
349     {
350         disp1 = Scalar::all(INVALID_DISP_SCALED);
351         return;
352     }
353
354     CV_Assert( D % 16 == 0 );
355
356     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
357     // if you change NR, please, modify the loop as well.
358     int D2 = D+16, NRD2 = NR2*D2;
359
360     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
361     // for 8-way dynamic programming we need the current row and
362     // the previous row, i.e. 2 rows in total
363     const int NLR = 2;
364     const int LrBorder = NLR - 1;
365
366     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
367     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
368     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
369     size_t costBufSize = width1*D;
370     size_t CSBufSize = costBufSize*(fullDP ? height : 1);
371     size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
372     int hsumBufNRows = SH2*2 + 2;
373     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
374     costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
375     CSBufSize*2*sizeof(CostType) + // C, S
376     width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
377     width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
378
379     if( buffer.empty() || !buffer.isContinuous() ||
380         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
381         buffer.create(1, (int)totalBufSize, CV_8U);
382
383     // summary cost over different (nDirs) directions
384     CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
385     CostType* Sbuf = Cbuf + CSBufSize;
386     CostType* hsumBuf = Sbuf + CSBufSize;
387     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
388
389     CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
390     DispType* disp2ptr = (DispType*)(disp2cost + width);
391     PixType* tempBuf = (PixType*)(disp2ptr + width);
392
393     // add P2 to every C(x,y). it saves a few operations in the inner loops
394     for( k = 0; k < width1*D; k++ )
395         Cbuf[k] = (CostType)P2;
396
397     for( int pass = 1; pass <= npasses; pass++ )
398     {
399         int x1, y1, x2, y2, dx, dy;
400
401         if( pass == 1 )
402         {
403             y1 = 0; y2 = height; dy = 1;
404             x1 = 0; x2 = width1; dx = 1;
405         }
406         else
407         {
408             y1 = height-1; y2 = -1; dy = -1;
409             x1 = width1-1; x2 = -1; dx = -1;
410         }
411
412         CostType *Lr[NLR]={0}, *minLr[NLR]={0};
413
414         for( k = 0; k < NLR; k++ )
415         {
416             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
417             // and will occasionally use negative indices with the arrays
418             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
419             // however, then the alignment will be imperfect, i.e. bad for SSE,
420             // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
421             Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
422             memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
423             minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
424             memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
425         }
426
427         for( int y = y1; y != y2; y += dy )
428         {
429             int x, d;
430             DispType* disp1ptr = disp1.ptr<DispType>(y);
431             CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
432             CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
433
434             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
435             {
436                 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
437
438                 for( k = dy1; k <= dy2; k++ )
439                 {
440                     CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
441
442                     if( k < height )
443                     {
444                         calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
445
446                         memset(hsumAdd, 0, D*sizeof(CostType));
447                         for( x = 0; x <= SW2*D; x += D )
448                         {
449                             int scale = x == 0 ? SW2 + 1 : 1;
450                             for( d = 0; d < D; d++ )
451                                 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
452                         }
453
454                         if( y > 0 )
455                         {
456                             const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
457                             const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
458
459                             for( x = D; x < width1*D; x += D )
460                             {
461                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
462                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
463
464                             #if CV_SSE2
465                                 if( useSIMD )
466                                 {
467                                     for( d = 0; d < D; d += 8 )
468                                     {
469                                         __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
470                                         __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
471                                         hv = _mm_adds_epi16(_mm_subs_epi16(hv,
472                                                                            _mm_load_si128((const __m128i*)(pixSub + d))),
473                                                             _mm_load_si128((const __m128i*)(pixAdd + d)));
474                                         Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
475                                                                            _mm_load_si128((const __m128i*)(hsumSub + x + d))),
476                                                             hv);
477                                         _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
478                                         _mm_store_si128((__m128i*)(C + x + d), Cx);
479                                     }
480                                 }
481                                 else
482                             #endif
483                                 {
484                                     for( d = 0; d < D; d++ )
485                                     {
486                                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
487                                         C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
488                                     }
489                                 }
490                             }
491                         }
492                         else
493                         {
494                             for( x = D; x < width1*D; x += D )
495                             {
496                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
497                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
498
499                                 for( d = 0; d < D; d++ )
500                                     hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
501                             }
502                         }
503                     }
504
505                     if( y == 0 )
506                     {
507                         int scale = k == 0 ? SH2 + 1 : 1;
508                         for( x = 0; x < width1*D; x++ )
509                             C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
510                     }
511                 }
512
513                 // also, clear the S buffer
514                 for( k = 0; k < width1*D; k++ )
515                     S[k] = 0;
516             }
517
518             // clear the left and the right borders
519             memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
520             memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
521             memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
522             memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
523
524             /*
525              [formula 13 in the paper]
526              compute L_r(p, d) = C(p, d) +
527              min(L_r(p-r, d),
528              L_r(p-r, d-1) + P1,
529              L_r(p-r, d+1) + P1,
530              min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
531              where p = (x,y), r is one of the directions.
532              we process all the directions at once:
533              0: r=(-dx, 0)
534              1: r=(-1, -dy)
535              2: r=(0, -dy)
536              3: r=(1, -dy)
537              4: r=(-2, -dy)
538              5: r=(-1, -dy*2)
539              6: r=(1, -dy*2)
540              7: r=(2, -dy)
541              */
542             for( x = x1; x != x2; x += dx )
543             {
544                 int xm = x*NR2, xd = xm*D2;
545
546                 int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
547                 int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
548
549                 CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
550                 CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
551                 CostType* Lr_p2 = Lr[1] + xd + D2*2;
552                 CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
553
554                 Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
555                 Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
556
557                 CostType* Lr_p = Lr[0] + xd;
558                 const CostType* Cp = C + x*D;
559                 CostType* Sp = S + x*D;
560
561             #if CV_SSE2
562                 if( useSIMD )
563                 {
564                     __m128i _P1 = _mm_set1_epi16((short)P1);
565
566                     __m128i _delta0 = _mm_set1_epi16((short)delta0);
567                     __m128i _delta1 = _mm_set1_epi16((short)delta1);
568                     __m128i _delta2 = _mm_set1_epi16((short)delta2);
569                     __m128i _delta3 = _mm_set1_epi16((short)delta3);
570                     __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
571
572                     for( d = 0; d < D; d += 8 )
573                     {
574                         __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
575                         __m128i L0, L1, L2, L3;
576
577                         L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
578                         L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
579                         L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
580                         L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
581
582                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
583                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
584
585                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
586                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
587
588                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
589                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
590
591                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
592                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
593
594                         L0 = _mm_min_epi16(L0, _delta0);
595                         L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
596
597                         L1 = _mm_min_epi16(L1, _delta1);
598                         L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
599
600                         L2 = _mm_min_epi16(L2, _delta2);
601                         L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
602
603                         L3 = _mm_min_epi16(L3, _delta3);
604                         L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
605
606                         _mm_store_si128( (__m128i*)(Lr_p + d), L0);
607                         _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
608                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
609                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
610
611                         __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
612                         __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
613                         t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
614                         _minL0 = _mm_min_epi16(_minL0, t0);
615
616                         __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
617
618                         L0 = _mm_adds_epi16(L0, L1);
619                         L2 = _mm_adds_epi16(L2, L3);
620                         Sval = _mm_adds_epi16(Sval, L0);
621                         Sval = _mm_adds_epi16(Sval, L2);
622
623                         _mm_store_si128((__m128i*)(Sp + d), Sval);
624                     }
625
626                     _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
627                     _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
628                 }
629                 else
630             #endif
631                 {
632                     int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
633
634                     for( d = 0; d < D; d++ )
635                     {
636                         int Cpd = Cp[d], L0, L1, L2, L3;
637
638                         L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
639                         L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
640                         L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
641                         L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;
642
643                         Lr_p[d] = (CostType)L0;
644                         minL0 = std::min(minL0, L0);
645
646                         Lr_p[d + D2] = (CostType)L1;
647                         minL1 = std::min(minL1, L1);
648
649                         Lr_p[d + D2*2] = (CostType)L2;
650                         minL2 = std::min(minL2, L2);
651
652                         Lr_p[d + D2*3] = (CostType)L3;
653                         minL3 = std::min(minL3, L3);
654
655                         Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
656                     }
657                     minLr[0][xm] = (CostType)minL0;
658                     minLr[0][xm+1] = (CostType)minL1;
659                     minLr[0][xm+2] = (CostType)minL2;
660                     minLr[0][xm+3] = (CostType)minL3;
661                 }
662             }
663
664             if( pass == npasses )
665             {
666                 for( x = 0; x < width; x++ )
667                 {
668                     disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
669                     disp2cost[x] = MAX_COST;
670                 }
671
672                 for( x = width1 - 1; x >= 0; x-- )
673                 {
674                     CostType* Sp = S + x*D;
675                     int minS = MAX_COST, bestDisp = -1;
676
677                     if( npasses == 1 )
678                     {
679                         int xm = x*NR2, xd = xm*D2;
680
681                         int minL0 = MAX_COST;
682                         int delta0 = minLr[0][xm + NR2] + P2;
683                         CostType* Lr_p0 = Lr[0] + xd + NRD2;
684                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
685                         CostType* Lr_p = Lr[0] + xd;
686
687                         const CostType* Cp = C + x*D;
688
689                     #if CV_SSE2
690                         if( useSIMD )
691                         {
692                             __m128i _P1 = _mm_set1_epi16((short)P1);
693                             __m128i _delta0 = _mm_set1_epi16((short)delta0);
694
695                             __m128i _minL0 = _mm_set1_epi16((short)minL0);
696                             __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
697                             __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
698
699                             for( d = 0; d < D; d += 8 )
700                             {
701                                 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
702
703                                 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
704                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
705                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
706                                 L0 = _mm_min_epi16(L0, _delta0);
707                                 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
708
709                                 _mm_store_si128((__m128i*)(Lr_p + d), L0);
710                                 _minL0 = _mm_min_epi16(_minL0, L0);
711                                 L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
712                                 _mm_store_si128((__m128i*)(Sp + d), L0);
713
714                                 __m128i mask = _mm_cmpgt_epi16(_minS, L0);
715                                 _minS = _mm_min_epi16(_minS, L0);
716                                 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
717                                 _d8 = _mm_adds_epi16(_d8, _8);
718                             }
719
720                             short CV_DECL_ALIGNED(16) bestDispBuf[8];
721                             _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
722
723                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
724                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
725                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
726
727                             __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
728                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
729                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
730
731                             minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
732                             minS = (CostType)_mm_cvtsi128_si32(qS);
733
734                             qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
735                             qS = _mm_cmpeq_epi16(_minS, qS);
736                             int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
737
738                             bestDisp = bestDispBuf[LSBTab[idx]];
739                         }
740                         else
741                     #endif
742                         {
743                             for( d = 0; d < D; d++ )
744                             {
745                                 int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
746
747                                 Lr_p[d] = (CostType)L0;
748                                 minL0 = std::min(minL0, L0);
749
750                                 int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
751                                 if( Sval < minS )
752                                 {
753                                     minS = Sval;
754                                     bestDisp = d;
755                                 }
756                             }
757                             minLr[0][xm] = (CostType)minL0;
758                         }
759                     }
760                     else
761                     {
762                         for( d = 0; d < D; d++ )
763                         {
764                             int Sval = Sp[d];
765                             if( Sval < minS )
766                             {
767                                 minS = Sval;
768                                 bestDisp = d;
769                             }
770                         }
771                     }
772
773                     for( d = 0; d < D; d++ )
774                     {
775                         if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
776                             break;
777                     }
778                     if( d < D )
779                         continue;
780                     d = bestDisp;
781                     int _x2 = x + minX1 - d - minD;
782                     if( disp2cost[_x2] > minS )
783                     {
784                         disp2cost[_x2] = (CostType)minS;
785                         disp2ptr[_x2] = (DispType)(d + minD);
786                     }
787
788                     if( 0 < d && d < D-1 )
789                     {
790                         // do subpixel quadratic interpolation:
791                         //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
792                         //   then find minimum of the parabola.
793                         int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
794                         d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
795                     }
796                     else
797                         d *= DISP_SCALE;
798                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
799                 }
800
801                 for( x = minX1; x < maxX1; x++ )
802                 {
803                     // we round the computed disparity both towards -inf and +inf and check
804                     // if either of the corresponding disparities in disp2 is consistent.
805                     // This is to give the computed disparity a chance to look valid if it is.
806                     int d1 = disp1ptr[x];
807                     if( d1 == INVALID_DISP_SCALED )
808                         continue;
809                     int _d = d1 >> DISP_SHIFT;
810                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
811                     int _x = x - _d, x_ = x - d_;
812                     if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
813                        0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
814                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
815                 }
816             }
817
818             // now shift the cyclic buffers
819             std::swap( Lr[0], Lr[1] );
820             std::swap( minLr[0], minLr[1] );
821         }
822     }
823 }
824
825 //////////////////////////////////////////////////////////////////////////////////////////////////////
826
827 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
828                        CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
829                        PixType*& tmpBuf, CostType*& horPassCostVolume,
830                        CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
831                        CostType*& disp2CostBuf, short*& disp2Buf);
832
833 struct SGBM3WayMainLoop : public ParallelLoopBody
834 {
835     Mat* buffers;
836     const Mat *img1, *img2;
837     Mat* dst_disp;
838
839     int nstripes, stripe_sz;
840     int stripe_overlap;
841
842     int width,height;
843     int minD, maxD, D;
844     int minX1, maxX1, width1;
845
846     int SW2, SH2;
847     int P1, P2;
848     int uniquenessRatio, disp12MaxDiff;
849
850     int costBufSize, hsumBufNRows;
851     int TAB_OFS, ftzero;
852
853     PixType* clipTab;
854
855     SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
856     void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
857     void operator () (const Range& range) const;
858 };
859
860 SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
861 buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
862 {
863     nstripes = _nstripes;
864     stripe_overlap = _stripe_overlap;
865     stripe_sz = (int)ceil(img1->rows/(double)nstripes);
866
867     width = img1->cols; height = img1->rows;
868     minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
869     minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
870     CV_Assert( D % 16 == 0 );
871
872     SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
873
874     P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
875     uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
876     disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
877
878     costBufSize = width1*D;
879     hsumBufNRows = SH2*2 + 2;
880     TAB_OFS = 256*4;
881     ftzero = std::max(params.preFilterCap, 15) | 1;
882 }
883
884 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
885                        CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
886                        PixType*& tmpBuf, CostType*& horPassCostVolume,
887                        CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
888                        CostType*& disp2CostBuf, short*& disp2Buf)
889 {
890     // allocating all the required memory:
891     int costVolumeLineSize = width1*D;
892     int width1_ext = width1+2;
893     int costVolumeLineSize_ext = width1_ext*D;
894     int hsumBufNRows = SH2*2 + 2;
895
896     // main buffer to store matching costs for the current line:
897     int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
898
899     // auxiliary buffers for the raw matching cost computation:
900     int hsumBufSize  = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
901     int pixDiffSize  = costVolumeLineSize*sizeof(CostType);
902     int tmpBufSize   = width*16*num_ch*sizeof(PixType);
903
904     // auxiliary buffers for the matching cost aggregation:
905     int horPassCostVolumeSize  = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
906     int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
907     int vertPassMinSize        = width1_ext*sizeof(CostType);             // buffer for storing minimum costs from the previous line
908     int rightPassBufSize       = D*sizeof(CostType);                      // additional small buffer for the right-to-left pass
909
910     // buffers for the pseudo-LRC check:
911     int disp2CostBufSize = width*sizeof(CostType);
912     int disp2BufSize     = width*sizeof(short);
913
914     // sum up the sizes of all the buffers:
915     size_t totalBufSize = curCostVolumeLineSize +
916                           hsumBufSize +
917                           pixDiffSize +
918                           tmpBufSize  +
919                           horPassCostVolumeSize +
920                           vertPassCostVolumeSize +
921                           vertPassMinSize +
922                           rightPassBufSize +
923                           disp2CostBufSize +
924                           disp2BufSize +
925                           16;  //to compensate for the alignPtr shifts
926
927     if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
928         buffer.create(1, (int)totalBufSize, CV_8U);
929
930     // set up all the pointers:
931     curCostVolumeLine  = (CostType*)alignPtr(buffer.ptr(), 16);
932     hsumBuf            = curCostVolumeLine + costVolumeLineSize;
933     pixDiff            = hsumBuf + costVolumeLineSize*hsumBufNRows;
934     tmpBuf             = (PixType*)(pixDiff + costVolumeLineSize);
935     horPassCostVolume  = (CostType*)(tmpBuf + width*16*num_ch);
936     vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
937     rightPassBuf       = vertPassCostVolume + costVolumeLineSize_ext;
938     vertPassMin        = rightPassBuf + D;
939     disp2CostBuf       = vertPassMin + width1_ext;
940     disp2Buf           = disp2CostBuf + width;
941
942     // initialize memory:
943     memset(buffer.ptr(),0,totalBufSize);
944     for(int i=0;i<costVolumeLineSize;i++)
945         curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
946 }
947
948 // performing block matching and building raw cost-volume for the current row
949 void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
950                                           CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
951                                           int y, int src_start_idx) const
952 {
953     int x, d;
954     int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
955
956     for(int k = dy1; k <= dy2; k++ )
957     {
958         CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
959         if( k < height )
960         {
961             calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );
962
963             memset(hsumAdd, 0, D*sizeof(CostType));
964             for(x = 0; x <= SW2*D; x += D )
965             {
966                 int scale = x == 0 ? SW2 + 1 : 1;
967
968                 for( d = 0; d < D; d++ )
969                     hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
970             }
971
972             if( y > src_start_idx )
973             {
974                 const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
975
976                 for( x = D; x < width1*D; x += D )
977                 {
978                     const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
979                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
980
981 #if CV_SIMD128
982                     v_int16x8 hv_reg;
983                     for( d = 0; d < D; d+=8 )
984                     {
985                         hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d));
986                         v_store_aligned(hsumAdd+x+d,hv_reg);
987                         v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d)));
988                     }
989 #else
990                     for( d = 0; d < D; d++ )
991                     {
992                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
993                         C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
994                     }
995 #endif
996                 }
997             }
998             else
999             {
1000                 for( x = D; x < width1*D; x += D )
1001                 {
1002                     const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
1003                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
1004
1005                     for( d = 0; d < D; d++ )
1006                         hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
1007                 }
1008             }
1009         }
1010
1011         if( y == src_start_idx )
1012         {
1013             int scale = k == src_start_idx ? SH2 + 1 : 1;
1014             for( x = 0; x < width1*D; x++ )
1015                 C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
1016         }
1017     }
1018 }
1019
1020 #if CV_SIMD128
1021 // define some additional reduce operations:
1022 inline short min(const v_int16x8& a)
1023 {
1024     short CV_DECL_ALIGNED(16) buf[8];
1025     v_store_aligned(buf, a);
1026     short s0 = std::min(buf[0], buf[1]);
1027     short s1 = std::min(buf[2], buf[3]);
1028     short s2 = std::min(buf[4], buf[5]);
1029     short s3 = std::min(buf[6], buf[7]);
1030     return std::min(std::min(s0, s1),std::min(s2, s3));
1031 }
1032
1033 inline short min_pos(const v_int16x8& val,const v_int16x8& pos)
1034 {
1035     short CV_DECL_ALIGNED(16) val_buf[8];
1036     v_store_aligned(val_buf, val);
1037     short CV_DECL_ALIGNED(16) pos_buf[8];
1038     v_store_aligned(pos_buf, pos);
1039     short res_pos = 0;
1040     short min_val = SHRT_MAX;
1041     if(val_buf[0]<min_val) {min_val=val_buf[0]; res_pos=pos_buf[0];}
1042     if(val_buf[1]<min_val) {min_val=val_buf[1]; res_pos=pos_buf[1];}
1043     if(val_buf[2]<min_val) {min_val=val_buf[2]; res_pos=pos_buf[2];}
1044     if(val_buf[3]<min_val) {min_val=val_buf[3]; res_pos=pos_buf[3];}
1045     if(val_buf[4]<min_val) {min_val=val_buf[4]; res_pos=pos_buf[4];}
1046     if(val_buf[5]<min_val) {min_val=val_buf[5]; res_pos=pos_buf[5];}
1047     if(val_buf[6]<min_val) {min_val=val_buf[6]; res_pos=pos_buf[6];}
1048     if(val_buf[7]<min_val) {min_val=val_buf[7]; res_pos=pos_buf[7];}
1049     return res_pos;
1050 }
1051 #endif
1052
1053 // performing SGM cost accumulation from left to right (result is stored in leftBuf) and
1054 // in-place cost accumulation from top to bottom (result is stored in topBuf)
1055 inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
1056                                    CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
1057 {
1058 #if CV_SIMD128
1059     v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
1060
1061     v_int16x8 leftMinCostP2_reg   = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
1062     v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);
1063     v_int16x8 src0_leftBuf        = v_setall_s16(SHRT_MAX);
1064     v_int16x8 src1_leftBuf        = v_load_aligned(leftBuf_prev);
1065
1066     v_int16x8 topMinCostP2_reg   = v_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));
1067     v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);
1068     v_int16x8 src0_topBuf        = v_setall_s16(SHRT_MAX);
1069     v_int16x8 src1_topBuf        = v_load_aligned(topBuf);
1070
1071     v_int16x8 src2;
1072     v_int16x8 src_shifted_left,src_shifted_right;
1073     v_int16x8 res;
1074
1075     for(int i=0;i<D-8;i+=8)
1076     {
1077         //process leftBuf:
1078         //lookahead load:
1079         src2 = v_load_aligned(leftBuf_prev+i+8);
1080
1081         //get shifted versions of the current block and add P1:
1082         src_shifted_left  = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
1083         src_shifted_right = v_extract<1> (src1_leftBuf,src2        ) + P1_reg;
1084
1085         // process and save current block:
1086         res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1087         leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
1088         v_store_aligned(leftBuf+i, res);
1089
1090         //update src buffers:
1091         src0_leftBuf = src1_leftBuf;
1092         src1_leftBuf = src2;
1093
1094         //process topBuf:
1095         //lookahead load:
1096         src2 = v_load_aligned(topBuf+i+8);
1097
1098         //get shifted versions of the current block and add P1:
1099         src_shifted_left  = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
1100         src_shifted_right = v_extract<1> (src1_topBuf,src2       ) + P1_reg;
1101
1102         // process and save current block:
1103         res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1104         topMinCost_new_reg = v_min(topMinCost_new_reg,res);
1105         v_store_aligned(topBuf+i, res);
1106
1107         //update src buffers:
1108         src0_topBuf = src1_topBuf;
1109         src1_topBuf = src2;
1110     }
1111
1112     // a bit different processing for the last cycle of the loop:
1113     //process leftBuf:
1114     src2 = v_setall_s16(SHRT_MAX);
1115     src_shifted_left  = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
1116     src_shifted_right = v_extract<1> (src1_leftBuf,src2        ) + P1_reg;
1117
1118     res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1119     leftMinCost = min(v_min(leftMinCost_new_reg,res));
1120     v_store_aligned(leftBuf+D-8, res);
1121
1122     //process topBuf:
1123     src2 = v_setall_s16(SHRT_MAX);
1124     src_shifted_left  = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
1125     src_shifted_right = v_extract<1> (src1_topBuf,src2       ) + P1_reg;
1126
1127     res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1128     topMinCost = min(v_min(topMinCost_new_reg,res));
1129     v_store_aligned(topBuf+D-8, res);
1130 #else
1131     CostType leftMinCost_new = SHRT_MAX;
1132     CostType topMinCost_new  = SHRT_MAX;
1133     int leftMinCost_P2  = leftMinCost + P2;
1134     int topMinCost_P2   = topMinCost  + P2;
1135     CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
1136     CostType topBuf_i_minus_1       = SHRT_MAX;
1137     CostType tmp;
1138
1139     for(int i=0;i<D-1;i++)
1140     {
1141         leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);
1142         leftBuf_prev_i_minus_1 = leftBuf_prev[i];
1143         leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);
1144
1145         tmp = topBuf[i];
1146         topBuf[i]  = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);
1147         topBuf_i_minus_1 = tmp;
1148         topMinCost_new  = std::min(topMinCost_new,topBuf[i]);
1149     }
1150
1151     leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);
1152     leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);
1153
1154     topBuf[D-1]  = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);
1155     topMinCost  = std::min(topMinCost_new,topBuf[D-1]);
1156 #endif
1157 }
1158
1159 // performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
1160 // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
1161 // optimal disparity value with minimum accumulated cost
1162 inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
1163                                  CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)
1164 {
1165 #if CV_SIMD128
1166     v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
1167
1168     v_int16x8 rightMinCostP2_reg   = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
1169     v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);
1170     v_int16x8 src0_rightBuf        = v_setall_s16(SHRT_MAX);
1171     v_int16x8 src1_rightBuf        = v_load(rightBuf);
1172
1173     v_int16x8 src2;
1174     v_int16x8 src_shifted_left,src_shifted_right;
1175     v_int16x8 res;
1176
1177     v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);
1178     v_int16x8 min_sum_pos_reg  = v_setall_s16(0);
1179     v_int16x8 loop_idx(0,1,2,3,4,5,6,7);
1180     v_int16x8 eight_reg = v_setall_s16(8);
1181
1182     for(int i=0;i<D-8;i+=8)
1183     {
1184         //lookahead load:
1185         src2 = v_load_aligned(rightBuf+i+8);
1186
1187         //get shifted versions of the current block and add P1:
1188         src_shifted_left  = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
1189         src_shifted_right = v_extract<1> (src1_rightBuf,src2         ) + P1_reg;
1190
1191         // process and save current block:
1192         res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
1193         rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
1194         v_store_aligned(rightBuf+i, res);
1195
1196         // compute and save total cost:
1197         res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i);
1198         v_store_aligned(leftBuf+i, res);
1199
1200         // track disparity value with the minimum cost:
1201         min_sum_cost_reg = v_min(min_sum_cost_reg,res);
1202         min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
1203         loop_idx = loop_idx+eight_reg;
1204
1205         //update src:
1206         src0_rightBuf    = src1_rightBuf;
1207         src1_rightBuf    = src2;
1208     }
1209
1210     // a bit different processing for the last cycle of the loop:
1211     src2 = v_setall_s16(SHRT_MAX);
1212     src_shifted_left  = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
1213     src_shifted_right = v_extract<1> (src1_rightBuf,src2         ) + P1_reg;
1214
1215     res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
1216     rightMinCost = min(v_min(rightMinCost_new_reg,res));
1217     v_store_aligned(rightBuf+D-8, res);
1218
1219     res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8);
1220     v_store_aligned(leftBuf+D-8, res);
1221
1222     min_sum_cost_reg = v_min(min_sum_cost_reg,res);
1223     min_cost = min(min_sum_cost_reg);
1224     min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
1225     optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg);
1226 #else
1227     CostType rightMinCost_new = SHRT_MAX;
1228     int rightMinCost_P2  = rightMinCost + P2;
1229     CostType rightBuf_i_minus_1 = SHRT_MAX;
1230     CostType tmp;
1231     min_cost = SHRT_MAX;
1232
1233     for(int i=0;i<D-1;i++)
1234     {
1235         tmp = rightBuf[i];
1236         rightBuf[i]  = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);
1237         rightBuf_i_minus_1 = tmp;
1238         rightMinCost_new  = std::min(rightMinCost_new,rightBuf[i]);
1239         leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);
1240         if(leftBuf[i]<min_cost)
1241         {
1242             optimal_disp = i;
1243             min_cost = leftBuf[i];
1244         }
1245     }
1246
1247     rightBuf[D-1]  = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);
1248     rightMinCost  = std::min(rightMinCost_new,rightBuf[D-1]);
1249     leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);
1250     if(leftBuf[D-1]<min_cost)
1251     {
1252         optimal_disp = D-1;
1253         min_cost = leftBuf[D-1];
1254     }
1255 #endif
1256 }
1257
1258 void SGBM3WayMainLoop::operator () (const Range& range) const
1259 {
1260     // force separate processing of stripes:
1261     if(range.end>range.start+1)
1262     {
1263         for(int n=range.start;n<range.end;n++)
1264             (*this)(Range(n,n+1));
1265         return;
1266     }
1267
1268     const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
1269     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1270
1271     // setting up the ranges:
1272     int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);
1273     int src_end_idx   = std::min(range.end   * stripe_sz, height);
1274
1275     int dst_offset;
1276     if(range.start==0)
1277         dst_offset=stripe_overlap;
1278     else
1279         dst_offset=0;
1280
1281     Mat cur_buffer = buffers [range.start];
1282     Mat cur_disp   = dst_disp[range.start];
1283     cur_disp = Scalar(INVALID_DISP_SCALED);
1284
1285     // prepare buffers:
1286     CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
1287     PixType* tmpBuf;
1288     CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
1289     short* disp2Buf;
1290     getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2,
1291                       curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
1292                       vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);
1293
1294     // start real processing:
1295     for(int y=src_start_idx;y<src_end_idx;y++)
1296     {
1297         getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);
1298
1299         short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
1300
1301         // initialize the auxiliary buffers for the pseudo left-right consistency check:
1302         for(int x=0;x<width;x++)
1303         {
1304             disp2Buf[x] = (short)INVALID_DISP_SCALED;
1305             disp2CostBuf[x] = SHRT_MAX;
1306         }
1307         CostType* C = curCostVolumeLine - D;
1308         CostType prev_min, min_cost;
1309         int d, best_d;
1310         d = best_d = 0;
1311
1312         // forward pass
1313         prev_min=0;
1314         for (int x=D;x<(1+width1)*D;x+=D)
1315             accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2);
1316
1317         //backward pass
1318         memset(rightPassBuf,0,D*sizeof(CostType));
1319         prev_min=0;
1320         for (int x=width1*D;x>=D;x-=D)
1321         {
1322             accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);
1323
1324             if(uniquenessRatio>0)
1325             {
1326 #if CV_SIMD128
1327                 horPassCostVolume+=x;
1328                 int thresh = (100*min_cost)/(100-uniquenessRatio);
1329                 v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1));
1330                 v_int16x8 d1 = v_setall_s16((short)(best_d-1));
1331                 v_int16x8 d2 = v_setall_s16((short)(best_d+1));
1332                 v_int16x8 eight_reg = v_setall_s16(8);
1333                 v_int16x8 cur_d(0,1,2,3,4,5,6,7);
1334                 v_int16x8 mask,cost1,cost2;
1335
1336                 for( d = 0; d < D; d+=16 )
1337                 {
1338                     cost1 = v_load_aligned(horPassCostVolume+d);
1339                     cost2 = v_load_aligned(horPassCostVolume+d+8);
1340
1341                     mask = cost1 < thresh_reg;
1342                     mask = mask & ( (cur_d<d1) | (cur_d>d2) );
1343                     if( v_signmask(mask) )
1344                         break;
1345
1346                     cur_d = cur_d+eight_reg;
1347
1348                     mask = cost2 < thresh_reg;
1349                     mask = mask & ( (cur_d<d1) | (cur_d>d2) );
1350                     if( v_signmask(mask) )
1351                         break;
1352
1353                     cur_d = cur_d+eight_reg;
1354                 }
1355                 horPassCostVolume-=x;
1356 #else
1357                 for( d = 0; d < D; d++ )
1358                 {
1359                     if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
1360                         break;
1361                 }
1362 #endif
1363                 if( d < D )
1364                     continue;
1365             }
1366             d = best_d;
1367
1368             int _x2 = x/D - 1 + minX1 - d - minD;
1369             if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
1370             {
1371                 disp2CostBuf[_x2] = min_cost;
1372                 disp2Buf[_x2] = (short)(d + minD);
1373             }
1374
1375             if( 0 < d && d < D-1 )
1376             {
1377                 // do subpixel quadratic interpolation:
1378                 //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
1379                 //   then find minimum of the parabola.
1380                 int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);
1381                 d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);
1382             }
1383             else
1384                 d *= DISP_SCALE;
1385
1386             disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);
1387         }
1388
1389         for(int x = minX1; x < maxX1; x++ )
1390         {
1391             // pseudo LRC consistency check using only one disparity map;
1392             // pixels with difference more than disp12MaxDiff are invalidated
1393             int d1 = disp_row[x];
1394             if( d1 == INVALID_DISP_SCALED )
1395                 continue;
1396             int _d = d1 >> StereoMatcher::DISP_SHIFT;
1397             int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
1398             int _x = x - _d, x_ = x - d_;
1399             if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
1400                 0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
1401                 disp_row[x] = (short)INVALID_DISP_SCALED;
1402         }
1403     }
1404 }
1405
1406 static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
1407                                       Mat& disp1, const StereoSGBMParams& params,
1408                                       Mat* buffers, int nstripes )
1409 {
1410     // precompute a lookup table for the raw matching cost computation:
1411     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
1412     PixType* clipTab = new PixType[TAB_SIZE];
1413     int ftzero = std::max(params.preFilterCap, 15) | 1;
1414     for(int k = 0; k < TAB_SIZE; k++ )
1415         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
1416
1417     // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
1418     int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
1419     int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
1420     Mat* dst_disp = new Mat[nstripes];
1421     for(int i=0;i<nstripes;i++)
1422         dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
1423
1424     parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
1425
1426     //assemble disp1 from dst_disp:
1427     short* dst_row;
1428     short* src_row;
1429     for(int i=0;i<disp1.rows;i++)
1430     {
1431         dst_row = (short*)disp1.ptr(i);
1432         src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
1433         memcpy(dst_row,src_row,disp1.cols*sizeof(short));
1434     }
1435
1436     delete[] clipTab;
1437     delete[] dst_disp;
1438 }
1439
1440 class StereoSGBMImpl : public StereoSGBM
1441 {
1442 public:
1443     StereoSGBMImpl()
1444     {
1445         params = StereoSGBMParams();
1446     }
1447
1448     StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
1449                     int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
1450                     int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
1451                     int _mode )
1452     {
1453         params = StereoSGBMParams( _minDisparity, _numDisparities, _SADWindowSize,
1454                                    _P1, _P2, _disp12MaxDiff, _preFilterCap,
1455                                    _uniquenessRatio, _speckleWindowSize, _speckleRange,
1456                                    _mode );
1457     }
1458
1459     void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
1460     {
1461         Mat left = leftarr.getMat(), right = rightarr.getMat();
1462         CV_Assert( left.size() == right.size() && left.type() == right.type() &&
1463                    left.depth() == CV_8U );
1464
1465         disparr.create( left.size(), CV_16S );
1466         Mat disp = disparr.getMat();
1467
1468         if(params.mode==MODE_SGBM_3WAY)
1469             computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
1470         else
1471             computeDisparitySGBM( left, right, disp, params, buffer );
1472
1473         medianBlur(disp, disp, 3);
1474
1475         if( params.speckleWindowSize > 0 )
1476             filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
1477                            StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
1478     }
1479
1480     int getMinDisparity() const { return params.minDisparity; }
1481     void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
1482
1483     int getNumDisparities() const { return params.numDisparities; }
1484     void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
1485
1486     int getBlockSize() const { return params.SADWindowSize; }
1487     void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
1488
1489     int getSpeckleWindowSize() const { return params.speckleWindowSize; }
1490     void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
1491
1492     int getSpeckleRange() const { return params.speckleRange; }
1493     void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
1494
1495     int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
1496     void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
1497
1498     int getPreFilterCap() const { return params.preFilterCap; }
1499     void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
1500
1501     int getUniquenessRatio() const { return params.uniquenessRatio; }
1502     void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
1503
1504     int getP1() const { return params.P1; }
1505     void setP1(int P1) { params.P1 = P1; }
1506
1507     int getP2() const { return params.P2; }
1508     void setP2(int P2) { params.P2 = P2; }
1509
1510     int getMode() const { return params.mode; }
1511     void setMode(int mode) { params.mode = mode; }
1512
1513     void write(FileStorage& fs) const
1514     {
1515         fs << "name" << name_
1516         << "minDisparity" << params.minDisparity
1517         << "numDisparities" << params.numDisparities
1518         << "blockSize" << params.SADWindowSize
1519         << "speckleWindowSize" << params.speckleWindowSize
1520         << "speckleRange" << params.speckleRange
1521         << "disp12MaxDiff" << params.disp12MaxDiff
1522         << "preFilterCap" << params.preFilterCap
1523         << "uniquenessRatio" << params.uniquenessRatio
1524         << "P1" << params.P1
1525         << "P2" << params.P2
1526         << "mode" << params.mode;
1527     }
1528
1529     void read(const FileNode& fn)
1530     {
1531         FileNode n = fn["name"];
1532         CV_Assert( n.isString() && String(n) == name_ );
1533         params.minDisparity = (int)fn["minDisparity"];
1534         params.numDisparities = (int)fn["numDisparities"];
1535         params.SADWindowSize = (int)fn["blockSize"];
1536         params.speckleWindowSize = (int)fn["speckleWindowSize"];
1537         params.speckleRange = (int)fn["speckleRange"];
1538         params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
1539         params.preFilterCap = (int)fn["preFilterCap"];
1540         params.uniquenessRatio = (int)fn["uniquenessRatio"];
1541         params.P1 = (int)fn["P1"];
1542         params.P2 = (int)fn["P2"];
1543         params.mode = (int)fn["mode"];
1544     }
1545
1546     StereoSGBMParams params;
1547     Mat buffer;
1548
1549     // the number of stripes is fixed, disregarding the number of threads/processors
1550     // to make the results fully reproducible:
1551     static const int num_stripes = 4;
1552     Mat buffers[num_stripes];
1553
1554     static const char* name_;
1555 };
1556
1557 const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
1558
1559
1560 Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
1561                                  int P1, int P2, int disp12MaxDiff,
1562                                  int preFilterCap, int uniquenessRatio,
1563                                  int speckleWindowSize, int speckleRange,
1564                                  int mode)
1565 {
1566     return Ptr<StereoSGBM>(
1567         new StereoSGBMImpl(minDisparity, numDisparities, SADWindowSize,
1568                            P1, P2, disp12MaxDiff,
1569                            preFilterCap, uniquenessRatio,
1570                            speckleWindowSize, speckleRange,
1571                            mode));
1572 }
1573
1574 Rect getValidDisparityROI( Rect roi1, Rect roi2,
1575                           int minDisparity,
1576                           int numberOfDisparities,
1577                           int SADWindowSize )
1578 {
1579     int SW2 = SADWindowSize/2;
1580     int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
1581
1582     int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
1583     int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
1584     int ymin = std::max(roi1.y, roi2.y) + SW2;
1585     int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
1586
1587     Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
1588
1589     return r.width > 0 && r.height > 0 ? r : Rect();
1590 }
1591
1592 typedef cv::Point_<short> Point2s;
1593
1594 template <typename T>
1595 void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
1596 {
1597     using namespace cv;
1598
1599     int width = img.cols, height = img.rows, npixels = width*height;
1600     size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
1601     if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
1602         _buf.create(1, (int)bufSize, CV_8U);
1603
1604     uchar* buf = _buf.ptr();
1605     int i, j, dstep = (int)(img.step/sizeof(T));
1606     int* labels = (int*)buf;
1607     buf += npixels*sizeof(labels[0]);
1608     Point2s* wbuf = (Point2s*)buf;
1609     buf += npixels*sizeof(wbuf[0]);
1610     uchar* rtype = (uchar*)buf;
1611     int curlabel = 0;
1612
1613     // clear out label assignments
1614     memset(labels, 0, npixels*sizeof(labels[0]));
1615
1616     for( i = 0; i < height; i++ )
1617     {
1618         T* ds = img.ptr<T>(i);
1619         int* ls = labels + width*i;
1620
1621         for( j = 0; j < width; j++ )
1622         {
1623             if( ds[j] != newVal )   // not a bad disparity
1624             {
1625                 if( ls[j] )     // has a label, check for bad label
1626                 {
1627                     if( rtype[ls[j]] ) // small region, zero out disparity
1628                         ds[j] = (T)newVal;
1629                 }
1630                 // no label, assign and propagate
1631                 else
1632                 {
1633                     Point2s* ws = wbuf; // initialize wavefront
1634                     Point2s p((short)j, (short)i);  // current pixel
1635                     curlabel++; // next label
1636                     int count = 0;  // current region size
1637                     ls[j] = curlabel;
1638
1639                     // wavefront propagation
1640                     while( ws >= wbuf ) // wavefront not empty
1641                     {
1642                         count++;
1643                         // put neighbors onto wavefront
1644                         T* dpp = &img.at<T>(p.y, p.x);
1645                         T dp = *dpp;
1646                         int* lpp = labels + width*p.y + p.x;
1647
1648                         if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
1649                         {
1650                             lpp[+width] = curlabel;
1651                             *ws++ = Point2s(p.x, p.y+1);
1652                         }
1653
1654                         if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
1655                         {
1656                             lpp[-width] = curlabel;
1657                             *ws++ = Point2s(p.x, p.y-1);
1658                         }
1659
1660                         if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
1661                         {
1662                             lpp[+1] = curlabel;
1663                             *ws++ = Point2s(p.x+1, p.y);
1664                         }
1665
1666                         if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
1667                         {
1668                             lpp[-1] = curlabel;
1669                             *ws++ = Point2s(p.x-1, p.y);
1670                         }
1671
1672                         // pop most recent and propagate
1673                         // NB: could try least recent, maybe better convergence
1674                         p = *--ws;
1675                     }
1676
1677                     // assign label type
1678                     if( count <= maxSpeckleSize )   // speckle region
1679                     {
1680                         rtype[ls[j]] = 1;   // small region label
1681                         ds[j] = (T)newVal;
1682                     }
1683                     else
1684                         rtype[ls[j]] = 0;   // large region label
1685                 }
1686             }
1687         }
1688     }
1689 }
1690
1691 #ifdef HAVE_IPP
1692 static bool ipp_filterSpeckles(Mat &img, int maxSpeckleSize, int newVal, int maxDiff)
1693 {
1694 #if IPP_VERSION_X100 >= 810
1695     int type = img.type();
1696     Ipp32s bufsize = 0;
1697     IppiSize roisize = { img.cols, img.rows };
1698     IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;
1699     Ipp8u *pBuffer = NULL;
1700     IppStatus status = ippStsNoErr;
1701
1702     if(ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize) < 0)
1703         return false;
1704
1705     pBuffer = (Ipp8u*)ippMalloc(bufsize);
1706     if(!pBuffer && bufsize)
1707         return false;
1708
1709     if (type == CV_8UC1)
1710     {
1711         status = ippiMarkSpeckles_8u_C1IR(img.ptr<Ipp8u>(), (int)img.step, roisize,
1712                                             (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, pBuffer);
1713     }
1714     else
1715     {
1716         status = ippiMarkSpeckles_16s_C1IR(img.ptr<Ipp16s>(), (int)img.step, roisize,
1717                                             (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, pBuffer);
1718     }
1719     if(pBuffer) ippFree(pBuffer);
1720
1721     if (status >= 0)
1722         return true;
1723 #else
1724     CV_UNUSED(img); CV_UNUSED(maxSpeckleSize); CV_UNUSED(newVal); CV_UNUSED(maxDiff);
1725 #endif
1726     return false;
1727 }
1728 #endif
1729
1730 }
1731
1732 void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
1733                          double _maxDiff, InputOutputArray __buf )
1734 {
1735     Mat img = _img.getMat();
1736     int type = img.type();
1737     Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
1738     CV_Assert( type == CV_8UC1 || type == CV_16SC1 );
1739
1740     int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);
1741
1742     CV_IPP_RUN(IPP_VERSION_X100 >= 810 && !__buf.needed() && (type == CV_8UC1 || type == CV_16SC1), ipp_filterSpeckles(img, maxSpeckleSize, newVal, maxDiff));
1743
1744     if (type == CV_8UC1)
1745         filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1746     else
1747         filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1748 }
1749
1750 void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
1751                             int numberOfDisparities, int disp12MaxDiff )
1752 {
1753     Mat disp = _disp.getMat(), cost = _cost.getMat();
1754     int cols = disp.cols, rows = disp.rows;
1755     int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
1756     int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
1757     AutoBuffer<int> _disp2buf(cols*2);
1758     int* disp2buf = _disp2buf;
1759     int* disp2cost = disp2buf + cols;
1760     const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
1761     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1762     int costType = cost.type();
1763
1764     disp12MaxDiff *= DISP_SCALE;
1765
1766     CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
1767               (costType == CV_16S || costType == CV_32S) &&
1768               disp.size() == cost.size() );
1769
1770     for( int y = 0; y < rows; y++ )
1771     {
1772         short* dptr = disp.ptr<short>(y);
1773
1774         for( x = 0; x < cols; x++ )
1775         {
1776             disp2buf[x] = INVALID_DISP_SCALED;
1777             disp2cost[x] = INT_MAX;
1778         }
1779
1780         if( costType == CV_16S )
1781         {
1782             const short* cptr = cost.ptr<short>(y);
1783
1784             for( x = minX1; x < maxX1; x++ )
1785             {
1786                 int d = dptr[x], c = cptr[x];
1787
1788                 if( d == INVALID_DISP_SCALED )
1789                     continue;
1790
1791                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1792
1793                 if( disp2cost[x2] > c )
1794                 {
1795                     disp2cost[x2] = c;
1796                     disp2buf[x2] = d;
1797                 }
1798             }
1799         }
1800         else
1801         {
1802             const int* cptr = cost.ptr<int>(y);
1803
1804             for( x = minX1; x < maxX1; x++ )
1805             {
1806                 int d = dptr[x], c = cptr[x];
1807
1808                 if( d == INVALID_DISP_SCALED )
1809                     continue;
1810
1811                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1812
1813                 if( disp2cost[x2] > c )
1814                 {
1815                     disp2cost[x2] = c;
1816                     disp2buf[x2] = d;
1817                 }
1818             }
1819         }
1820
1821         for( x = minX1; x < maxX1; x++ )
1822         {
1823             // we round the computed disparity both towards -inf and +inf and check
1824             // if either of the corresponding disparities in disp2 is consistent.
1825             // This is to give the computed disparity a chance to look valid if it is.
1826             int d = dptr[x];
1827             if( d == INVALID_DISP_SCALED )
1828                 continue;
1829             int d0 = d >> DISP_SHIFT;
1830             int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
1831             int x0 = x - d0, x1 = x - d1;
1832             if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
1833                 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
1834                 dptr[x] = (short)INVALID_DISP_SCALED;
1835         }
1836     }
1837 }