1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
21 // * Redistribution's of source code must retain the above copyright notice,
22 // this list of conditions and the following disclaimer.
24 // * Redistribution's in binary form must reproduce the above copyright notice,
25 // this list of conditions and the following disclaimer in the documentation
26 // and/or other materials provided with the distribution.
28 // * The name of the copyright holders may not be used to endorse or promote products
29 // derived from this software without specific prior written permission.
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
45 This is a variation of
46 "Stereo Processing by Semiglobal Matching and Mutual Information"
47 by Heiko Hirschmuller.
49 We match blocks rather than individual pixels, thus the algorithm is called
50 SGBM (Semi-global block matching)
53 #include "precomp.hpp"
55 #include "opencv2/hal/intrin.hpp"
60 typedef uchar PixType;
61 typedef short CostType;
62 typedef short DispType;
64 enum { NR = 16, NR2 = NR/2 };
67 struct StereoSGBMParams
71 minDisparity = numDisparities = 0;
77 speckleWindowSize = 0;
79 mode = StereoSGBM::MODE_SGBM;
82 StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
83 int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
84 int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
87 minDisparity = _minDisparity;
88 numDisparities = _numDisparities;
89 SADWindowSize = _SADWindowSize;
92 disp12MaxDiff = _disp12MaxDiff;
93 preFilterCap = _preFilterCap;
94 uniquenessRatio = _uniquenessRatio;
95 speckleWindowSize = _speckleWindowSize;
96 speckleRange = _speckleRange;
107 int speckleWindowSize;
114 For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
115 and for each disparity minD<=d<maxD the function
116 computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
117 row1[x] and row2[x-d]. The subpixel algorithm from
118 "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
119 is used, hence the suffix BT.
121 the temporary buffer should contain width2*2 elements
123 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
124 int minD, int maxD, CostType* cost,
125 PixType* buffer, const PixType* tab,
128 int x, c, width = img1.cols, cn = img1.channels();
129 int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
130 int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
131 int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
132 const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
133 PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
137 for( c = 0; c < cn*2; c++ )
139 prow1[width*c] = prow1[width*c + width-1] =
140 prow2[width*c] = prow2[width*c + width-1] = tab[0];
143 int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
144 int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
148 for( x = 1; x < width-1; x++ )
150 prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
151 prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
153 prow1[x+width] = row1[x];
154 prow2[width-1-x+width] = row2[x];
159 for( x = 1; x < width-1; x++ )
161 prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
162 prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
163 prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
165 prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
166 prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
167 prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
169 prow1[x+width*3] = row1[x*3];
170 prow1[x+width*4] = row1[x*3+1];
171 prow1[x+width*5] = row1[x*3+2];
173 prow2[width-1-x+width*3] = row2[x*3];
174 prow2[width-1-x+width*4] = row2[x*3+1];
175 prow2[width-1-x+width*5] = row2[x*3+2];
179 memset( cost, 0, width1*D*sizeof(cost[0]) );
182 cost -= minX1*D + minD; // simplify the cost indices inside the loop
185 for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
187 int diff_scale = c < cn ? 0 : 2;
190 // v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
191 // v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
192 for( x = minX2; x < maxX2; x++ )
195 int vl = x > 0 ? (v + prow2[x-1])/2 : v;
196 int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
197 int v0 = std::min(vl, vr); v0 = std::min(v0, v);
198 int v1 = std::max(vl, vr); v1 = std::max(v1, v);
199 buffer[x] = (PixType)v0;
200 buffer[x + width2] = (PixType)v1;
203 for( x = minX1; x < maxX1; x++ )
206 int ul = x > 0 ? (u + prow1[x-1])/2 : u;
207 int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
208 int u0 = std::min(ul, ur); u0 = std::min(u0, u);
209 int u1 = std::max(ul, ur); u1 = std::max(u1, u);
212 v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
213 v_uint8x16 _u1 = v_setall_u8((uchar)u1);
215 for( int d = minD; d < maxD; d += 16 )
217 v_uint8x16 _v = v_load(prow2 + width-x-1 + d);
218 v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
219 v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
220 v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
221 v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
222 v_uint8x16 diff = v_min(c0, c1);
224 v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
225 v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);
227 v_uint16x8 diff1,diff2;
228 v_expand(diff,diff1,diff2);
229 v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
230 v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
233 for( int d = minD; d < maxD; d++ )
235 int v = prow2[width-x-1 + d];
236 int v0 = buffer[width-x-1 + d];
237 int v1 = buffer[width-x-1 + d + width2];
238 int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
239 int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
241 cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
247 for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
249 for( x = minX1; x < maxX1; x++ )
255 __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
257 for( int d = minD; d < maxD; d += 16 )
259 __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
260 __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
261 __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
262 __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
264 _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
265 _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
271 for( int d = minD; d < maxD; d++ )
273 int v = prow2[width-1-x + d];
274 cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
284 computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
285 that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
287 disp2full is the reverse disparity map, that is:
288 disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
290 note that disp1buf will have the same size as the roi and
291 disp2full will have the same size as img1 (or img2).
292 On exit disp2buf is not the final disparity, it is an intermediate result that becomes
293 final after all the tiles are processed.
295 the disparity in disp1buf is written with sub-pixel accuracy
296 (4 fractional bits, see StereoSGBM::DISP_SCALE),
297 using quadratic interpolation, while the disparity in disp2buf
298 is written as is, without interpolation.
300 disp2cost also has the same size as img1 (or img2).
301 It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
303 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
304 Mat& disp1, const StereoSGBMParams& params,
308 static const uchar LSBTab[] =
310 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
311 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
312 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
313 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
314 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
315 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
316 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
317 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
320 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
323 const int ALIGN = 16;
324 const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
325 const int DISP_SCALE = (1 << DISP_SHIFT);
326 const CostType MAX_COST = SHRT_MAX;
328 int minD = params.minDisparity, maxD = minD + params.numDisparities;
330 SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
331 int ftzero = std::max(params.preFilterCap, 15) | 1;
332 int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
333 int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
334 int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
335 int k, width = disp1.cols, height = disp1.rows;
336 int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
337 int D = maxD - minD, width1 = maxX1 - minX1;
338 int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
339 int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
340 bool fullDP = params.mode == StereoSGBM::MODE_HH;
341 int npasses = fullDP ? 2 : 1;
342 const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
343 PixType clipTab[TAB_SIZE];
345 for( k = 0; k < TAB_SIZE; k++ )
346 clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
350 disp1 = Scalar::all(INVALID_DISP_SCALED);
354 CV_Assert( D % 16 == 0 );
356 // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
357 // if you change NR, please, modify the loop as well.
358 int D2 = D+16, NRD2 = NR2*D2;
360 // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
361 // for 8-way dynamic programming we need the current row and
362 // the previous row, i.e. 2 rows in total
364 const int LrBorder = NLR - 1;
366 // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
367 // we keep pixel difference cost (C) and the summary cost over NR directions (S).
368 // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
369 size_t costBufSize = width1*D;
370 size_t CSBufSize = costBufSize*(fullDP ? height : 1);
371 size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
372 int hsumBufNRows = SH2*2 + 2;
373 size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
374 costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
375 CSBufSize*2*sizeof(CostType) + // C, S
376 width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
377 width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
379 if( buffer.empty() || !buffer.isContinuous() ||
380 buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
381 buffer.create(1, (int)totalBufSize, CV_8U);
383 // summary cost over different (nDirs) directions
384 CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
385 CostType* Sbuf = Cbuf + CSBufSize;
386 CostType* hsumBuf = Sbuf + CSBufSize;
387 CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
389 CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
390 DispType* disp2ptr = (DispType*)(disp2cost + width);
391 PixType* tempBuf = (PixType*)(disp2ptr + width);
393 // add P2 to every C(x,y). it saves a few operations in the inner loops
394 for( k = 0; k < width1*D; k++ )
395 Cbuf[k] = (CostType)P2;
397 for( int pass = 1; pass <= npasses; pass++ )
399 int x1, y1, x2, y2, dx, dy;
403 y1 = 0; y2 = height; dy = 1;
404 x1 = 0; x2 = width1; dx = 1;
408 y1 = height-1; y2 = -1; dy = -1;
409 x1 = width1-1; x2 = -1; dx = -1;
412 CostType *Lr[NLR]={0}, *minLr[NLR]={0};
414 for( k = 0; k < NLR; k++ )
416 // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
417 // and will occasionally use negative indices with the arrays
418 // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
419 // however, then the alignment will be imperfect, i.e. bad for SSE,
420 // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
421 Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
422 memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
423 minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
424 memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
427 for( int y = y1; y != y2; y += dy )
430 DispType* disp1ptr = disp1.ptr<DispType>(y);
431 CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
432 CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
434 if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
436 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
438 for( k = dy1; k <= dy2; k++ )
440 CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
444 calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
446 memset(hsumAdd, 0, D*sizeof(CostType));
447 for( x = 0; x <= SW2*D; x += D )
449 int scale = x == 0 ? SW2 + 1 : 1;
450 for( d = 0; d < D; d++ )
451 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
456 const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
457 const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
459 for( x = D; x < width1*D; x += D )
461 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
462 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
467 for( d = 0; d < D; d += 8 )
469 __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
470 __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
471 hv = _mm_adds_epi16(_mm_subs_epi16(hv,
472 _mm_load_si128((const __m128i*)(pixSub + d))),
473 _mm_load_si128((const __m128i*)(pixAdd + d)));
474 Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
475 _mm_load_si128((const __m128i*)(hsumSub + x + d))),
477 _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
478 _mm_store_si128((__m128i*)(C + x + d), Cx);
484 for( d = 0; d < D; d++ )
486 int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
487 C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
494 for( x = D; x < width1*D; x += D )
496 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
497 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
499 for( d = 0; d < D; d++ )
500 hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
507 int scale = k == 0 ? SH2 + 1 : 1;
508 for( x = 0; x < width1*D; x++ )
509 C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
513 // also, clear the S buffer
514 for( k = 0; k < width1*D; k++ )
518 // clear the left and the right borders
519 memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
520 memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
521 memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
522 memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
525 [formula 13 in the paper]
526 compute L_r(p, d) = C(p, d) +
530 min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
531 where p = (x,y), r is one of the directions.
532 we process all the directions at once:
542 for( x = x1; x != x2; x += dx )
544 int xm = x*NR2, xd = xm*D2;
546 int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
547 int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
549 CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
550 CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
551 CostType* Lr_p2 = Lr[1] + xd + D2*2;
552 CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
554 Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
555 Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
557 CostType* Lr_p = Lr[0] + xd;
558 const CostType* Cp = C + x*D;
559 CostType* Sp = S + x*D;
564 __m128i _P1 = _mm_set1_epi16((short)P1);
566 __m128i _delta0 = _mm_set1_epi16((short)delta0);
567 __m128i _delta1 = _mm_set1_epi16((short)delta1);
568 __m128i _delta2 = _mm_set1_epi16((short)delta2);
569 __m128i _delta3 = _mm_set1_epi16((short)delta3);
570 __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
572 for( d = 0; d < D; d += 8 )
574 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
575 __m128i L0, L1, L2, L3;
577 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
578 L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
579 L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
580 L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
582 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
583 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
585 L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
586 L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
588 L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
589 L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
591 L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
592 L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
594 L0 = _mm_min_epi16(L0, _delta0);
595 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
597 L1 = _mm_min_epi16(L1, _delta1);
598 L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
600 L2 = _mm_min_epi16(L2, _delta2);
601 L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
603 L3 = _mm_min_epi16(L3, _delta3);
604 L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
606 _mm_store_si128( (__m128i*)(Lr_p + d), L0);
607 _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
608 _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
609 _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
611 __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
612 __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
613 t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
614 _minL0 = _mm_min_epi16(_minL0, t0);
616 __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
618 L0 = _mm_adds_epi16(L0, L1);
619 L2 = _mm_adds_epi16(L2, L3);
620 Sval = _mm_adds_epi16(Sval, L0);
621 Sval = _mm_adds_epi16(Sval, L2);
623 _mm_store_si128((__m128i*)(Sp + d), Sval);
626 _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
627 _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
632 int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
634 for( d = 0; d < D; d++ )
636 int Cpd = Cp[d], L0, L1, L2, L3;
638 L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
639 L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
640 L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
641 L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;
643 Lr_p[d] = (CostType)L0;
644 minL0 = std::min(minL0, L0);
646 Lr_p[d + D2] = (CostType)L1;
647 minL1 = std::min(minL1, L1);
649 Lr_p[d + D2*2] = (CostType)L2;
650 minL2 = std::min(minL2, L2);
652 Lr_p[d + D2*3] = (CostType)L3;
653 minL3 = std::min(minL3, L3);
655 Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
657 minLr[0][xm] = (CostType)minL0;
658 minLr[0][xm+1] = (CostType)minL1;
659 minLr[0][xm+2] = (CostType)minL2;
660 minLr[0][xm+3] = (CostType)minL3;
664 if( pass == npasses )
666 for( x = 0; x < width; x++ )
668 disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
669 disp2cost[x] = MAX_COST;
672 for( x = width1 - 1; x >= 0; x-- )
674 CostType* Sp = S + x*D;
675 int minS = MAX_COST, bestDisp = -1;
679 int xm = x*NR2, xd = xm*D2;
681 int minL0 = MAX_COST;
682 int delta0 = minLr[0][xm + NR2] + P2;
683 CostType* Lr_p0 = Lr[0] + xd + NRD2;
684 Lr_p0[-1] = Lr_p0[D] = MAX_COST;
685 CostType* Lr_p = Lr[0] + xd;
687 const CostType* Cp = C + x*D;
692 __m128i _P1 = _mm_set1_epi16((short)P1);
693 __m128i _delta0 = _mm_set1_epi16((short)delta0);
695 __m128i _minL0 = _mm_set1_epi16((short)minL0);
696 __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
697 __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
699 for( d = 0; d < D; d += 8 )
701 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
703 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
704 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
705 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
706 L0 = _mm_min_epi16(L0, _delta0);
707 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
709 _mm_store_si128((__m128i*)(Lr_p + d), L0);
710 _minL0 = _mm_min_epi16(_minL0, L0);
711 L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
712 _mm_store_si128((__m128i*)(Sp + d), L0);
714 __m128i mask = _mm_cmpgt_epi16(_minS, L0);
715 _minS = _mm_min_epi16(_minS, L0);
716 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
717 _d8 = _mm_adds_epi16(_d8, _8);
720 short CV_DECL_ALIGNED(16) bestDispBuf[8];
721 _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
723 _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
724 _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
725 _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
727 __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
728 qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
729 qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
731 minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
732 minS = (CostType)_mm_cvtsi128_si32(qS);
734 qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
735 qS = _mm_cmpeq_epi16(_minS, qS);
736 int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
738 bestDisp = bestDispBuf[LSBTab[idx]];
743 for( d = 0; d < D; d++ )
745 int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
747 Lr_p[d] = (CostType)L0;
748 minL0 = std::min(minL0, L0);
750 int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
757 minLr[0][xm] = (CostType)minL0;
762 for( d = 0; d < D; d++ )
773 for( d = 0; d < D; d++ )
775 if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
781 int _x2 = x + minX1 - d - minD;
782 if( disp2cost[_x2] > minS )
784 disp2cost[_x2] = (CostType)minS;
785 disp2ptr[_x2] = (DispType)(d + minD);
788 if( 0 < d && d < D-1 )
790 // do subpixel quadratic interpolation:
791 // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
792 // then find minimum of the parabola.
793 int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
794 d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
798 disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
801 for( x = minX1; x < maxX1; x++ )
803 // we round the computed disparity both towards -inf and +inf and check
804 // if either of the corresponding disparities in disp2 is consistent.
805 // This is to give the computed disparity a chance to look valid if it is.
806 int d1 = disp1ptr[x];
807 if( d1 == INVALID_DISP_SCALED )
809 int _d = d1 >> DISP_SHIFT;
810 int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
811 int _x = x - _d, x_ = x - d_;
812 if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
813 0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
814 disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
818 // now shift the cyclic buffers
819 std::swap( Lr[0], Lr[1] );
820 std::swap( minLr[0], minLr[1] );
825 //////////////////////////////////////////////////////////////////////////////////////////////////////
827 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
828 CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
829 PixType*& tmpBuf, CostType*& horPassCostVolume,
830 CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
831 CostType*& disp2CostBuf, short*& disp2Buf);
833 struct SGBM3WayMainLoop : public ParallelLoopBody
836 const Mat *img1, *img2;
839 int nstripes, stripe_sz;
844 int minX1, maxX1, width1;
848 int uniquenessRatio, disp12MaxDiff;
850 int costBufSize, hsumBufNRows;
855 SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
856 void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
857 void operator () (const Range& range) const;
860 SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
861 buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
863 nstripes = _nstripes;
864 stripe_overlap = _stripe_overlap;
865 stripe_sz = (int)ceil(img1->rows/(double)nstripes);
867 width = img1->cols; height = img1->rows;
868 minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
869 minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
870 CV_Assert( D % 16 == 0 );
872 SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
874 P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
875 uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
876 disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
878 costBufSize = width1*D;
879 hsumBufNRows = SH2*2 + 2;
881 ftzero = std::max(params.preFilterCap, 15) | 1;
884 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
885 CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
886 PixType*& tmpBuf, CostType*& horPassCostVolume,
887 CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
888 CostType*& disp2CostBuf, short*& disp2Buf)
890 // allocating all the required memory:
891 int costVolumeLineSize = width1*D;
892 int width1_ext = width1+2;
893 int costVolumeLineSize_ext = width1_ext*D;
894 int hsumBufNRows = SH2*2 + 2;
896 // main buffer to store matching costs for the current line:
897 int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
899 // auxiliary buffers for the raw matching cost computation:
900 int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
901 int pixDiffSize = costVolumeLineSize*sizeof(CostType);
902 int tmpBufSize = width*16*num_ch*sizeof(PixType);
904 // auxiliary buffers for the matching cost aggregation:
905 int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
906 int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
907 int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line
908 int rightPassBufSize = D*sizeof(CostType); // additional small buffer for the right-to-left pass
910 // buffers for the pseudo-LRC check:
911 int disp2CostBufSize = width*sizeof(CostType);
912 int disp2BufSize = width*sizeof(short);
914 // sum up the sizes of all the buffers:
915 size_t totalBufSize = curCostVolumeLineSize +
919 horPassCostVolumeSize +
920 vertPassCostVolumeSize +
925 16; //to compensate for the alignPtr shifts
927 if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
928 buffer.create(1, (int)totalBufSize, CV_8U);
930 // set up all the pointers:
931 curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), 16);
932 hsumBuf = curCostVolumeLine + costVolumeLineSize;
933 pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows;
934 tmpBuf = (PixType*)(pixDiff + costVolumeLineSize);
935 horPassCostVolume = (CostType*)(tmpBuf + width*16*num_ch);
936 vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
937 rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext;
938 vertPassMin = rightPassBuf + D;
939 disp2CostBuf = vertPassMin + width1_ext;
940 disp2Buf = disp2CostBuf + width;
942 // initialize memory:
943 memset(buffer.ptr(),0,totalBufSize);
944 for(int i=0;i<costVolumeLineSize;i++)
945 curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
948 // performing block matching and building raw cost-volume for the current row
949 void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
950 CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
951 int y, int src_start_idx) const
954 int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
956 for(int k = dy1; k <= dy2; k++ )
958 CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
961 calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );
963 memset(hsumAdd, 0, D*sizeof(CostType));
964 for(x = 0; x <= SW2*D; x += D )
966 int scale = x == 0 ? SW2 + 1 : 1;
968 for( d = 0; d < D; d++ )
969 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
972 if( y > src_start_idx )
974 const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
976 for( x = D; x < width1*D; x += D )
978 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
979 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
983 for( d = 0; d < D; d+=8 )
985 hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d));
986 v_store_aligned(hsumAdd+x+d,hv_reg);
987 v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d)));
990 for( d = 0; d < D; d++ )
992 int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
993 C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
1000 for( x = D; x < width1*D; x += D )
1002 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
1003 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
1005 for( d = 0; d < D; d++ )
1006 hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
1011 if( y == src_start_idx )
1013 int scale = k == src_start_idx ? SH2 + 1 : 1;
1014 for( x = 0; x < width1*D; x++ )
1015 C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
1021 // define some additional reduce operations:
1022 inline short min(const v_int16x8& a)
1024 short CV_DECL_ALIGNED(16) buf[8];
1025 v_store_aligned(buf, a);
1026 short s0 = std::min(buf[0], buf[1]);
1027 short s1 = std::min(buf[2], buf[3]);
1028 short s2 = std::min(buf[4], buf[5]);
1029 short s3 = std::min(buf[6], buf[7]);
1030 return std::min(std::min(s0, s1),std::min(s2, s3));
1033 inline short min_pos(const v_int16x8& val,const v_int16x8& pos)
1035 short CV_DECL_ALIGNED(16) val_buf[8];
1036 v_store_aligned(val_buf, val);
1037 short CV_DECL_ALIGNED(16) pos_buf[8];
1038 v_store_aligned(pos_buf, pos);
1040 short min_val = SHRT_MAX;
1041 if(val_buf[0]<min_val) {min_val=val_buf[0]; res_pos=pos_buf[0];}
1042 if(val_buf[1]<min_val) {min_val=val_buf[1]; res_pos=pos_buf[1];}
1043 if(val_buf[2]<min_val) {min_val=val_buf[2]; res_pos=pos_buf[2];}
1044 if(val_buf[3]<min_val) {min_val=val_buf[3]; res_pos=pos_buf[3];}
1045 if(val_buf[4]<min_val) {min_val=val_buf[4]; res_pos=pos_buf[4];}
1046 if(val_buf[5]<min_val) {min_val=val_buf[5]; res_pos=pos_buf[5];}
1047 if(val_buf[6]<min_val) {min_val=val_buf[6]; res_pos=pos_buf[6];}
1048 if(val_buf[7]<min_val) {min_val=val_buf[7]; res_pos=pos_buf[7];}
1053 // performing SGM cost accumulation from left to right (result is stored in leftBuf) and
1054 // in-place cost accumulation from top to bottom (result is stored in topBuf)
1055 inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
1056 CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
1059 v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
1061 v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
1062 v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);
1063 v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX);
1064 v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev);
1066 v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));
1067 v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);
1068 v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX);
1069 v_int16x8 src1_topBuf = v_load_aligned(topBuf);
1072 v_int16x8 src_shifted_left,src_shifted_right;
1075 for(int i=0;i<D-8;i+=8)
1079 src2 = v_load_aligned(leftBuf_prev+i+8);
1081 //get shifted versions of the current block and add P1:
1082 src_shifted_left = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
1083 src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg;
1085 // process and save current block:
1086 res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1087 leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
1088 v_store_aligned(leftBuf+i, res);
1090 //update src buffers:
1091 src0_leftBuf = src1_leftBuf;
1092 src1_leftBuf = src2;
1096 src2 = v_load_aligned(topBuf+i+8);
1098 //get shifted versions of the current block and add P1:
1099 src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
1100 src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg;
1102 // process and save current block:
1103 res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1104 topMinCost_new_reg = v_min(topMinCost_new_reg,res);
1105 v_store_aligned(topBuf+i, res);
1107 //update src buffers:
1108 src0_topBuf = src1_topBuf;
1112 // a bit different processing for the last cycle of the loop:
1114 src2 = v_setall_s16(SHRT_MAX);
1115 src_shifted_left = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
1116 src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg;
1118 res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1119 leftMinCost = min(v_min(leftMinCost_new_reg,res));
1120 v_store_aligned(leftBuf+D-8, res);
1123 src2 = v_setall_s16(SHRT_MAX);
1124 src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
1125 src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg;
1127 res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1128 topMinCost = min(v_min(topMinCost_new_reg,res));
1129 v_store_aligned(topBuf+D-8, res);
1131 CostType leftMinCost_new = SHRT_MAX;
1132 CostType topMinCost_new = SHRT_MAX;
1133 int leftMinCost_P2 = leftMinCost + P2;
1134 int topMinCost_P2 = topMinCost + P2;
1135 CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
1136 CostType topBuf_i_minus_1 = SHRT_MAX;
1139 for(int i=0;i<D-1;i++)
1141 leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);
1142 leftBuf_prev_i_minus_1 = leftBuf_prev[i];
1143 leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);
1146 topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);
1147 topBuf_i_minus_1 = tmp;
1148 topMinCost_new = std::min(topMinCost_new,topBuf[i]);
1151 leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);
1152 leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);
1154 topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);
1155 topMinCost = std::min(topMinCost_new,topBuf[D-1]);
1159 // performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
1160 // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
1161 // optimal disparity value with minimum accumulated cost
1162 inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
1163 CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)
1166 v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
1168 v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
1169 v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);
1170 v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX);
1171 v_int16x8 src1_rightBuf = v_load(rightBuf);
1174 v_int16x8 src_shifted_left,src_shifted_right;
1177 v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);
1178 v_int16x8 min_sum_pos_reg = v_setall_s16(0);
1179 v_int16x8 loop_idx(0,1,2,3,4,5,6,7);
1180 v_int16x8 eight_reg = v_setall_s16(8);
1182 for(int i=0;i<D-8;i+=8)
1185 src2 = v_load_aligned(rightBuf+i+8);
1187 //get shifted versions of the current block and add P1:
1188 src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
1189 src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;
1191 // process and save current block:
1192 res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
1193 rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
1194 v_store_aligned(rightBuf+i, res);
1196 // compute and save total cost:
1197 res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i);
1198 v_store_aligned(leftBuf+i, res);
1200 // track disparity value with the minimum cost:
1201 min_sum_cost_reg = v_min(min_sum_cost_reg,res);
1202 min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
1203 loop_idx = loop_idx+eight_reg;
1206 src0_rightBuf = src1_rightBuf;
1207 src1_rightBuf = src2;
1210 // a bit different processing for the last cycle of the loop:
1211 src2 = v_setall_s16(SHRT_MAX);
1212 src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
1213 src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;
1215 res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
1216 rightMinCost = min(v_min(rightMinCost_new_reg,res));
1217 v_store_aligned(rightBuf+D-8, res);
1219 res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8);
1220 v_store_aligned(leftBuf+D-8, res);
1222 min_sum_cost_reg = v_min(min_sum_cost_reg,res);
1223 min_cost = min(min_sum_cost_reg);
1224 min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
1225 optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg);
1227 CostType rightMinCost_new = SHRT_MAX;
1228 int rightMinCost_P2 = rightMinCost + P2;
1229 CostType rightBuf_i_minus_1 = SHRT_MAX;
1231 min_cost = SHRT_MAX;
1233 for(int i=0;i<D-1;i++)
1236 rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);
1237 rightBuf_i_minus_1 = tmp;
1238 rightMinCost_new = std::min(rightMinCost_new,rightBuf[i]);
1239 leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);
1240 if(leftBuf[i]<min_cost)
1243 min_cost = leftBuf[i];
1247 rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);
1248 rightMinCost = std::min(rightMinCost_new,rightBuf[D-1]);
1249 leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);
1250 if(leftBuf[D-1]<min_cost)
1253 min_cost = leftBuf[D-1];
1258 void SGBM3WayMainLoop::operator () (const Range& range) const
1260 // force separate processing of stripes:
1261 if(range.end>range.start+1)
1263 for(int n=range.start;n<range.end;n++)
1264 (*this)(Range(n,n+1));
1268 const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
1269 int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1271 // setting up the ranges:
1272 int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);
1273 int src_end_idx = std::min(range.end * stripe_sz, height);
1277 dst_offset=stripe_overlap;
1281 Mat cur_buffer = buffers [range.start];
1282 Mat cur_disp = dst_disp[range.start];
1283 cur_disp = Scalar(INVALID_DISP_SCALED);
1286 CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
1288 CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
1290 getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2,
1291 curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
1292 vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);
1294 // start real processing:
1295 for(int y=src_start_idx;y<src_end_idx;y++)
1297 getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);
1299 short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
1301 // initialize the auxiliary buffers for the pseudo left-right consistency check:
1302 for(int x=0;x<width;x++)
1304 disp2Buf[x] = (short)INVALID_DISP_SCALED;
1305 disp2CostBuf[x] = SHRT_MAX;
1307 CostType* C = curCostVolumeLine - D;
1308 CostType prev_min, min_cost;
1314 for (int x=D;x<(1+width1)*D;x+=D)
1315 accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2);
1318 memset(rightPassBuf,0,D*sizeof(CostType));
1320 for (int x=width1*D;x>=D;x-=D)
1322 accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);
1324 if(uniquenessRatio>0)
1327 horPassCostVolume+=x;
1328 int thresh = (100*min_cost)/(100-uniquenessRatio);
1329 v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1));
1330 v_int16x8 d1 = v_setall_s16((short)(best_d-1));
1331 v_int16x8 d2 = v_setall_s16((short)(best_d+1));
1332 v_int16x8 eight_reg = v_setall_s16(8);
1333 v_int16x8 cur_d(0,1,2,3,4,5,6,7);
1334 v_int16x8 mask,cost1,cost2;
1336 for( d = 0; d < D; d+=16 )
1338 cost1 = v_load_aligned(horPassCostVolume+d);
1339 cost2 = v_load_aligned(horPassCostVolume+d+8);
1341 mask = cost1 < thresh_reg;
1342 mask = mask & ( (cur_d<d1) | (cur_d>d2) );
1343 if( v_signmask(mask) )
1346 cur_d = cur_d+eight_reg;
1348 mask = cost2 < thresh_reg;
1349 mask = mask & ( (cur_d<d1) | (cur_d>d2) );
1350 if( v_signmask(mask) )
1353 cur_d = cur_d+eight_reg;
1355 horPassCostVolume-=x;
1357 for( d = 0; d < D; d++ )
1359 if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
1368 int _x2 = x/D - 1 + minX1 - d - minD;
1369 if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
1371 disp2CostBuf[_x2] = min_cost;
1372 disp2Buf[_x2] = (short)(d + minD);
1375 if( 0 < d && d < D-1 )
1377 // do subpixel quadratic interpolation:
1378 // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
1379 // then find minimum of the parabola.
1380 int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);
1381 d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);
1386 disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);
1389 for(int x = minX1; x < maxX1; x++ )
1391 // pseudo LRC consistency check using only one disparity map;
1392 // pixels with difference more than disp12MaxDiff are invalidated
1393 int d1 = disp_row[x];
1394 if( d1 == INVALID_DISP_SCALED )
1396 int _d = d1 >> StereoMatcher::DISP_SHIFT;
1397 int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
1398 int _x = x - _d, x_ = x - d_;
1399 if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
1400 0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
1401 disp_row[x] = (short)INVALID_DISP_SCALED;
1406 static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
1407 Mat& disp1, const StereoSGBMParams& params,
1408 Mat* buffers, int nstripes )
1410 // precompute a lookup table for the raw matching cost computation:
1411 const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
1412 PixType* clipTab = new PixType[TAB_SIZE];
1413 int ftzero = std::max(params.preFilterCap, 15) | 1;
1414 for(int k = 0; k < TAB_SIZE; k++ )
1415 clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
1417 // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
1418 int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
1419 int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
1420 Mat* dst_disp = new Mat[nstripes];
1421 for(int i=0;i<nstripes;i++)
1422 dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
1424 parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
1426 //assemble disp1 from dst_disp:
1429 for(int i=0;i<disp1.rows;i++)
1431 dst_row = (short*)disp1.ptr(i);
1432 src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
1433 memcpy(dst_row,src_row,disp1.cols*sizeof(short));
1440 class StereoSGBMImpl : public StereoSGBM
1445 params = StereoSGBMParams();
1448 StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
1449 int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
1450 int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
1453 params = StereoSGBMParams( _minDisparity, _numDisparities, _SADWindowSize,
1454 _P1, _P2, _disp12MaxDiff, _preFilterCap,
1455 _uniquenessRatio, _speckleWindowSize, _speckleRange,
1459 void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
1461 Mat left = leftarr.getMat(), right = rightarr.getMat();
1462 CV_Assert( left.size() == right.size() && left.type() == right.type() &&
1463 left.depth() == CV_8U );
1465 disparr.create( left.size(), CV_16S );
1466 Mat disp = disparr.getMat();
1468 if(params.mode==MODE_SGBM_3WAY)
1469 computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
1471 computeDisparitySGBM( left, right, disp, params, buffer );
1473 medianBlur(disp, disp, 3);
1475 if( params.speckleWindowSize > 0 )
1476 filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
1477 StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
1480 int getMinDisparity() const { return params.minDisparity; }
1481 void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
1483 int getNumDisparities() const { return params.numDisparities; }
1484 void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
1486 int getBlockSize() const { return params.SADWindowSize; }
1487 void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
1489 int getSpeckleWindowSize() const { return params.speckleWindowSize; }
1490 void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
1492 int getSpeckleRange() const { return params.speckleRange; }
1493 void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
1495 int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
1496 void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
1498 int getPreFilterCap() const { return params.preFilterCap; }
1499 void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
1501 int getUniquenessRatio() const { return params.uniquenessRatio; }
1502 void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
1504 int getP1() const { return params.P1; }
1505 void setP1(int P1) { params.P1 = P1; }
1507 int getP2() const { return params.P2; }
1508 void setP2(int P2) { params.P2 = P2; }
1510 int getMode() const { return params.mode; }
1511 void setMode(int mode) { params.mode = mode; }
1513 void write(FileStorage& fs) const
1515 fs << "name" << name_
1516 << "minDisparity" << params.minDisparity
1517 << "numDisparities" << params.numDisparities
1518 << "blockSize" << params.SADWindowSize
1519 << "speckleWindowSize" << params.speckleWindowSize
1520 << "speckleRange" << params.speckleRange
1521 << "disp12MaxDiff" << params.disp12MaxDiff
1522 << "preFilterCap" << params.preFilterCap
1523 << "uniquenessRatio" << params.uniquenessRatio
1524 << "P1" << params.P1
1525 << "P2" << params.P2
1526 << "mode" << params.mode;
1529 void read(const FileNode& fn)
1531 FileNode n = fn["name"];
1532 CV_Assert( n.isString() && String(n) == name_ );
1533 params.minDisparity = (int)fn["minDisparity"];
1534 params.numDisparities = (int)fn["numDisparities"];
1535 params.SADWindowSize = (int)fn["blockSize"];
1536 params.speckleWindowSize = (int)fn["speckleWindowSize"];
1537 params.speckleRange = (int)fn["speckleRange"];
1538 params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
1539 params.preFilterCap = (int)fn["preFilterCap"];
1540 params.uniquenessRatio = (int)fn["uniquenessRatio"];
1541 params.P1 = (int)fn["P1"];
1542 params.P2 = (int)fn["P2"];
1543 params.mode = (int)fn["mode"];
1546 StereoSGBMParams params;
1549 // the number of stripes is fixed, disregarding the number of threads/processors
1550 // to make the results fully reproducible:
1551 static const int num_stripes = 4;
1552 Mat buffers[num_stripes];
1554 static const char* name_;
1557 const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
1560 Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
1561 int P1, int P2, int disp12MaxDiff,
1562 int preFilterCap, int uniquenessRatio,
1563 int speckleWindowSize, int speckleRange,
1566 return Ptr<StereoSGBM>(
1567 new StereoSGBMImpl(minDisparity, numDisparities, SADWindowSize,
1568 P1, P2, disp12MaxDiff,
1569 preFilterCap, uniquenessRatio,
1570 speckleWindowSize, speckleRange,
1574 Rect getValidDisparityROI( Rect roi1, Rect roi2,
1576 int numberOfDisparities,
1579 int SW2 = SADWindowSize/2;
1580 int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
1582 int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
1583 int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
1584 int ymin = std::max(roi1.y, roi2.y) + SW2;
1585 int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
1587 Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
1589 return r.width > 0 && r.height > 0 ? r : Rect();
1592 typedef cv::Point_<short> Point2s;
1594 template <typename T>
1595 void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
1599 int width = img.cols, height = img.rows, npixels = width*height;
1600 size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
1601 if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
1602 _buf.create(1, (int)bufSize, CV_8U);
1604 uchar* buf = _buf.ptr();
1605 int i, j, dstep = (int)(img.step/sizeof(T));
1606 int* labels = (int*)buf;
1607 buf += npixels*sizeof(labels[0]);
1608 Point2s* wbuf = (Point2s*)buf;
1609 buf += npixels*sizeof(wbuf[0]);
1610 uchar* rtype = (uchar*)buf;
1613 // clear out label assignments
1614 memset(labels, 0, npixels*sizeof(labels[0]));
1616 for( i = 0; i < height; i++ )
1618 T* ds = img.ptr<T>(i);
1619 int* ls = labels + width*i;
1621 for( j = 0; j < width; j++ )
1623 if( ds[j] != newVal ) // not a bad disparity
1625 if( ls[j] ) // has a label, check for bad label
1627 if( rtype[ls[j]] ) // small region, zero out disparity
1630 // no label, assign and propagate
1633 Point2s* ws = wbuf; // initialize wavefront
1634 Point2s p((short)j, (short)i); // current pixel
1635 curlabel++; // next label
1636 int count = 0; // current region size
1639 // wavefront propagation
1640 while( ws >= wbuf ) // wavefront not empty
1643 // put neighbors onto wavefront
1644 T* dpp = &img.at<T>(p.y, p.x);
1646 int* lpp = labels + width*p.y + p.x;
1648 if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
1650 lpp[+width] = curlabel;
1651 *ws++ = Point2s(p.x, p.y+1);
1654 if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
1656 lpp[-width] = curlabel;
1657 *ws++ = Point2s(p.x, p.y-1);
1660 if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
1663 *ws++ = Point2s(p.x+1, p.y);
1666 if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
1669 *ws++ = Point2s(p.x-1, p.y);
1672 // pop most recent and propagate
1673 // NB: could try least recent, maybe better convergence
1677 // assign label type
1678 if( count <= maxSpeckleSize ) // speckle region
1680 rtype[ls[j]] = 1; // small region label
1684 rtype[ls[j]] = 0; // large region label
1692 static bool ipp_filterSpeckles(Mat &img, int maxSpeckleSize, int newVal, int maxDiff)
1694 #if IPP_VERSION_X100 >= 810
1695 int type = img.type();
1697 IppiSize roisize = { img.cols, img.rows };
1698 IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;
1699 Ipp8u *pBuffer = NULL;
1700 IppStatus status = ippStsNoErr;
1702 if(ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize) < 0)
1705 pBuffer = (Ipp8u*)ippMalloc(bufsize);
1706 if(!pBuffer && bufsize)
1709 if (type == CV_8UC1)
1711 status = ippiMarkSpeckles_8u_C1IR(img.ptr<Ipp8u>(), (int)img.step, roisize,
1712 (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, pBuffer);
1716 status = ippiMarkSpeckles_16s_C1IR(img.ptr<Ipp16s>(), (int)img.step, roisize,
1717 (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, pBuffer);
1719 if(pBuffer) ippFree(pBuffer);
1724 CV_UNUSED(img); CV_UNUSED(maxSpeckleSize); CV_UNUSED(newVal); CV_UNUSED(maxDiff);
1732 void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
1733 double _maxDiff, InputOutputArray __buf )
1735 Mat img = _img.getMat();
1736 int type = img.type();
1737 Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
1738 CV_Assert( type == CV_8UC1 || type == CV_16SC1 );
1740 int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);
1742 CV_IPP_RUN(IPP_VERSION_X100 >= 810 && !__buf.needed() && (type == CV_8UC1 || type == CV_16SC1), ipp_filterSpeckles(img, maxSpeckleSize, newVal, maxDiff));
1744 if (type == CV_8UC1)
1745 filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1747 filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1750 void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
1751 int numberOfDisparities, int disp12MaxDiff )
1753 Mat disp = _disp.getMat(), cost = _cost.getMat();
1754 int cols = disp.cols, rows = disp.rows;
1755 int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
1756 int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
1757 AutoBuffer<int> _disp2buf(cols*2);
1758 int* disp2buf = _disp2buf;
1759 int* disp2cost = disp2buf + cols;
1760 const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
1761 int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1762 int costType = cost.type();
1764 disp12MaxDiff *= DISP_SCALE;
1766 CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
1767 (costType == CV_16S || costType == CV_32S) &&
1768 disp.size() == cost.size() );
1770 for( int y = 0; y < rows; y++ )
1772 short* dptr = disp.ptr<short>(y);
1774 for( x = 0; x < cols; x++ )
1776 disp2buf[x] = INVALID_DISP_SCALED;
1777 disp2cost[x] = INT_MAX;
1780 if( costType == CV_16S )
1782 const short* cptr = cost.ptr<short>(y);
1784 for( x = minX1; x < maxX1; x++ )
1786 int d = dptr[x], c = cptr[x];
1788 if( d == INVALID_DISP_SCALED )
1791 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1793 if( disp2cost[x2] > c )
1802 const int* cptr = cost.ptr<int>(y);
1804 for( x = minX1; x < maxX1; x++ )
1806 int d = dptr[x], c = cptr[x];
1808 if( d == INVALID_DISP_SCALED )
1811 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1813 if( disp2cost[x2] > c )
1821 for( x = minX1; x < maxX1; x++ )
1823 // we round the computed disparity both towards -inf and +inf and check
1824 // if either of the corresponding disparities in disp2 is consistent.
1825 // This is to give the computed disparity a chance to look valid if it is.
1827 if( d == INVALID_DISP_SCALED )
1829 int d0 = d >> DISP_SHIFT;
1830 int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
1831 int x0 = x - d0, x1 = x - d1;
1832 if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
1833 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
1834 dptr[x] = (short)INVALID_DISP_SCALED;