From: Maksim Shabunin Date: Thu, 13 Feb 2020 11:25:47 +0000 (+0300) Subject: Use BufferArea in more places X-Git-Tag: submit/tizen/20210224.033012~2^2~296^2~1^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=bf96d8239dfb4b0a40260447ade886762094428f;p=platform%2Fupstream%2Fopencv.git Use BufferArea in more places --- diff --git a/modules/calib3d/src/rho.cpp b/modules/calib3d/src/rho.cpp index 3cfa6b1..341b6b9 100644 --- a/modules/calib3d/src/rho.cpp +++ b/modules/calib3d/src/rho.cpp @@ -55,7 +55,7 @@ #include #include #include "rho.h" - +#include "opencv2/core/utils/buffer_area.private.hpp" @@ -65,7 +65,6 @@ namespace cv{/* For C support, replace with extern "C" { */ /* Constants */ -const int MEM_ALIGN = 32; const size_t HSIZE = (3*3*sizeof(float)); const double MIN_DELTA_CHNG = 0.1; // const double CHI_STAT = 2.706; @@ -312,16 +311,14 @@ struct RHO_HEST_REFC : RHO_HEST{ /* Levenberg-Marquardt Refinement */ struct{ - float (* JtJ)[8]; /* JtJ matrix */ - float (* tmp1)[8]; /* Temporary 1 */ + float* JtJ; /* JtJ matrix */ + float* tmp1; /* Temporary 1 */ float* Jte; /* Jte vector */ } lm; /* Memory Management */ - struct{ - cv::Mat perObj; - cv::Mat perRun; - } mem; + utils::BufferArea runArea; + utils::BufferArea objArea; /* Initialized? */ int initialized; @@ -659,16 +656,9 @@ inline int RHO_HEST_REFC::initialize(void){ fastSeed((uint64_t)~0); + initialized = 1; - int areAllAllocsSuccessful = !mem.perObj.empty(); - - if(!areAllAllocsSuccessful){ - finalize(); - }else{ - initialized = 1; - } - - return areAllAllocsSuccessful; + return true; } /** @@ -835,45 +825,14 @@ unsigned RHO_HEST_REFC::rhoHest(const float* src, /* Source points */ */ inline void RHO_HEST_REFC::allocatePerObj(void){ - /* We have known sizes */ - size_t ctrl_smpl_sz = SMPL_SIZE*sizeof(*ctrl.smpl); - size_t curr_pkdPts_sz = SMPL_SIZE*2*2*sizeof(*curr.pkdPts); - size_t curr_H_sz = HSIZE; - size_t best_H_sz = HSIZE; - size_t lm_JtJ_sz = 8*8*sizeof(float); - size_t lm_tmp1_sz = 8*8*sizeof(float); - size_t lm_Jte_sz = 1*8*sizeof(float); - - /* We compute offsets */ - size_t total = 0; -#define MK_OFFSET(v) \ - size_t v ## _of = total; \ - total = alignSize(v ## _of + v ## _sz, MEM_ALIGN) - - MK_OFFSET(ctrl_smpl); - MK_OFFSET(curr_pkdPts); - MK_OFFSET(curr_H); - MK_OFFSET(best_H); - MK_OFFSET(lm_JtJ); - MK_OFFSET(lm_tmp1); - MK_OFFSET(lm_Jte); - -#undef MK_OFFSET - - /* Allocate dynamic memory managed by cv::Mat */ - mem.perObj.create(1, (int)(total + MEM_ALIGN), CV_8UC1); - - /* Extract aligned pointer */ - unsigned char* ptr = alignPtr(mem.perObj.data, MEM_ALIGN); - - /* Assign pointers */ - ctrl.smpl = (unsigned*) (ptr + ctrl_smpl_of); - curr.pkdPts = (float*) (ptr + curr_pkdPts_of); - curr.H = (float*) (ptr + curr_H_of); - best.H = (float*) (ptr + best_H_of); - lm.JtJ = (float(*)[8])(ptr + lm_JtJ_of); - lm.tmp1 = (float(*)[8])(ptr + lm_tmp1_of); - lm.Jte = (float*) (ptr + lm_Jte_of); + objArea.allocate(ctrl.smpl, SMPL_SIZE); + objArea.allocate(curr.pkdPts, SMPL_SIZE*2*2); + objArea.allocate(curr.H, HSIZE); + objArea.allocate(best.H, HSIZE); + objArea.allocate(lm.JtJ, 8*8); + objArea.allocate(lm.tmp1, 8*8); + objArea.allocate(lm.Jte, 1*8); + objArea.commit(); } @@ -885,30 +844,9 @@ inline void RHO_HEST_REFC::allocatePerObj(void){ */ inline void RHO_HEST_REFC::allocatePerRun(void){ - /* We have known sizes */ - size_t best_inl_sz = arg.N; - size_t curr_inl_sz = arg.N; - - /* We compute offsets */ - size_t total = 0; -#define MK_OFFSET(v) \ - size_t v ## _of = total; \ - total = alignSize(v ## _of + v ## _sz, MEM_ALIGN) - - MK_OFFSET(best_inl); - MK_OFFSET(curr_inl); - -#undef MK_OFFSET - - /* Allocate dynamic memory managed by cv::Mat */ - mem.perRun.create(1, (int)(total + MEM_ALIGN), CV_8UC1); - - /* Extract aligned pointer */ - unsigned char* ptr = alignPtr(mem.perRun.data, MEM_ALIGN); - - /* Assign pointers */ - best.inl = (char*)(ptr + best_inl_of); - curr.inl = (char*)(ptr + curr_inl_of); + runArea.allocate(best.inl, arg.N); + runArea.allocate(curr.inl, arg.N); + runArea.commit(); } @@ -919,10 +857,7 @@ inline void RHO_HEST_REFC::allocatePerRun(void){ */ inline void RHO_HEST_REFC::deallocatePerRun(void){ - best.inl = NULL; - curr.inl = NULL; - - mem.perRun.release(); + runArea.release(); } @@ -933,15 +868,7 @@ inline void RHO_HEST_REFC::deallocatePerRun(void){ */ inline void RHO_HEST_REFC::deallocatePerObj(void){ - ctrl.smpl = NULL; - curr.pkdPts = NULL; - curr.H = NULL; - best.H = NULL; - lm.JtJ = NULL; - lm.tmp1 = NULL; - lm.Jte = NULL; - - mem.perObj.release(); + objArea.release(); } @@ -2144,7 +2071,7 @@ inline void RHO_HEST_REFC::refine(void){ */ /* Find initial conditions */ sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N, - lm.JtJ, lm.Jte, &S); + (float(*)[8])lm.JtJ, lm.Jte, &S); /*Levenberg-Marquardt Loop.*/ for(i=0;i #include "opencv2/core/hal/intrin.hpp" +#include "opencv2/core/utils/buffer_area.private.hpp" namespace cv { @@ -99,6 +100,16 @@ struct StereoSGBMParams mode = _mode; } + inline bool isFullDP() const + { + return mode == StereoSGBM::MODE_HH || mode == StereoSGBM::MODE_HH4; + } + inline Size calcSADWindowSize() const + { + const int dim = SADWindowSize > 0 ? SADWindowSize : 5; + return Size(dim, dim); + } + int minDisparity; int numDisparities; int SADWindowSize; @@ -148,6 +159,7 @@ static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_va #endif static const int DEFAULT_RIGHT_BORDER = -1; + /* For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD), and for each disparity minD<=d(y), *row2 = img2.ptr(y); PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2; - tab += tabOfs; - for( c = 0; c < cn*2; c++ ) { prow1[width*c] = prow1[width*c + width-1] = @@ -297,6 +307,166 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, } + +class BufferSGBM +{ +private: + size_t width1; + size_t Da; + size_t Dlra; + size_t costWidth; + size_t costHeight; + size_t hsumRows; + bool fullDP; + uchar dirs; + uchar dirs2; + static const size_t TAB_OFS = 256*4; + +public: + CostType* Cbuf; + CostType* Sbuf; + CostType* hsumBuf; + CostType* pixDiff; + CostType* disp2cost; + DispType* disp2ptr; + PixType* tempBuf; + std::vector Lr; + std::vector minLr; + PixType * clipTab; + +private: + utils::BufferArea area; + +public: + BufferSGBM(size_t width1_, + size_t Da_, + size_t Dlra_, + size_t cn, + size_t width, + size_t height, + const StereoSGBMParams ¶ms) + : width1(width1_), + Da(Da_), + Dlra(Dlra_), + Cbuf(NULL), + Sbuf(NULL), + hsumBuf(NULL), + pixDiff(NULL), + disp2cost(NULL), + disp2ptr(NULL), + tempBuf(NULL), + Lr(2, (CostType*)NULL), + minLr(2, (CostType*)NULL), + clipTab(NULL) + { + const size_t TAB_SIZE = 256 + TAB_OFS*2; + fullDP = params.isFullDP(); + costWidth = width1 * Da; + costHeight = fullDP ? height : 1; + hsumRows = params.calcSADWindowSize().height + 2; + dirs = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR; + dirs2 = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR2; + // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) + // we keep pixel difference cost (C) and the summary cost over NR directions (S). + // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) + area.allocate(Cbuf, costWidth * costHeight, CV_SIMD_WIDTH); // summary cost over different (nDirs) directions + area.allocate(Sbuf, costWidth * costHeight, CV_SIMD_WIDTH); + area.allocate(hsumBuf, costWidth * hsumRows, CV_SIMD_WIDTH); + area.allocate(pixDiff, costWidth, CV_SIMD_WIDTH); + area.allocate(disp2cost, width, CV_SIMD_WIDTH); + area.allocate(disp2ptr, width, CV_SIMD_WIDTH); + area.allocate(tempBuf, width * (4 * cn + 2), CV_SIMD_WIDTH); + // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: + // for 8-way dynamic programming we need the current row and + // the previous row, i.e. 2 rows in total + for (size_t i = 0; i < 2; ++i) + { + // 2D: [ NR ][ w1 * NR2 ][ NR ] * [ Dlra ] + area.allocate(Lr[i], calcLrCount() * Dlra, CV_SIMD_WIDTH); + // 1D: [ NR ][ w1 * NR2 ][ NR ] + area.allocate(minLr[i], calcLrCount(), CV_SIMD_WIDTH); + } + area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH); + area.commit(); + + // init clipTab + const int ftzero = std::max(params.preFilterCap, 15) | 1; + for(int i = 0; i < (int)TAB_SIZE; i++ ) + clipTab[i] = (PixType)(std::min(std::max(i - (int)TAB_OFS, -ftzero), ftzero) + ftzero); + } + inline const PixType * getClipTab() const + { + return clipTab + TAB_OFS; + } + inline void initCBuf(CostType val) const + { + for (size_t i = 0; i < costWidth * costHeight; ++i) + Cbuf[i] = val; + } + inline void clearLr(const Range & range = Range::all()) const + { + for (uchar i = 0; i < 2; ++i) + { + if (range == Range::all()) + { + memset(Lr[i], 0, calcLrCount() * Dlra * sizeof(CostType)); + memset(minLr[i], 0, calcLrCount() * sizeof(CostType)); + } + else + { + memset(getLr(i, range.start), 0, range.size() * sizeof(CostType) * Dlra); + memset(getMinLr(i, range.start), 0, range.size() * sizeof(CostType)); + } + } + } + inline size_t calcLrCount() const + { + return width1 * dirs2 + 2 * dirs; + } + inline void swapLr() + { + std::swap(Lr[0], Lr[1]); + std::swap(minLr[0], minLr[1]); + } + inline CostType * getHSumBuf(int row) const + { + return hsumBuf + (row % hsumRows) * costWidth; + } + inline CostType * getCBuf(int row) const + { + CV_Assert(row >= 0); + return Cbuf + (!fullDP ? 0 : (row * costWidth)); + } + inline CostType * getSBuf(int row) const + { + CV_Assert(row >= 0); + return Sbuf + (!fullDP ? 0 : (row * costWidth)); + } + inline void clearSBuf(int row, const Range & range = Range::all()) const + { + if (range == Range::all()) + memset(getSBuf(row), 0, costWidth * sizeof(CostType)); + else + memset(getSBuf(row) + range.start * Da, 0, range.size() * Da * sizeof(CostType)); + } + + // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, + // and will occasionally use negative indices with the arrays + // we need to shift Lr[k] pointers by 1, to give the space for d=-1. + inline CostType * getLr(uchar id, int idx, uchar shift = 0) const + { + CV_Assert(id < 2); + const size_t fixed_offset = dirs * Dlra; + return Lr[id] + fixed_offset + (idx * (int)dirs2 + (int)shift) * (int)Dlra; + } + inline CostType * getMinLr(uchar id, int idx, uchar shift = 0) const + { + CV_Assert(id < 2); + const size_t fixed_offset = dirs; + return minLr[id] + fixed_offset + (idx * dirs2 + shift); + } +}; + /* computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf. that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y). @@ -318,34 +488,25 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost. */ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, - Mat& disp1, const StereoSGBMParams& params, - Mat& buffer ) + Mat& disp1, const StereoSGBMParams& params ) { const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SCALE = (1 << DISP_SHIFT); const CostType MAX_COST = SHRT_MAX; int minD = params.minDisparity, maxD = minD + params.numDisparities; - Size SADWindowSize; - SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; - int ftzero = std::max(params.preFilterCap, 15) | 1; int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); int k, width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); - int D = maxD - minD, width1 = maxX1 - minX1; + const int D = params.numDisparities; + int width1 = maxX1 - minX1; int Da = (int)alignSize(D, v_int16::nlanes); int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; - int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2; - bool fullDP = params.mode == StereoSGBM::MODE_HH; - int npasses = fullDP ? 2 : 1; - const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; - PixType clipTab[TAB_SIZE]; - - for( k = 0; k < TAB_SIZE; k++ ) - clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero); + int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2; + int npasses = params.isFullDP() ? 2 : 1; if( minX1 >= maxX1 ) { @@ -353,39 +514,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, return; } - // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) - // we keep pixel difference cost (C) and the summary cost over NR directions (S). - // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) - size_t costBufSize = width1*Da; - size_t CSBufSize = costBufSize*(fullDP ? height : 1); - size_t minLrSize = (width1 + 2)*NR2, LrSize = minLrSize*Dlra; - int hsumBufNRows = SH2*2 + 2; - // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: - // for 8-way dynamic programming we need the current row and - // the previous row, i.e. 2 rows in total - size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // alignment, C, S - costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff - ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType) + // minLr[] and Lr[] - width*(sizeof(CostType) + sizeof(DispType)) + // disp2cost + disp2 - width * (4*img1.channels() + 2) * sizeof(PixType); // temp buffer for computing per-pixel cost - - if( buffer.empty() || !buffer.isContinuous() || - buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) - buffer.reserveBuffer(totalBufSize); - - // summary cost over different (nDirs) directions - CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH); - CostType* Sbuf = Cbuf + CSBufSize; - CostType* hsumBuf = Sbuf + CSBufSize; - CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows; - - CostType* disp2cost = pixDiff + costBufSize + ((LrSize + minLrSize)*2 + v_int16::nlanes); - DispType* disp2ptr = (DispType*)(disp2cost + width); - PixType* tempBuf = (PixType*)(disp2ptr + width); - - // add P2 to every C(x,y). it saves a few operations in the inner loops - for(k = 0; k < (int)CSBufSize; k++ ) - Cbuf[k] = (CostType)P2; + BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params); + mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops for( int pass = 1; pass <= npasses; pass++ ) { @@ -402,27 +532,15 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, x1 = width1-1; x2 = -1; dx = -1; } - CostType *Lr[2]={0}, *minLr[2]={0}; - - for( k = 0; k < 2; k++ ) - { - // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, - // and will occasionally use negative indices with the arrays - // we need to shift Lr[k] pointers by 1, to give the space for d=-1. - // however, then the alignment will be imperfect, i.e. bad for SSE, - // thus we shift the pointers by SIMD vector size - Lr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*k + NR2*Dlra; - memset( Lr[k] - NR2*Dlra, 0, LrSize*sizeof(CostType) ); - minLr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*2 + minLrSize*k + NR2; - memset( minLr[k] - NR2, 0, minLrSize*sizeof(CostType) ); - } + uchar lrID = 0; + mem.clearLr(); for( int y = y1; y != y2; y += dy ) { int x, d; DispType* disp1ptr = disp1.ptr(y); - CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize); - CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize); + CostType* const C = mem.getCBuf(y); + CostType* const S = mem.getSBuf(y); if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any. { @@ -430,35 +548,35 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, for( k = dy1; k <= dy2; k++ ) { - CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; + CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1)); if( k < height ) { - calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero ); + calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() ); memset(hsumAdd, 0, Da*sizeof(CostType)); #if CV_SIMD v_int16 h_scale = vx_setall_s16((short)SW2 + 1); for( d = 0; d < Da; d += v_int16::nlanes ) { - v_int16 v_hsumAdd = vx_load_aligned(pixDiff + d) * h_scale; + v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale; for( x = Da; x <= SW2*Da; x += Da ) - v_hsumAdd += vx_load_aligned(pixDiff + x + d); + v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d); v_store_aligned(hsumAdd + d, v_hsumAdd); } #else for (d = 0; d < D; d++) { - hsumAdd[d] = (CostType)(pixDiff[d] * (SW2 + 1)); + hsumAdd[d] = (CostType)(mem.pixDiff[d] * (SW2 + 1)); for( x = Da; x <= SW2*Da; x += Da ) - hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]); + hsumAdd[d] = (CostType)(hsumAdd[d] + mem.pixDiff[x + d]); } #endif if( y > 0 ) { - const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; - const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; + const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0)); + const CostType* Cprev = mem.getCBuf(y - 1); #if CV_SIMD for (d = 0; d < Da; d += v_int16::nlanes) @@ -470,8 +588,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, for( x = Da; x < width1*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); + const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0); #if CV_SIMD for( d = 0; d < Da; d += v_int16::nlanes ) { @@ -501,8 +619,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, #endif for( x = Da; x < width1*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); + const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0); #if CV_SIMD for (d = 0; d < Da; d += v_int16::nlanes) @@ -526,8 +644,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, { if( y > 0 ) { - const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; - const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; + const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0)); + const CostType* Cprev = mem.getCBuf(y - 1); #if CV_SIMD for (x = 0; x < width1*Da; x += v_int16::nlanes) v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x)); @@ -551,7 +669,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, } // also, clear the S buffer - memset(S, 0, width1*Da * sizeof(CostType)); + mem.clearSBuf(y); } /* @@ -575,24 +693,26 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, for( x = x1; x != x2; x += dx ) { - int xm = x*NR2, xd = xm*Dlra; - - int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2; - int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2; - - CostType* Lr_p0 = Lr[0] + xd - dx*NR2*Dlra; - CostType* Lr_p1 = Lr[1] + xd - NR2*Dlra + Dlra; - CostType* Lr_p2 = Lr[1] + xd + Dlra*2; - CostType* Lr_p3 = Lr[1] + xd + NR2*Dlra + Dlra*3; - - Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] = - Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST; - - CostType* Lr_p = Lr[0] + xd; + int delta0 = P2 + *mem.getMinLr(lrID, x - dx); + int delta1 = P2 + *mem.getMinLr(1 - lrID, x - 1, 1); + int delta2 = P2 + *mem.getMinLr(1 - lrID, x, 2); + int delta3 = P2 + *mem.getMinLr(1 - lrID, x + 1, 3); + + CostType* Lr_p0 = mem.getLr(lrID, x - dx); + CostType* Lr_p1 = mem.getLr(1 - lrID, x - 1, 1); + CostType* Lr_p2 = mem.getLr(1 - lrID, x, 2); + CostType* Lr_p3 = mem.getLr(1 - lrID, x + 1, 3); + + Lr_p0[-1] = Lr_p0[D] = MAX_COST; + Lr_p1[-1] = Lr_p1[D] = MAX_COST; + Lr_p2[-1] = Lr_p2[D] = MAX_COST; + Lr_p3[-1] = Lr_p3[D] = MAX_COST; + + CostType* Lr_p = mem.getLr(lrID, x); const CostType* Cp = C + x*Da; CostType* Sp = S + x*Da; - CostType* minL = minLr[0] + xm; + CostType* minL = mem.getMinLr(lrID, x); d = 0; #if CV_SIMD v_int16 _P1 = vx_setall_s16((short)P1); @@ -703,14 +823,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes ) { v_store(disp1ptr + x, v_inv_dist); - v_store(disp2ptr + x, v_inv_dist); - v_store(disp2cost + x, v_max_cost); + v_store(mem.disp2ptr + x, v_inv_dist); + v_store(mem.disp2cost + x, v_max_cost); } #endif for( ; x < width; x++ ) { - disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; - disp2cost[x] = MAX_COST; + disp1ptr[x] = mem.disp2ptr[x] = (DispType)INVALID_DISP_SCALED; + mem.disp2cost[x] = MAX_COST; } for( x = width1 - 1; x >= 0; x-- ) @@ -721,16 +841,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, if( npasses == 1 ) { - int xm = x*NR2, xd = xm*Dlra; - - CostType* Lr_p0 = Lr[0] + xd + NR2*Dlra; + CostType* Lr_p0 = mem.getLr(lrID, x + 1); Lr_p0[-1] = Lr_p0[D] = MAX_COST; - CostType* Lr_p = Lr[0] + xd; + CostType* Lr_p = mem.getLr(lrID, x); const CostType* Cp = C + x*Da; d = 0; - int delta0 = minLr[0][xm + NR2] + P2; + int delta0 = P2 + *mem.getMinLr(lrID, x + 1); int minL0 = MAX_COST; #if CV_SIMD v_int16 _P1 = vx_setall_s16((short)P1); @@ -768,7 +886,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, bestDisp = (short)d; } } - minLr[0][xm] = (CostType)minL0; + *mem.getMinLr(lrID, x) = (CostType)minL0; } else { @@ -803,10 +921,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, continue; d = bestDisp; int _x2 = x + minX1 - d - minD; - if( disp2cost[_x2] > minS ) + if( mem.disp2cost[_x2] > minS ) { - disp2cost[_x2] = (CostType)minS; - disp2ptr[_x2] = (DispType)(d + minD); + mem.disp2cost[_x2] = (CostType)minS; + mem.disp2ptr[_x2] = (DispType)(d + minD); } if( 0 < d && d < D-1 ) @@ -833,15 +951,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, int _d = d1 >> DISP_SHIFT; int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT; int _x = x - _d, x_ = x - d_; - if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && - 0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff ) + if( 0 <= _x && _x < width && mem.disp2ptr[_x] >= minD && std::abs(mem.disp2ptr[_x] - _d) > disp12MaxDiff && + 0 <= x_ && x_ < width && mem.disp2ptr[x_] >= minD && std::abs(mem.disp2ptr[x_] - d_) > disp12MaxDiff ) disp1ptr[x] = (DispType)INVALID_DISP_SCALED; } } - // now shift the cyclic buffers - std::swap( Lr[0], Lr[1] ); - std::swap( minLr[0], minLr[1] ); + lrID = 1 - lrID; // now shift the cyclic buffers } } } @@ -849,13 +965,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, //////////////////////////////////////////////////////////////////////////////////////////// struct CalcVerticalSums: public ParallelLoopBody { - CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params, - CostType* alignedBuf, PixType* _clipTab): img1(_img1), img2(_img2), clipTab(_clipTab) + CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params, const BufferSGBM &mem_) + : img1(_img1), img2(_img2), mem(mem_) { minD = params.minDisparity; maxD = minD + params.numDisparities; - SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2; - ftzero = std::max(params.preFilterCap, 15) | 1; + SW2 = SH2 = params.calcSADWindowSize().height/2; P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); height = img1.rows; @@ -865,32 +980,27 @@ struct CalcVerticalSums: public ParallelLoopBody Da = (int)alignSize(D, v_int16::nlanes); Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D width1 = maxX1 - minX1; - costBufSize = width1*Da; - CSBufSize = costBufSize*height; - minLrSize = width1; - LrSize = minLrSize*Dlra; - hsumBufNRows = SH2*2 + 2; - Cbuf = alignedBuf; - Sbuf = Cbuf + CSBufSize; - hsumBuf = Sbuf + CSBufSize; + D = params.numDisparities; + Da = (int)alignSize(D, v_int16::nlanes); } void operator()(const Range& range) const CV_OVERRIDE { - static const CostType MAX_COST = SHRT_MAX; - static const int TAB_OFS = 256*4; - static const int npasses = 2; - int x1 = range.start, x2 = range.end, k; - size_t pixDiffSize = ((x2 - x1) + 2*SW2)*Da; - size_t auxBufsSize = CV_SIMD_WIDTH + pixDiffSize*sizeof(CostType) + //alignment and pixdiff size - width*(4*img1.channels()+2)*sizeof(PixType); //tempBuf - Mat auxBuff; - auxBuff.create(1, (int)auxBufsSize, CV_8U); - CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH); - PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize); + const CostType MAX_COST = SHRT_MAX; + const int npasses = 2; + const int x1 = range.start, x2 = range.end; + int k; + + CostType* pixDiff = 0; + PixType* tempBuf = 0; + utils::BufferArea aux_area; + aux_area.allocate(pixDiff, ((x2 - x1) + 2 * SW2) * Da, CV_SIMD_WIDTH); + aux_area.allocate(tempBuf, width * (4 * img1.channels() + 2) * sizeof(PixType), CV_SIMD_WIDTH); + aux_area.commit(); // Simplification of index calculation - pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*Da; + if (x1 > SW2) + pixDiff -= (x1 - SW2) * Da; for( int pass = 1; pass <= npasses; pass++ ) { @@ -905,26 +1015,14 @@ struct CalcVerticalSums: public ParallelLoopBody y1 = height-1; y2 = -1; dy = -1; } - CostType *Lr[2]={0}, *minLr[2]={0}; - - for( k = 0; k < 2; k++ ) - { - // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, - // and will occasionally use negative indices with the arrays - // we need to shift Lr[k] pointers by 1, to give the space for d=-1. - // however, then the alignment will be imperfect, i.e. bad for SSE, - // thus we shift the pointers by SIMD vector size - Lr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*k; - memset( Lr[k] + x1*Dlra, 0, (x2-x1)*Dlra*sizeof(CostType) ); - minLr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*2 + minLrSize*k; - memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) ); - } + uchar lrID = 0; + mem.clearLr(range); for( int y = y1; y != y2; y += dy ) { int x, d; - CostType* C = Cbuf + y*costBufSize; - CostType* S = Sbuf + y*costBufSize; + CostType* C = mem.getCBuf(y); + CostType* S = mem.getSBuf(y); if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any. { @@ -932,11 +1030,11 @@ struct CalcVerticalSums: public ParallelLoopBody for( k = dy1; k <= dy2; k++ ) { - CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; + CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1)); if( k < height ) { - calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2); + calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, mem.getClipTab(), x1 - SW2, x2 + SW2); memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType)); for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da ) @@ -953,8 +1051,8 @@ struct CalcVerticalSums: public ParallelLoopBody if( y > 0 ) { - const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; - const CostType* Cprev = C - costBufSize; + const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0)); + const CostType* Cprev = mem.getCBuf(y - 1); #if CV_SIMD for( d = 0; d < Da; d += v_int16::nlanes ) v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d)); @@ -1020,8 +1118,8 @@ struct CalcVerticalSums: public ParallelLoopBody { /* if (y > 0) { - const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; - const CostType* Cprev = C - costBufSize; + const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0)); + const CostType* Cprev = mem.getCBuf(y - 1); #if CV_SIMD for( x = x1*Da; x < x2*Da; x += v_int16::nlanes ) @@ -1044,9 +1142,7 @@ struct CalcVerticalSums: public ParallelLoopBody } } } - - // also, clear the S buffer - memset(S + x1*Da, 0, (x2-x1)*Da*sizeof(CostType)); + mem.clearSBuf(y, range); } // [formula 13 in the paper] @@ -1061,19 +1157,16 @@ struct CalcVerticalSums: public ParallelLoopBody for( x = x1; x != x2; x++ ) { - int xd = x*Dlra; - - int delta = minLr[1][x] + P2; - - CostType* Lr_ppr = Lr[1] + xd; + int delta = P2 + *mem.getMinLr(1 - lrID, x); + CostType* Lr_ppr = mem.getLr(1 - lrID, x); Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; - CostType* Lr_p = Lr[0] + xd; + CostType* Lr_p = mem.getLr(lrID, x); const CostType* Cp = C + x*Da; CostType* Sp = S + x*Da; - CostType& minL = minLr[0][x]; + CostType& minL = *(mem.getMinLr(lrID, x)); d = 0; #if CV_SIMD v_int16 _P1 = vx_setall_s16((short)P1); @@ -1105,19 +1198,13 @@ struct CalcVerticalSums: public ParallelLoopBody Sp[d] = saturate_cast(Sp[d] + L); } } - - // now shift the cyclic buffers - std::swap( Lr[0], Lr[1] ); - std::swap( minLr[0], minLr[1] ); + lrID = 1 - lrID; // now shift the cyclic buffers } } } const Mat& img1; const Mat& img2; - CostType* Cbuf; - CostType* Sbuf; - CostType* hsumBuf; - PixType* clipTab; + const BufferSGBM & mem; int minD; int maxD; int D, Da, Dlra; @@ -1128,18 +1215,12 @@ struct CalcVerticalSums: public ParallelLoopBody int height; int P1; int P2; - size_t costBufSize; - size_t CSBufSize; - size_t minLrSize; - size_t LrSize; - size_t hsumBufNRows; - int ftzero; }; struct CalcHorizontalSums: public ParallelLoopBody { - CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, - CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1) + CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, const BufferSGBM &mem_) + : img1(_img1), img2(_img2), disp1(_disp1), mem(mem_) { minD = params.minDisparity; maxD = minD + params.numDisparities; @@ -1157,23 +1238,22 @@ struct CalcHorizontalSums: public ParallelLoopBody Da = (int)alignSize(D, v_int16::nlanes); Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D width1 = maxX1 - minX1; - costBufSize = width1*Da; - CSBufSize = costBufSize*height; - LrSize = 2 * Dlra; - Cbuf = alignedBuf; - Sbuf = Cbuf + CSBufSize; } void operator()(const Range& range) const CV_OVERRIDE { int y1 = range.start, y2 = range.end; - size_t auxBufsSize = CV_SIMD_WIDTH + (v_int16::nlanes + LrSize) * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)); - Mat auxBuff; - auxBuff.create(1, (int)auxBufsSize, CV_8U); - CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH)) + v_int16::nlanes; - CostType* disp2cost = Lr + LrSize; - DispType* disp2ptr = (DispType*)(disp2cost + width); + const size_t LrSize = 2 * (1 + Dlra + 1); + + CostType * Lr = 0; + CostType * disp2cost = 0; + DispType * disp2ptr = 0; + utils::BufferArea aux_area; + aux_area.allocate(Lr, LrSize); + aux_area.allocate(disp2cost, width, CV_SIMD_WIDTH); + aux_area.allocate(disp2ptr, width, CV_SIMD_WIDTH); + aux_area.commit(); CostType minLr; @@ -1181,8 +1261,8 @@ struct CalcHorizontalSums: public ParallelLoopBody { int x, d; DispType* disp1ptr = disp1.ptr(y); - CostType* C = Cbuf + y*costBufSize; - CostType* S = Sbuf + y*costBufSize; + CostType* C = mem.getCBuf(y); + CostType* S = mem.getSBuf(y); x = 0; #if CV_SIMD @@ -1202,8 +1282,8 @@ struct CalcHorizontalSums: public ParallelLoopBody } // clear buffers - memset( Lr, 0, LrSize*sizeof(CostType) ); - Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST; + aux_area.zeroFill(Lr); + Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST; minLr = 0; // [formula 13 in the paper] @@ -1219,10 +1299,8 @@ struct CalcHorizontalSums: public ParallelLoopBody for( x = 0; x != width1; x++) { int delta = minLr + P2; - - CostType* Lr_ppr = Lr + ((x&1)? 0 : Dlra); - - CostType* Lr_p = Lr + ((x&1)? Dlra :0); + CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra); + CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1); const CostType* Cp = C + x*Da; CostType* Sp = S + x*Da; @@ -1236,8 +1314,8 @@ struct CalcHorizontalSums: public ParallelLoopBody for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes) { v_int16 Cpd = vx_load_aligned(Cp + d); - v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; - v_store_aligned(Lr_p + d, L); + v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; + v_store(Lr_p + d, L); _minL = v_min(_minL, L); v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L); } @@ -1255,18 +1333,16 @@ struct CalcHorizontalSums: public ParallelLoopBody } } - memset( Lr, 0, LrSize*sizeof(CostType) ); - Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST; + aux_area.zeroFill(Lr); + Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST; minLr = 0; for( x = width1-1; x != -1; x--) { int delta = minLr + P2; - - CostType* Lr_ppr = Lr + ((x&1)? 0 :Dlra); - - CostType* Lr_p = Lr + ((x&1)? Dlra :0); + CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra); + CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1); const CostType* Cp = C + x*Da; CostType* Sp = S + x*Da; CostType minS = MAX_COST; @@ -1283,8 +1359,8 @@ struct CalcHorizontalSums: public ParallelLoopBody for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) { v_int16 Cpd = vx_load_aligned(Cp + d); - v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; - v_store_aligned(Lr_p + d, L); + v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; + v_store(Lr_p + d, L); _minL = v_min(_minL, L); L += vx_load_aligned(Sp + d); v_store_aligned(Sp + d, L); @@ -1366,8 +1442,7 @@ struct CalcHorizontalSums: public ParallelLoopBody const Mat& img1; const Mat& img2; Mat& disp1; - CostType* Cbuf; - CostType* Sbuf; + const BufferSGBM & mem; int minD; int maxD; int D, Da, Dlra; @@ -1378,9 +1453,6 @@ struct CalcHorizontalSums: public ParallelLoopBody int P2; int minX1; int maxX1; - size_t costBufSize; - size_t CSBufSize; - size_t LrSize; int INVALID_DISP; int INVALID_DISP_SCALED; int uniquenessRatio; @@ -1401,28 +1473,21 @@ struct CalcHorizontalSums: public ParallelLoopBody is written as is, without interpolation. */ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, - Mat& disp1, const StereoSGBMParams& params, - Mat& buffer ) + Mat& disp1, const StereoSGBMParams& params) { const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SCALE = (1 << DISP_SHIFT); int minD = params.minDisparity, maxD = minD + params.numDisparities; Size SADWindowSize; SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; - int ftzero = std::max(params.preFilterCap, 15) | 1; int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); - int k, width = disp1.cols, height = disp1.rows; + int width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); - int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1; - int Dlra = D + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D - int SH2 = SADWindowSize.height/2; + int width1 = maxX1 - minX1; + int Da = (int)alignSize(params.numDisparities, v_int16::nlanes); + int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int INVALID_DISP = minD - 1; int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; - const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; - PixType clipTab[TAB_SIZE]; - - for( k = 0; k < TAB_SIZE; k++ ) - clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero); if( minX1 >= maxX1 ) { @@ -1430,54 +1495,79 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, return; } - // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) - // we keep pixel difference cost (C) and the summary cost over 4 directions (S). - // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) - - // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: - // for dynamic programming we need the current row and - // the previous row, i.e. 2 rows in total - size_t costBufSize = width1*D; - size_t CSBufSize = costBufSize*height; - size_t minLrSize = width1 , LrSize = minLrSize*Dlra; - int hsumBufNRows = SH2*2 + 2; - size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // Alignment, C, S - costBufSize*hsumBufNRows * sizeof(CostType) + // hsumBuf - ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType); // minLr[] and Lr[] - - if( buffer.empty() || !buffer.isContinuous() || - buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) - { - buffer.reserveBuffer(totalBufSize); - } - - // summary cost over different (nDirs) directions - CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH); - - // add P2 to every C(x,y). it saves a few operations in the inner loops - for(k = 0; k < (int)CSBufSize; k++ ) - Cbuf[k] = (CostType)P2; - - parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8); - parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8); + BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params); + mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops + parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, mem),8); + parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, mem),8); } ////////////////////////////////////////////////////////////////////////////////////////////////////// -void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2, - CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, - PixType*& tmpBuf, CostType*& horPassCostVolume, - CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf, - CostType*& disp2CostBuf, short*& disp2Buf); +class BufferSGBM3Way +{ +private: + size_t hsumCols; + size_t hsumRows; +public: + CostType *curCostVolumeLine; + CostType *hsumBuf; + CostType *pixDiff; + PixType *tmpBuf; + CostType *horPassCostVolume; + CostType *vertPassCostVolume; + CostType *vertPassMin; + CostType *rightPassBuf; + CostType *disp2CostBuf; + short *disp2Buf; +private: + utils::BufferArea area; +public: + BufferSGBM3Way(int width1, int width, int num_ch, int Da, int SH2, int P2) : + curCostVolumeLine(0), + hsumBuf(0), + pixDiff(0), + tmpBuf(0), + horPassCostVolume(0), + vertPassCostVolume(0), + vertPassMin(0), + rightPassBuf(0), + disp2CostBuf(0), + disp2Buf(0) + { + hsumCols = width1 * Da; + hsumRows = SH2*2 + 2; + area.allocate(curCostVolumeLine, hsumCols, CV_SIMD_WIDTH); + area.allocate(hsumBuf, hsumCols * hsumRows, CV_SIMD_WIDTH); + area.allocate(pixDiff,hsumCols, CV_SIMD_WIDTH); + area.allocate(tmpBuf, width * (4 * num_ch + 2), CV_SIMD_WIDTH); + area.allocate(horPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH); + area.allocate(vertPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH); + area.allocate(vertPassMin, width1 + 2, CV_SIMD_WIDTH); + area.allocate(rightPassBuf, Da, CV_SIMD_WIDTH); + area.allocate(disp2CostBuf, width, CV_SIMD_WIDTH); + area.allocate(disp2Buf, width, CV_SIMD_WIDTH); + area.commit(); + area.zeroFill(); + for(size_t i = 0; i < hsumCols; i++) + curCostVolumeLine[i] = (CostType)P2; + } + inline void clearRightPassBuf() + { + area.zeroFill(rightPassBuf); + } + CostType *getHSumBuf(int x) const + { + return hsumBuf + (x % hsumRows) * hsumCols; + } +}; struct SGBM3WayMainLoop : public ParallelLoopBody { - Mat* buffers; const Mat *img1, *img2; Mat* dst_disp; - int nstripes, stripe_sz; + int stripe_sz; int stripe_overlap; int width,height; @@ -1488,25 +1578,54 @@ struct SGBM3WayMainLoop : public ParallelLoopBody int P1, P2; int uniquenessRatio, disp12MaxDiff; - int costBufSize, hsumBufNRows; - int TAB_OFS, ftzero; + int TAB_OFS; + utils::BufferArea aux_area; PixType* clipTab; #if CV_SIMD short idx_row[v_int16::nlanes]; #endif - SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap); - void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const; + SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap); void operator () (const Range& range) const CV_OVERRIDE; template void impl(const Range& range) const; + +private: + void getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const; + + template + void accumulateCostsLeftTop(const BufferSGBM3Way &mem, + int x, + CostType &leftMinCost) const; + + template + void accumulateCostsRight(const BufferSGBM3Way &mem, + int x, + CostType &rightMinCost, + short &optimal_disp, + CostType &min_cost) const; }; -SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap): -buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab) +SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1, + const Mat& _img2, + Mat* _dst_disp, + const StereoSGBMParams& params, + int _stripe_sz, + int _stripe_overlap) + : img1(&_img1), + img2(&_img2), + dst_disp(_dst_disp), + stripe_sz(_stripe_sz), + stripe_overlap(_stripe_overlap), + clipTab(0) { - nstripes = _nstripes; - stripe_overlap = _stripe_overlap; - stripe_sz = (int)ceil(img1->rows/(double)nstripes); + // precompute a lookup table for the raw matching cost computation: + TAB_OFS = 256*4; + const int TAB_SIZE = 256 + TAB_OFS*2; + aux_area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH); + aux_area.commit(); + const int ftzero = std::max(params.preFilterCap, 15) | 1; + for(int k = 0; k < TAB_SIZE; k++ ) + clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero); width = img1->cols; height = img1->rows; minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD; @@ -1519,100 +1638,27 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; - costBufSize = width1*Da; - hsumBufNRows = SH2*2 + 2; - TAB_OFS = 256*4; - ftzero = std::max(params.preFilterCap, 15) | 1; #if CV_SIMD for(short i = 0; i < v_int16::nlanes; ++i) idx_row[i] = i; #endif } -void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2, - CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, - PixType*& tmpBuf, CostType*& horPassCostVolume, - CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf, - CostType*& disp2CostBuf, short*& disp2Buf) -{ - // allocating all the required memory: - int costVolumeLineSize = width1*Da; - int width1_ext = width1+2; - int costVolumeLineSize_ext = width1_ext*Da; - int hsumBufNRows = SH2*2 + 2; - - // main buffer to store matching costs for the current line: - int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType); - - // auxiliary buffers for the raw matching cost computation: - int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType); - int pixDiffSize = costVolumeLineSize*sizeof(CostType); - int tmpBufSize = width * (4 * num_ch + 2) * sizeof(PixType); - - // auxiliary buffers for the matching cost aggregation: - int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation - int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation - int rightPassBufSize = Da * sizeof(CostType); // additional small buffer for the right-to-left pass - int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line - - // buffers for the pseudo-LRC check: - int disp2CostBufSize = width*sizeof(CostType); - int disp2BufSize = width*sizeof(short); - - // sum up the sizes of all the buffers: - size_t totalBufSize = CV_SIMD_WIDTH + curCostVolumeLineSize + - hsumBufSize + - pixDiffSize + - horPassCostVolumeSize + - vertPassCostVolumeSize + - rightPassBufSize + - vertPassMinSize + - disp2CostBufSize + - disp2BufSize + - tmpBufSize; - - if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) - buffer.reserveBuffer(totalBufSize); - - // set up all the pointers: - curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH); - hsumBuf = curCostVolumeLine + costVolumeLineSize; - pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows; - horPassCostVolume = pixDiff + costVolumeLineSize; - vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext; - rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext; - vertPassMin = rightPassBuf + Da; - - disp2CostBuf = vertPassMin + width1_ext; - disp2Buf = disp2CostBuf + width; - tmpBuf = (PixType*)(disp2Buf + width); - - // initialize memory: - memset(buffer.ptr(),0,totalBufSize); - int i = 0; -#if CV_SIMD - v_int16 _P2 = vx_setall_s16((CostType)P2); - for (; i<=costVolumeLineSize-v_int16::nlanes; i+=v_int16::nlanes) - v_store_aligned(curCostVolumeLine + i, _P2); -#endif - for(;i src_start_idx ) { - const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize; + const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx)); #if CV_SIMD for (d = 0; d < Da; d += v_int16::nlanes) @@ -1702,7 +1748,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row { if( y > src_start_idx ) { - const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize; + const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx)); #if CV_SIMD for( x = 0; x < width1*Da; x += v_int16::nlanes) v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x)); @@ -1728,12 +1774,15 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row // performing SGM cost accumulation from left to right (result is stored in leftBuf) and // in-place cost accumulation from top to bottom (result is stored in topBuf) template -inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs, - CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2) +void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x, CostType& leftMinCost) const { + CostType *leftBuf = mem.horPassCostVolume + x; + CostType *leftBuf_prev = mem.horPassCostVolume + x - Da; + CostType *topBuf = mem.vertPassCostVolume + x; + CostType *costs = mem.curCostVolumeLine - Da + x; + CostType& topMinCost = mem.vertPassMin[x/Da]; int i = 0; #if CV_SIMD - int Da = (int)alignSize(D, v_int16::nlanes); v_int16 P1_reg = vx_setall_s16(cv::saturate_cast(P1)); v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast(leftMinCost+P2)); @@ -1847,12 +1896,16 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the // optimal disparity value with minimum accumulated cost template -inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs, - CostType& rightMinCost, int D, int P1, int P2, short& optimal_disp, CostType& min_cost) +void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x, + CostType& rightMinCost, short& optimal_disp, CostType& min_cost) const { + CostType* costs = mem.curCostVolumeLine - Da + x; + CostType* rightBuf = mem.rightPassBuf; + CostType* topBuf = mem.vertPassCostVolume + x; + CostType* leftBuf = mem.horPassCostVolume + x; + int i = 0; #if CV_SIMD - int Da = (int)alignSize(D, v_int16::nlanes); v_int16 P1_reg = vx_setall_s16(cv::saturate_cast(P1)); v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast(rightMinCost+P2)); @@ -1955,6 +2008,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const if (D == Da) impl(range); else impl(range); } + template void SGBM3WayMainLoop::impl(const Range& range) const { @@ -1979,33 +2033,24 @@ void SGBM3WayMainLoop::impl(const Range& range) const else dst_offset=0; - Mat cur_buffer = buffers [range.start]; Mat cur_disp = dst_disp[range.start]; cur_disp = Scalar(INVALID_DISP_SCALED); - // prepare buffers: - CostType *curCostVolumeLine, *hsumBuf, *pixDiff; - PixType* tmpBuf; - CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf; - short* disp2Buf; - getBufferPointers(cur_buffer,width,width1,Da,img1->channels(),SH2,P2, - curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume, - vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf); - + BufferSGBM3Way mem(width1, width, img1->channels(), Da, SH2, P2); + CostType *horPassCostVolume = mem.horPassCostVolume; // start real processing: for(int y=src_start_idx;y(horPassCostVolume+x,horPassCostVolume+x-Da,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/Da],D,P1,P2); + accumulateCostsLeftTop(mem, x, prev_min); //backward pass - memset(rightPassBuf,0,Da*sizeof(CostType)); + mem.clearRightPassBuf(); prev_min=0; for (int x=width1*Da;x>=Da;x-=Da) { - accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost); + accumulateCostsRight(mem, x, prev_min, best_d, min_cost); if(uniquenessRatio>0) { @@ -2074,10 +2119,10 @@ void SGBM3WayMainLoop::impl(const Range& range) const d = best_d; int _x2 = x/Da - 1 + minX1 - d - minD; - if( _x2>=0 && _x2 min_cost ) + if( _x2>=0 && _x2 min_cost ) { - disp2CostBuf[_x2] = min_cost; - disp2Buf[_x2] = (short)(d + minD); + mem.disp2CostBuf[_x2] = min_cost; + mem.disp2Buf[_x2] = (short)(d + minD); } if( 0 < d && d < D-1 ) @@ -2104,32 +2149,27 @@ void SGBM3WayMainLoop::impl(const Range& range) const int _d = d1 >> StereoMatcher::DISP_SHIFT; int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT; int _x = x - _d, x_ = x - d_; - if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff && - 0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff ) + if( 0 <= _x && _x < width && mem.disp2Buf[_x] >= minD && std::abs(mem.disp2Buf[_x] - _d) > disp12MaxDiff && + 0 <= x_ && x_ < width && mem.disp2Buf[x_] >= minD && std::abs(mem.disp2Buf[x_] - d_) > disp12MaxDiff ) disp_row[x] = (short)INVALID_DISP_SCALED; } } } -static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2, - Mat& disp1, const StereoSGBMParams& params, - Mat* buffers, int nstripes ) +template +static void computeDisparity3WaySGBM(const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params) { - // precompute a lookup table for the raw matching cost computation: - const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; - PixType* clipTab = new PixType[TAB_SIZE]; - int ftzero = std::max(params.preFilterCap, 15) | 1; - for(int k = 0; k < TAB_SIZE; k++ ) - clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero); - // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap: int stripe_sz = (int)ceil(img1.rows/(double)nstripes); int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz); - Mat* dst_disp = new Mat[nstripes]; + Mat dst_disp[nstripes]; for(int i=0;i( left, right, disp, params ); else if(params.mode==MODE_HH4) - computeDisparitySGBM_HH4( left, right, disp, params, buffer ); + computeDisparitySGBM_HH4( left, right, disp, params ); else - computeDisparitySGBM( left, right, disp, params, buffer ); + computeDisparitySGBM( left, right, disp, params ); medianBlur(disp, disp, 3); @@ -2259,11 +2298,6 @@ public: StereoSGBMParams params; Mat buffer; - // the number of stripes is fixed, disregarding the number of threads/processors - // to make the results fully reproducible: - static const int num_stripes = 4; - Mat buffers[num_stripes]; - static const char* name_; }; diff --git a/modules/core/include/opencv2/core/utils/buffer_area.private.hpp b/modules/core/include/opencv2/core/utils/buffer_area.private.hpp index 141ad2c..ab19da6 100644 --- a/modules/core/include/opencv2/core/utils/buffer_area.private.hpp +++ b/modules/core/include/opencv2/core/utils/buffer_area.private.hpp @@ -74,6 +74,25 @@ public: allocate_((void**)(&ptr), static_cast(sizeof(T)), count, alignment); } + /** @brief Fill one of buffers with zeroes + + @param ptr pointer to memory block previously added using BufferArea::allocate + + BufferArea::commit must be called before using this method + */ + template + void zeroFill(T*&ptr) + { + CV_Assert(ptr); + zeroFill_((void**)&ptr); + } + + /** @brief Fill all buffers with zeroes + + BufferArea::commit must be called before using this method + */ + void zeroFill(); + /** @brief Allocate memory and initialize all bound pointers Each pointer bound to the area with the BufferArea::allocate will be initialized and will be set @@ -83,10 +102,18 @@ public: */ void commit(); + /** @brief Release all memory and unbind all pointers + + All memory will be freed and all pointers will be reset to NULL and untied from the area allowing + to call `allocate` and `commit` again. + */ + void release(); + private: BufferArea(const BufferArea &); // = delete BufferArea &operator=(const BufferArea &); // = delete void allocate_(void **ptr, ushort type_size, size_t count, ushort alignment); + void zeroFill_(void **ptr); private: class Block; diff --git a/modules/core/src/buffer_area.cpp b/modules/core/src/buffer_area.cpp index 2a41c72..b6bb321 100644 --- a/modules/core/src/buffer_area.cpp +++ b/modules/core/src/buffer_area.cpp @@ -66,6 +66,16 @@ public: *ptr = buf; return static_cast(static_cast(*ptr) + type_size * count); } + bool operator==(void **other) const + { + CV_Assert(ptr && other); + return *ptr == *other; + } + void zeroFill() const + { + CV_Assert(ptr && *ptr); + memset(static_cast(*ptr), 0, count * type_size); + } private: void **ptr; void * raw_mem; @@ -85,10 +95,7 @@ BufferArea::BufferArea(bool safe_) : BufferArea::~BufferArea() { - for(std::vector::const_iterator i = blocks.begin(); i != blocks.end(); ++i) - i->cleanup(); - if (oneBuf) - fastFree(oneBuf); + release(); } void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort alignment) @@ -100,6 +107,26 @@ void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort al totalSize += blocks.back().getByteCount(); } +void BufferArea::zeroFill_(void **ptr) +{ + for(std::vector::const_iterator i = blocks.begin(); i != blocks.end(); ++i) + { + if (*i == ptr) + { + i->zeroFill(); + break; + } + } +} + +void BufferArea::zeroFill() +{ + for(std::vector::const_iterator i = blocks.begin(); i != blocks.end(); ++i) + { + i->zeroFill(); + } +} + void BufferArea::commit() { if (!safe) @@ -116,6 +143,20 @@ void BufferArea::commit() } } +void BufferArea::release() +{ + for(std::vector::const_iterator i = blocks.begin(); i != blocks.end(); ++i) + { + i->cleanup(); + } + blocks.clear(); + if (oneBuf) + { + fastFree(oneBuf); + oneBuf = 0; + } +} + //================================================================================================== }} // cv::utils:: diff --git a/modules/core/test/test_utils.cpp b/modules/core/test/test_utils.cpp index 8789148..1a23e01 100644 --- a/modules/core/test/test_utils.cpp +++ b/modules/core/test/test_utils.cpp @@ -337,6 +337,21 @@ TEST_P(BufferArea, basic) ASSERT_TRUE(dbl_ptr != NULL); EXPECT_EQ((size_t)0, (size_t)int_ptr % sizeof(int)); EXPECT_EQ((size_t)0, (size_t)dbl_ptr % sizeof(double)); + for (size_t i = 0; i < SZ; ++i) + { + int_ptr[i] = (int)i + 1; + uchar_ptr[i] = (uchar)i + 1; + dbl_ptr[i] = (double)i + 1; + } + area.zeroFill(int_ptr); + area.zeroFill(uchar_ptr); + area.zeroFill(dbl_ptr); + for (size_t i = 0; i < SZ; ++i) + { + EXPECT_EQ((int)0, int_ptr[i]); + EXPECT_EQ((uchar)0, uchar_ptr[i]); + EXPECT_EQ((double)0, dbl_ptr[i]); + } } EXPECT_TRUE(int_ptr == NULL); EXPECT_TRUE(uchar_ptr == NULL); diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp index cb9ff25..7e29214 100644 --- a/modules/features2d/src/fast.cpp +++ b/modules/features2d/src/fast.cpp @@ -47,6 +47,7 @@ The references are: #include "opencl_kernels_features2d.hpp" #include "hal_replacement.hpp" #include "opencv2/core/hal/intrin.hpp" +#include "opencv2/core/utils/buffer_area.private.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" @@ -80,20 +81,26 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo for( i = -255; i <= 255; i++ ) threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0); - AutoBuffer _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128); - uchar* buf[3]; - buf[0] = _buf.data(); buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols; - int* cpbuf[3]; - cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1; - cpbuf[1] = cpbuf[0] + img.cols + 1; - cpbuf[2] = cpbuf[1] + img.cols + 1; - memset(buf[0], 0, img.cols*3); + uchar* buf[3] = { 0 }; + int* cpbuf[3] = { 0 }; + utils::BufferArea area; + for (unsigned idx = 0; idx < 3; ++idx) + { + area.allocate(buf[idx], img.cols); + area.allocate(cpbuf[idx], img.cols + 1); + } + area.commit(); + + for (unsigned idx = 0; idx < 3; ++idx) + { + memset(buf[idx], 0, img.cols); + } for(i = 3; i < img.rows-2; i++) { const uchar* ptr = img.ptr(i) + 3; uchar* curr = buf[(i - 3)%3]; - int* cornerpos = cpbuf[(i - 3)%3]; + int* cornerpos = cpbuf[(i - 3)%3] + 1; // cornerpos[-1] is used to store a value memset(curr, 0, img.cols); int ncorners = 0; @@ -266,7 +273,7 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo const uchar* prev = buf[(i - 4 + 3)%3]; const uchar* pprev = buf[(i - 5 + 3)%3]; - cornerpos = cpbuf[(i - 4 + 3)%3]; + cornerpos = cpbuf[(i - 4 + 3)%3] + 1; // cornerpos[-1] is used to store a value ncorners = cornerpos[-1]; for( k = 0; k < ncorners; k++ )