/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

/****************************************************************************************\
                                    Base Image Filter
\****************************************************************************************/
/*
 Various border types, image boundaries are denoted with '|'

 * BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
 * BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
 * BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
 * BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
 * BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii  with some specified 'i'
 */
int cv::borderInterpolate( int p, int len, int borderType )
{
    if( (unsigned)p < (unsigned)len )
        ;
    else if( borderType == BORDER_REPLICATE )
        p = p < 0 ? 0 : len - 1;
    else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
    {
        int delta = borderType == BORDER_REFLECT_101;
        if( len == 1 )
            return 0;
        do
        {
            if( p < 0 )
                p = -p - 1 + delta;
            else
                p = len - 1 - (p - len) - delta;
        }
        while( (unsigned)p >= (unsigned)len );
    }
    else if( borderType == BORDER_WRAP )
    {
        if( p < 0 )
            p -= ((p-len+1)/len)*len;
        if( p >= len )
            p %= len;
    }
    else if( borderType == BORDER_CONSTANT )
        p = -1;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
    return p;
}
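/*
 A minimal usage sketch (illustrative only): mapping the out-of-range column
 index -2 of a 5-pixel row back inside the image.

     int q;
     q = cv::borderInterpolate(-2, 5, cv::BORDER_REPLICATE);   // q == 0
     q = cv::borderInterpolate(-2, 5, cv::BORDER_REFLECT);     // q == 1
     q = cv::borderInterpolate(-2, 5, cv::BORDER_REFLECT_101); // q == 2
     q = cv::borderInterpolate(-2, 5, cv::BORDER_WRAP);        // q == 3
     q = cv::borderInterpolate(-2, 5, cv::BORDER_CONSTANT);    // q == -1;
     // -1 tells the caller to substitute the constant border value itself
*/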
namespace cv
{

BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
BaseRowFilter::~BaseRowFilter() {}

BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
BaseColumnFilter::~BaseColumnFilter() {}
void BaseColumnFilter::reset() {}

BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
BaseFilter::~BaseFilter() {}
void BaseFilter::reset() {}
FilterEngine::FilterEngine()
{
    srcType = dstType = bufType = -1;
    rowBorderType = columnBorderType = BORDER_REPLICATE;
    bufStep = startY = startY0 = endY = rowCount = dstY = 0;
    maxWidth = 0;

    wholeSize = Size(-1,-1);
}


FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
                            const Ptr<BaseRowFilter>& _rowFilter,
                            const Ptr<BaseColumnFilter>& _columnFilter,
                            int _srcType, int _dstType, int _bufType,
                            int _rowBorderType, int _columnBorderType,
                            const Scalar& _borderValue )
{
    init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
         _rowBorderType, _columnBorderType, _borderValue);
}

FilterEngine::~FilterEngine()
{
}


void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
                         const Ptr<BaseRowFilter>& _rowFilter,
                         const Ptr<BaseColumnFilter>& _columnFilter,
                         int _srcType, int _dstType, int _bufType,
                         int _rowBorderType, int _columnBorderType,
                         const Scalar& _borderValue )
{
    _srcType = CV_MAT_TYPE(_srcType);
    _bufType = CV_MAT_TYPE(_bufType);
    _dstType = CV_MAT_TYPE(_dstType);

    srcType = _srcType;
    int srcElemSize = (int)getElemSize(srcType);
    dstType = _dstType;
    bufType = _bufType;

    filter2D = _filter2D;
    rowFilter = _rowFilter;
    columnFilter = _columnFilter;

    if( _columnBorderType < 0 )
        _columnBorderType = _rowBorderType;

    rowBorderType = _rowBorderType;
    columnBorderType = _columnBorderType;

    CV_Assert( columnBorderType != BORDER_WRAP );

    if( isSeparable() )
    {
        CV_Assert( !rowFilter.empty() && !columnFilter.empty() );
        ksize = Size(rowFilter->ksize, columnFilter->ksize);
        anchor = Point(rowFilter->anchor, columnFilter->anchor);
    }
    else
    {
        CV_Assert( bufType == srcType );
        ksize = filter2D->ksize;
        anchor = filter2D->anchor;
    }

    CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
               0 <= anchor.y && anchor.y < ksize.height );

    borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
    int borderLength = std::max(ksize.width - 1, 1);
    borderTab.resize(borderLength*borderElemSize);

    maxWidth = bufStep = 0;
    constBorderRow.clear();

    if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
    {
        constBorderValue.resize(srcElemSize*borderLength);
        scalarToRawData(_borderValue, &constBorderValue[0], srcType,
                        borderLength*CV_MAT_CN(srcType));
    }

    wholeSize = Size(-1,-1);
}
static const int VEC_ALIGN = CV_MALLOC_ALIGN;

int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
{
    int i, j;

    wholeSize = _wholeSize;
    roi = _roi;
    CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
               roi.x + roi.width <= wholeSize.width &&
               roi.y + roi.height <= wholeSize.height );

    int esz = (int)getElemSize(srcType);
    int bufElemSize = (int)getElemSize(bufType);
    const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;

    if( _maxBufRows < 0 )
        _maxBufRows = ksize.height + 3;
    _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);

    if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
    {
        rows.resize(_maxBufRows);
        maxWidth = std::max(maxWidth, roi.width);
        int cn = CV_MAT_CN(srcType);
        srcRow.resize(esz*(maxWidth + ksize.width - 1));
        if( columnBorderType == BORDER_CONSTANT )
        {
            constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN));
            uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
            int n = (int)constBorderValue.size(), N;
            N = (maxWidth + ksize.width - 1)*esz;
            tdst = isSeparable() ? &srcRow[0] : dst;

            for( i = 0; i < N; i += n )
            {
                n = std::min( n, N - i );
                for(j = 0; j < n; j++)
                    tdst[i+j] = constVal[j];
            }

            if( isSeparable() )
                (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
        }

        int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
            (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
        ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
    }

    // adjust bufstep so that the used part of the ring buffer stays compact in memory
    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);

    dx1 = std::max(anchor.x - roi.x, 0);
    dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);

    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_CONSTANT )
        {
            int nr = isSeparable() ? 1 : (int)rows.size();
            for( i = 0; i < nr; i++ )
            {
                uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
                memcpy( dst, constVal, dx1*esz );
                memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
            }
        }
        else
        {
            int xofs1 = std::min(roi.x, anchor.x) - roi.x;

            int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
            int* btab = (int*)&borderTab[0];

            for( i = 0; i < dx1; i++ )
            {
                int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[i*btab_esz + j] = p0 + j;
            }

            for( i = 0; i < dx2; i++ )
            {
                int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[(i + dx1)*btab_esz + j] = p0 + j;
            }
        }
    }

    rowCount = dstY = 0;
    startY = startY0 = std::max(roi.y - anchor.y, 0);
    endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
    if( !columnFilter.empty() )
        columnFilter->reset();
    if( !filter2D.empty() )
        filter2D->reset();

    return startY;
}
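/*
 Illustration (hypothetical numbers): with ksize.width == 5 and anchor.x == 2,
 a ROI starting at roi.x == 0 gives dx1 == 2 -- two border columns must be
 synthesized on the left of every row; dx2 plays the same role on the right
 edge. For non-constant borders, borderTab caches the source offset of each
 border pixel (computed once via borderInterpolate) so that proceed() can fill
 row borders with plain table lookups instead of re-interpolating per row.
*/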
int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
                        bool isolated, int maxBufRows)
{
    Rect srcRoi = _srcRoi;

    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
               srcRoi.width >= 0 && srcRoi.height >= 0 &&
               srcRoi.x + srcRoi.width <= src.cols &&
               srcRoi.y + srcRoi.height <= src.rows );

    Point ofs;
    Size wholeSize(src.cols, src.rows);
    if( !isolated )
        src.locateROI( wholeSize, ofs );
    start( wholeSize, srcRoi + ofs, maxBufRows );

    return startY - ofs.y;
}
int FilterEngine::remainingInputRows() const
{
    return endY - startY - rowCount;
}

int FilterEngine::remainingOutputRows() const
{
    return roi.height - dstY;
}
int FilterEngine::proceed( const uchar* src, int srcstep, int count,
                           uchar* dst, int dststep )
{
    CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );

    const int *btab = &borderTab[0];
    int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
    uchar** brows = &rows[0];
    int bufRows = (int)rows.size();
    int cn = CV_MAT_CN(bufType);
    int width = roi.width, kwidth = ksize.width;
    int kheight = ksize.height, ay = anchor.y;
    int _dx1 = dx1, _dx2 = dx2;
    int width1 = roi.width + kwidth - 1;
    int xofs1 = std::min(roi.x, anchor.x);
    bool isSep = isSeparable();
    bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
    int dy = 0, i = 0;

    src -= xofs1*esz;
    count = std::min(count, remainingInputRows());

    CV_Assert( src && dst && count > 0 );

    for(;; dst += dststep*i, dy += i)
    {
        int dcount = bufRows - ay - startY - rowCount + roi.y;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcstep )
        {
            int bi = (startY - startY0 + rowCount) % bufRows;
            uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            uchar* row = isSep ? &srcRow[0] : brow;

            if( ++rowCount > bufRows )
            {
                --rowCount;
                ++startY;
            }

            memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );

            if( makeBorder )
            {
                if( btab_esz*(int)sizeof(int) == esz )
                {
                    const int* isrc = (const int*)src;
                    int* irow = (int*)row;

                    for( i = 0; i < _dx1*btab_esz; i++ )
                        irow[i] = isrc[btab[i]];
                    for( i = 0; i < _dx2*btab_esz; i++ )
                        irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
                }
                else
                {
                    for( i = 0; i < _dx1*esz; i++ )
                        row[i] = src[btab[i]];
                    for( i = 0; i < _dx2*esz; i++ )
                        row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
                }
            }

            if( isSep )
                (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
        }

        int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
                                         wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
            else
            {
                CV_Assert( srcY >= startY );
                if( srcY >= startY + rowCount )
                    break;
                int bi = (srcY - startY0) % bufRows;
                brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            }
        }
        if( i < kheight )
            break;
        i -= kheight - 1;
        if( isSeparable() )
            (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
        else
            (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
    }

    dstY += dy;
    CV_Assert( dstY <= roi.height );

    return dy;
}
void FilterEngine::apply(const Mat& src, Mat& dst,
    const Rect& _srcRoi, Point dstOfs, bool isolated)
{
    CV_Assert( src.type() == srcType && dst.type() == dstType );

    Rect srcRoi = _srcRoi;
    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    if( srcRoi.area() == 0 )
        return;

    CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
               dstOfs.x + srcRoi.width <= dst.cols &&
               dstOfs.y + srcRoi.height <= dst.rows );

    int y = start(src, srcRoi, isolated);
    proceed( src.data + y*src.step, (int)src.step, endY - startY,
             dst.data + dstOfs.y*dst.step + dstOfs.x*dst.elemSize(), (int)dst.step );
}

}
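/*
 A minimal sketch of how an engine is typically driven (assumes the Gaussian
 factory declared in this module's headers; any factory returning
 Ptr<FilterEngine> is used the same way):

     Mat src(480, 640, CV_8UC3), dst(src.size(), src.type());
     Ptr<FilterEngine> f = createGaussianFilter(src.type(), Size(5,5), 1.2);
     f->apply(src, dst);   // start() + proceed() over the whole image

 apply() is a convenience wrapper; code that streams an image in horizontal
 strips can instead call start() once and then proceed() per strip.
*/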
/****************************************************************************************\
*                                 Separable linear filter                                *
\****************************************************************************************/
int cv::getKernelType(InputArray filter_kernel, Point anchor)
{
    Mat _kernel = filter_kernel.getMat();
    CV_Assert( _kernel.channels() == 1 );
    int i, sz = _kernel.rows*_kernel.cols;

    Mat kernel;
    _kernel.convertTo(kernel, CV_64F);

    const double* coeffs = (double*)kernel.data;
    double sum = 0;
    int type = KERNEL_SMOOTH + KERNEL_INTEGER;
    if( (_kernel.rows == 1 || _kernel.cols == 1) &&
        anchor.x*2 + 1 == _kernel.cols &&
        anchor.y*2 + 1 == _kernel.rows )
        type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);

    for( i = 0; i < sz; i++ )
    {
        double a = coeffs[i], b = coeffs[sz - i - 1];
        if( a != b )
            type &= ~KERNEL_SYMMETRICAL;
        if( a != -b )
            type &= ~KERNEL_ASYMMETRICAL;
        if( a < 0 )
            type &= ~KERNEL_SMOOTH;
        if( a != saturate_cast<int>(a) )
            type &= ~KERNEL_INTEGER;
        sum += a;
    }

    if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
        type &= ~KERNEL_SMOOTH;
    return type;
}
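/*
 A small illustration (assumes the standard getGaussianKernel factory and the
 KERNEL_* flags from this module's headers): a normalized Gaussian is smooth,
 symmetrical and non-integer, while a central-difference kernel {-1, 0, 1} is
 asymmetrical and integer:

     int t1 = getKernelType(getGaussianKernel(5, -1), Point(0, 2));
     // t1 == KERNEL_SMOOTH + KERNEL_SYMMETRICAL

     Mat d = (Mat_<double>(1,3) << -1, 0, 1);
     int t2 = getKernelType(d, Point(1, 0));
     // t2 == KERNEL_ASYMMETRICAL + KERNEL_INTEGER  (sum != 1, so not smooth)
*/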
namespace cv
{

struct RowNoVec
{
    RowNoVec() {}
    RowNoVec(const Mat&) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct ColumnNoVec
{
    ColumnNoVec() {}
    ColumnNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct SymmRowSmallNoVec
{
    SymmRowSmallNoVec() {}
    SymmRowSmallNoVec(const Mat&, int) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct SymmColumnSmallNoVec
{
    SymmColumnSmallNoVec() {}
    SymmColumnSmallNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct FilterNoVec
{
    FilterNoVec() {}
    FilterNoVec(const Mat&, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};
#if CV_SSE2

///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////

struct RowVec_8u32s
{
    RowVec_8u32s() { smallValues = false; }
    RowVec_8u32s( const Mat& _kernel )
    {
        kernel = _kernel;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = ((const int*)kernel.data)[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        const int* _kx = (const int*)kernel.data;
        width *= cn;

        if( smallValues )
        {
            for( ; i <= width - 16; i += 16 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
                __m128i x0, x1, x2, x3;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_loadu_si128((const __m128i*)src);
                    x2 = _mm_unpackhi_epi8(x0, z);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x3 = _mm_mulhi_epi16(x2, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    x2 = _mm_mullo_epi16(x2, f);

                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                    s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
                    s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
                    s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
                }

                _mm_store_si128((__m128i*)(dst + i), s0);
                _mm_store_si128((__m128i*)(dst + i + 4), s1);
                _mm_store_si128((__m128i*)(dst + i + 8), s2);
                _mm_store_si128((__m128i*)(dst + i + 12), s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_cvtsi32_si128(*(const int*)src);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                }
                _mm_store_si128((__m128i*)(dst + i), s0);
            }
        }
        return i;
    }

    Mat kernel;
    bool smallValues;
};

struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = ((const int*)kernel.data)[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = (const int*)kernel.data + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        __m128i z = _mm_setzero_si128();
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
                        y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
                        _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                            k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);
                    k2 = _mm_packs_epi32(k2, k2);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        t1 = _mm_mulhi_epi16(y0, k2);
                        t0 = _mm_mullo_epi16(y0, k2);
                        y0 = _mm_mullo_epi16(y1, k2);
                        y1 = _mm_mulhi_epi16(y1, k2);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0, y1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        x1 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(x0, x1);
                        z1 = _mm_unpackhi_epi16(x0, x1);

                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(y0, y1);
                        z3 = _mm_unpackhi_epi16(y0, y1);
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                        k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                        k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                k0 = _mm_packs_epi32(k0, k0);
                k1 = _mm_packs_epi32(k1, k1);
                k2 = _mm_packs_epi32(k2, k2);

                for( ; i <= width - 16; i += 16, src += 16 )
                {
                    __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                    x0 = _mm_loadu_si128((__m128i*)(src + cn));
                    x2 = _mm_loadu_si128((__m128i*)(src - cn));
                    y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                    x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));

                    x2 = _mm_mulhi_epi16(x0, k1);
                    x0 = _mm_mullo_epi16(x0, k1);
                    z0 = _mm_unpacklo_epi16(x0, x2);
                    z1 = _mm_unpackhi_epi16(x0, x2);
                    y1 = _mm_mulhi_epi16(y0, k1);
                    y0 = _mm_mullo_epi16(y0, k1);
                    z2 = _mm_unpacklo_epi16(y0, y1);
                    z3 = _mm_unpackhi_epi16(y0, y1);

                    x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
                    x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
                    y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                    y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                    t1 = _mm_mulhi_epi16(y0, k2);
                    t0 = _mm_mullo_epi16(y0, k2);
                    y0 = _mm_mullo_epi16(y1, k2);
                    y1 = _mm_mulhi_epi16(y1, k2);
                    z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                    z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                    z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                    z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                    _mm_store_si128((__m128i*)(dst + i), z0);
                    _mm_store_si128((__m128i*)(dst + i + 4), z1);
                    _mm_store_si128((__m128i*)(dst + i + 8), z2);
                    _mm_store_si128((__m128i*)(dst + i + 12), z3);
                }
            }
        }

        src -= (_ksize/2)*cn;
        kx -= _ksize/2;
        for( ; i <= width - 4; i += 4, src += 4 )
        {
            __m128i f, s0 = z, x0, x1;

            for( k = j = 0; k < _ksize; k++, j += cn )
            {
                f = _mm_cvtsi32_si128(kx[k]);
                f = _mm_shuffle_epi32(f, 0);
                f = _mm_packs_epi32(f, f);

                x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
                x0 = _mm_unpacklo_epi8(x0, z);
                x1 = _mm_mulhi_epi16(x0, f);
                x0 = _mm_mullo_epi16(x0, f);
                s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
            }
            _mm_store_si128((__m128i*)(dst + i), s0);
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};

struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const __m128i *S, *S2;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128i x0, x1;
                S = (const __m128i*)(src[0] + i);
                s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
                s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
                s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128i x0;
                __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128i x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, s0 = d4;
                __m128i x0;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        __m128 df4 = _mm_set1_ps(delta);
        __m128i d4 = _mm_cvtps_epi32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1;
                    __m128i x0, x1;
                    s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
                    s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);

                    x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3;
                    s0 = _mm_load_si128((__m128i*)(S2 + i));
                    s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S0 + i));
                    s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
                    s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = df4, s1 = df4;
                    __m128i x0, x1;
                    x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

/////////////////////////////////////// 16s //////////////////////////////////

struct RowVec_16s32f
{
    RowVec_16s32f() {}
    RowVec_16s32f( const Mat& _kernel )
    {
        kernel = _kernel;
        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !sse2_supported )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* _kx = (const float*)kernel.data;
        width *= cn;

        for( ; i <= width - 8; i += 8 )
        {
            const short* src = (const short*)_src + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                f = _mm_load_ss(_kx+k);
                f = _mm_shuffle_ps(f, f, 0);

                __m128i x0i = _mm_loadu_si128((const __m128i*)src);
                __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
                x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
                x0 = _mm_cvtepi32_ps(x0i);
                x1 = _mm_cvtepi32_ps(x1i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }

        return i;
    }

    Mat kernel;
    bool sse2_supported;
};

struct SymmColumnVec_32f16s
{
    SymmColumnVec_32f16s() { symmetryType=0; }
    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !sse2_supported )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        short* dst = (short*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                __m128i s1i = _mm_cvtps_epi32(s1);
                __m128i s2i = _mm_cvtps_epi32(s2);
                __m128i s3i = _mm_cvtps_epi32(s3);

                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);

                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128 x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                __m128i s1i = _mm_cvtps_epi32(s1);
                __m128i s2i = _mm_cvtps_epi32(s2);
                __m128i s3i = _mm_cvtps_epi32(s3);

                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, x0, s0 = d4;

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
    bool sse2_supported;
};

/////////////////////////////////////// 32f //////////////////////////////////

struct RowVec_32f
{
    RowVec_32f() {}
    RowVec_32f( const Mat& _kernel )
    {
        kernel = _kernel;
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* _kx = (const float*)kernel.data;
        width *= cn;

        for( ; i <= width - 8; i += 8 )
        {
            const float* src = (const float*)_src + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                f = _mm_load_ss(_kx+k);
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_ps(src);
                x1 = _mm_loadu_ps(src + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }

        return i;
    }

    Mat kernel;
};

struct SymmRowSmallVec_32f
{
    SymmRowSmallVec_32f() {}
    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* src = (const float*)_src + (_ksize/2)*cn;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float* kx = (const float*)kernel.data + _ksize/2;
        width *= cn;

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
                        y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn*2);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn*2);
                        y0 = _mm_loadu_ps(src - cn*2 + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn*2 + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));

                        x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                        y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);
                        x0 = _mm_sub_ps(x0, x2);
                        y0 = _mm_sub_ps(y0, y2);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);

                        x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                for( ; i <= width - 8; i += 8, src += 8 )
                {
                    __m128 x0, x2, y0, y2;
                    x0 = _mm_loadu_ps(src + cn);
                    x2 = _mm_loadu_ps(src - cn);
                    y0 = _mm_loadu_ps(src + cn + 4);
                    y2 = _mm_loadu_ps(src - cn + 4);

                    x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                    y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);

                    x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                    y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                    x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                    y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            }
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
};

struct SymmColumnVec_32f
{
    SymmColumnVec_32f() { symmetryType=0; }
    SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);

                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128 x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, x0, s0 = d4;

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

struct SymmColumnSmallVec_32f
{
    SymmColumnSmallVec_32f() { symmetryType=0; }
    SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, x0, x1;
                    s0 = _mm_load_ps(S1 + i);
                    s1 = _mm_load_ps(S1 + i + 4);
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
                    x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
                    x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3;
                    s0 = _mm_load_ps(S2 + i);
                    s1 = _mm_load_ps(S2 + i + 4);
                    s2 = _mm_load_ps(S0 + i);
                    s3 = _mm_load_ps(S0 + i + 4);
                    s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
                    s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = d4, s1 = d4, x0, x1;
                    x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
                    x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

/////////////////////////////// non-separable filters ///////////////////////////////

///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////

struct FilterVec_8u
{
    FilterVec_8u() {}
    FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            x0 = _mm_packus_epi16(x0, x1);
            _mm_storeu_si128((__m128i*)(dst + i), x0);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
        }

        return i;
    }

    int _nz;
    vector<uchar> coeffs;
    float delta;
};

struct FilterVec_8u16s
{
    FilterVec_8u16s() {}
    FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        short* dst = (short*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            _mm_storeu_si128((__m128i*)(dst + i), x0);
            _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            _mm_storel_epi64((__m128i*)(dst + i), x0);
        }

        return i;
    }

    int _nz;
    vector<uchar> coeffs;
    float delta;
};

struct FilterVec_32f
{
    FilterVec_32f() {}
    FilterVec_32f(const Mat& _kernel, int, double _delta)
    {
        delta = (float)_delta;
        vector<Point> coords;
        preprocess2DKernel(_kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        const float** src = (const float**)_src;
        float* dst = (float*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);
                const float* S = src[k] + i;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src[k] + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            _mm_storeu_ps(dst + i, s0);
        }

        return i;
    }

    int _nz;
    vector<uchar> coeffs;
    float delta;
};
#else

typedef RowNoVec RowVec_8u32s;
typedef RowNoVec RowVec_16s32f;
typedef RowNoVec RowVec_32f;
typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
typedef ColumnNoVec SymmColumnVec_32s8u;
typedef ColumnNoVec SymmColumnVec_32f16s;
typedef ColumnNoVec SymmColumnVec_32f;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
typedef FilterNoVec FilterVec_8u;
typedef FilterNoVec FilterVec_8u16s;
typedef FilterNoVec FilterVec_32f;

#endif
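
/*
    A note on the *Vec functor convention used throughout this file: every
    row/column/filter vec-op is invoked as "i = vecOp(src, dst, width)" and
    returns the number of elements it has already written, so the scalar loops
    that follow only process the remaining tail. The NoVec placeholders above
    simply return 0, which makes the scalar code handle everything. A minimal
    custom functor honoring the same contract would look like this
    (illustrative sketch only, not part of the library):

        struct MyNoVec
        {
            int operator()(const uchar**, uchar*, int) const { return 0; }
        };
*/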
template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
{
    RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
    {
        if( _kernel.isContinuous() )
            kernel = _kernel;
        else
            _kernel.copyTo(kernel);
        anchor = _anchor;
        ksize = kernel.rows + kernel.cols - 1;
        CV_Assert( kernel.type() == DataType<DT>::type &&
                   (kernel.rows == 1 || kernel.cols == 1));
        vecOp = _vecOp;
    }

    void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        int _ksize = ksize;
        const DT* kx = (const DT*)kernel.data;
        const ST* S;
        DT* D = (DT*)dst;
        int i, k;

        i = vecOp(src, dst, width, cn);
        width *= cn;
        #if CV_ENABLE_UNROLLED
        for( ; i <= width - 4; i += 4 )
        {
            S = (const ST*)src + i;
            DT f = kx[0];
            DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];

            for( k = 1; k < _ksize; k++ )
            {
                S += cn;
                f = kx[k];
                s0 += f*S[0]; s1 += f*S[1];
                s2 += f*S[2]; s3 += f*S[3];
            }

            D[i] = s0; D[i+1] = s1;
            D[i+2] = s2; D[i+3] = s3;
        }
        #endif
        for( ; i < width; i++ )
        {
            S = (const ST*)src + i;
            DT s0 = kx[0]*S[0];
            for( k = 1; k < _ksize; k++ )
            {
                S += cn;
                s0 += kx[k]*S[0];
            }
            D[i] = s0;
        }
    }

    Mat kernel;
    VecOp vecOp;
};
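
/*
    The row pass is a plain 1D correlation with a channel-interleaved stride:
    for a 3-tap kernel kx = {1, 2, 1} on a single-channel row S, each output is

        D[i] = 1*S[i] + 2*S[i+1] + 1*S[i+2]

    No anchor arithmetic appears here because the border stage of FilterEngine
    hands in rows that are already padded; for cn > 1 the same kernel is
    applied per channel, hence the "S += cn" stride above.
*/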
template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
    public RowFilter<ST, DT, VecOp>
{
    SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
                        const VecOp& _vecOp = VecOp())
        : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
    {
        symmetryType = _symmetryType;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
    }

    void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
        const DT* kx = (const DT*)this->kernel.data + ksize2;
        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
        DT* D = (DT*)dst;
        int i = this->vecOp(src, dst, width, cn), j, k;
        const ST* S = (const ST*)src + i + ksize2n;
        width *= cn;

        if( symmetrical )
        {
            if( this->ksize == 1 && kx[0] == 1 )
            {
                for( ; i <= width - 2; i += 2 )
                {
                    DT s0 = S[i], s1 = S[i+1];
                    D[i] = s0; D[i+1] = s1;
                }
                S += i;
            }
            else if( this->ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
                        D[i] = s0; D[i+1] = s1;
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
                        D[i] = s0; D[i+1] = s1;
                    }
                else
                {
                    DT k0 = kx[0], k1 = kx[1];
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
                        D[i] = s0; D[i+1] = s1;
                    }
                }
            }
            else if( this->ksize == 5 )
            {
                DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
                if( k0 == -2 && k1 == 0 && k2 == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
                        DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
                        D[i] = s0; D[i+1] = s1;
                    }
                else
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
                        DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
                        D[i] = s0; D[i+1] = s1;
                    }
            }

            for( ; i < width; i++, S++ )
            {
                DT s0 = kx[0]*S[0];
                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                    s0 += kx[k]*(S[j] + S[-j]);
                D[i] = s0;
            }
        }
        else
        {
            if( this->ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
                        D[i] = s0; D[i+1] = s1;
                    }
                else
                {
                    DT k1 = kx[1];
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
                        D[i] = s0; D[i+1] = s1;
                    }
                }
            }
            else if( this->ksize == 5 )
            {
                DT k1 = kx[1], k2 = kx[2];
                for( ; i <= width - 2; i += 2, S += 2 )
                {
                    DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
                    DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
                    D[i] = s0; D[i+1] = s1;
                }
            }

            for( ; i < width; i++, S++ )
            {
                DT s0 = kx[0]*S[0];
                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                    s0 += kx[k]*(S[j] - S[-j]);
                D[i] = s0;
            }
        }
    }

    int symmetryType;
};
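
/*
    The small symmetric/antisymmetric row filters above exploit the identity

        sum_{k=-n..n} kx[k]*S[k*cn]
            = kx[0]*S[0] + sum_{k=1..n} kx[k]*(S[k*cn] + S[-k*cn])   (symmetric)
            = sum_{k=1..n} kx[k]*(S[k*cn] - S[-k*cn])                (antisymmetric, kx[0] == 0)

    which halves the number of multiplications; e.g. the Sobel derivative row
    (-1, 0, 1) reduces to a single subtraction per output sample.
*/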
template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
{
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    ColumnFilter( const Mat& _kernel, int _anchor,
        double _delta, const CastOp& _castOp=CastOp(),
        const VecOp& _vecOp=VecOp() )
    {
        if( _kernel.isContinuous() )
            kernel = _kernel;
        else
            _kernel.copyTo(kernel);
        anchor = _anchor;
        ksize = kernel.rows + kernel.cols - 1;
        delta = saturate_cast<ST>(_delta);
        castOp0 = _castOp;
        vecOp = _vecOp;
        CV_Assert( kernel.type() == DataType<ST>::type &&
                   (kernel.rows == 1 || kernel.cols == 1));
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        const ST* ky = (const ST*)kernel.data;
        ST _delta = delta;
        int _ksize = ksize;
        int i, k;
        CastOp castOp = castOp0;

        for( ; count--; dst += dststep, src++ )
        {
            DT* D = (DT*)dst;
            i = vecOp(src, dst, width);
            #if CV_ENABLE_UNROLLED
            for( ; i <= width - 4; i += 4 )
            {
                ST f = ky[0];
                const ST* S = (const ST*)src[0] + i;
                ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
                    s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;

                for( k = 1; k < _ksize; k++ )
                {
                    S = (const ST*)src[k] + i; f = ky[k];
                    s0 += f*S[0]; s1 += f*S[1];
                    s2 += f*S[2]; s3 += f*S[3];
                }

                D[i] = castOp(s0); D[i+1] = castOp(s1);
                D[i+2] = castOp(s2); D[i+3] = castOp(s3);
            }
            #endif
            for( ; i < width; i++ )
            {
                ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
                for( k = 1; k < _ksize; k++ )
                    s0 += ky[k]*((const ST*)src[k])[i];
                D[i] = castOp(s0);
            }
        }
    }

    Mat kernel;
    CastOp castOp0;
    VecOp vecOp;
    ST delta;
};
template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
{
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    SymmColumnFilter( const Mat& _kernel, int _anchor,
        double _delta, int _symmetryType,
        const CastOp& _castOp=CastOp(),
        const VecOp& _vecOp=VecOp())
        : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
    {
        symmetryType = _symmetryType;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int ksize2 = this->ksize/2;
        const ST* ky = (const ST*)this->kernel.data + ksize2;
        int i, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        ST _delta = this->delta;
        CastOp castOp = this->castOp0;
        src += ksize2;

        if( symmetrical )
        {
            for( ; count--; dst += dststep, src++ )
            {
                DT* D = (DT*)dst;
                i = (this->vecOp)(src, dst, width);
                #if CV_ENABLE_UNROLLED
                for( ; i <= width - 4; i += 4 )
                {
                    ST f = ky[0];
                    const ST* S = (const ST*)src[0] + i, *S2;
                    ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
                        s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;

                    for( k = 1; k <= ksize2; k++ )
                    {
                        S = (const ST*)src[k] + i;
                        S2 = (const ST*)src[-k] + i;
                        f = ky[k];
                        s0 += f*(S[0] + S2[0]);
                        s1 += f*(S[1] + S2[1]);
                        s2 += f*(S[2] + S2[2]);
                        s3 += f*(S[3] + S2[3]);
                    }

                    D[i] = castOp(s0); D[i+1] = castOp(s1);
                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);
                }
                #endif
                for( ; i < width; i++ )
                {
                    ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
                    for( k = 1; k <= ksize2; k++ )
                        s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
                    D[i] = castOp(s0);
                }
            }
        }
        else
        {
            for( ; count--; dst += dststep, src++ )
            {
                DT* D = (DT*)dst;
                i = this->vecOp(src, dst, width);
                #if CV_ENABLE_UNROLLED
                for( ; i <= width - 4; i += 4 )
                {
                    ST f = ky[1];
                    const ST *S, *S2;
                    ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

                    for( k = 1; k <= ksize2; k++ )
                    {
                        S = (const ST*)src[k] + i;
                        S2 = (const ST*)src[-k] + i;
                        f = ky[k];
                        s0 += f*(S[0] - S2[0]);
                        s1 += f*(S[1] - S2[1]);
                        s2 += f*(S[2] - S2[2]);
                        s3 += f*(S[3] - S2[3]);
                    }

                    D[i] = castOp(s0); D[i+1] = castOp(s1);
                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);
                }
                #endif
                for( ; i < width; i++ )
                {
                    ST s0 = _delta;
                    for( k = 1; k <= ksize2; k++ )
                        s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
                    D[i] = castOp(s0);
                }
            }
        }
    }

    int symmetryType;
};
template<class CastOp, class VecOp>
struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
{
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
                           double _delta, int _symmetryType,
                           const CastOp& _castOp=CastOp(),
                           const VecOp& _vecOp=VecOp())
        : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
    {
        CV_Assert( this->ksize == 3 );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int ksize2 = this->ksize/2;
        const ST* ky = (const ST*)this->kernel.data + ksize2;
        int i;
        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
        bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
        bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
        bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1);
        ST f0 = ky[0], f1 = ky[1];
        ST _delta = this->delta;
        CastOp castOp = this->castOp0;
        src += ksize2;

        for( ; count--; dst += dststep, src++ )
        {
            DT* D = (DT*)dst;
            i = (this->vecOp)(src, dst, width);
            const ST* S0 = (const ST*)src[-1];
            const ST* S1 = (const ST*)src[0];
            const ST* S2 = (const ST*)src[1];

            if( symmetrical )
            {
                if( is_1_2_1 )
                {
                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
                        ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
                        s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
                    #endif
                    for( ; i < width; i ++ )
                    {
                        ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
                        D[i] = castOp(s0);
                    }
                }
                else if( is_1_m2_1 )
                {
                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
                        ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
                        s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
                    #endif
                    for( ; i < width; i ++ )
                    {
                        ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
                        D[i] = castOp(s0);
                    }
                }
                else
                {
                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
                        ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
                        s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
                    #endif
                    for( ; i < width; i ++ )
                    {
                        ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
                        D[i] = castOp(s0);
                    }
                }
                for( ; i < width; i++ )
                    D[i] = castOp((S0[i] + S2[i])*f1 + S1[i]*f0 + _delta);
            }
            else
            {
                if( is_m1_0_1 )
                {
                    if( f1 < 0 )
                        std::swap(S0, S2);
                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = S2[i] - S0[i] + _delta;
                        ST s1 = S2[i+1] - S0[i+1] + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = S2[i+2] - S0[i+2] + _delta;
                        s1 = S2[i+3] - S0[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
                    #endif
                    for( ; i < width; i ++ )
                    {
                        ST s0 = S2[i] - S0[i] + _delta;
                        D[i] = castOp(s0);
                    }
                    if( f1 < 0 )
                        std::swap(S0, S2);
                }
                else
                {
                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = (S2[i] - S0[i])*f1 + _delta;
                        ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
                        s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
                    #endif
                }
                for( ; i < width; i++ )
                    D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
            }
        }
    }
};
template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};

template<typename ST, typename DT> struct FixedPtCastEx
{
    typedef ST type1;
    typedef DT rtype;

    FixedPtCastEx() : SHIFT(0), DELTA(0) {}
    FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }

    int SHIFT, DELTA;
};

}
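
/*
    The fixed-point casts above undo the scaling applied to integer kernels:
    adding DELTA = 1 << (bits-1) before the shift turns truncation into
    round-to-nearest. For bits = 8, an accumulator value of 511 (1.996 in Q8)
    maps to

        (511 + 128) >> 8 = 2

    whereas a plain 511 >> 8 would truncate to 1.
*/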
cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
                                                   InputArray _kernel, int anchor,
                                                   int symmetryType )
{
    Mat kernel = _kernel.getMat();
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
    int cn = CV_MAT_CN(srcType);
    CV_Assert( cn == CV_MAT_CN(bufType) &&
               ddepth >= std::max(sdepth, CV_32S) &&
               kernel.type() == ddepth );
    int ksize = kernel.rows + kernel.cols - 1;

    if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
    {
        if( sdepth == CV_8U && ddepth == CV_32S )
            return Ptr<BaseRowFilter>(new SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s>
                (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)));
        if( sdepth == CV_32F && ddepth == CV_32F )
            return Ptr<BaseRowFilter>(new SymmRowSmallFilter<float, float, SymmRowSmallVec_32f>
                (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)));
    }

    if( sdepth == CV_8U && ddepth == CV_32S )
        return Ptr<BaseRowFilter>(new RowFilter<uchar, int, RowVec_8u32s>
            (kernel, anchor, RowVec_8u32s(kernel)));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<uchar, float, RowNoVec>(kernel, anchor));
    if( sdepth == CV_8U && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<uchar, double, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16U && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<ushort, float, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16U && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16S && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<short, float, RowVec_16s32f>
            (kernel, anchor, RowVec_16s32f(kernel)));
    if( sdepth == CV_16S && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
    if( sdepth == CV_32F && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<float, float, RowVec_32f>
            (kernel, anchor, RowVec_32f(kernel)));
    if( sdepth == CV_64F && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<double, double, RowNoVec>(kernel, anchor));

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d), and buffer format (=%d)",
        srcType, bufType));

    return Ptr<BaseRowFilter>(0);
}
cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType,
                                                         InputArray _kernel, int anchor,
                                                         int symmetryType, double delta,
                                                         int bits )
{
    Mat kernel = _kernel.getMat();
    int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(dstType);
    CV_Assert( cn == CV_MAT_CN(bufType) &&
               sdepth >= std::max(ddepth, CV_32S) &&
               kernel.type() == sdepth );

    if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
    {
        if( ddepth == CV_8U && sdepth == CV_32S )
            return Ptr<BaseColumnFilter>(new ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec>
                (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits)));
        if( ddepth == CV_8U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, uchar>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_8U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, uchar>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, ushort>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, ushort>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16S && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, short>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16S && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, short>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_32F && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, float>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_64F && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, double>, ColumnNoVec>(kernel, anchor, delta));
    }
    else
    {
        int ksize = kernel.rows + kernel.cols - 1;
        if( ksize == 3 )
        {
            if( ddepth == CV_8U && sdepth == CV_32S )
                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
                    FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
                    (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
                    SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
            if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<Cast<int, short>,
                    SymmColumnSmallVec_32s16s>(kernel, anchor, delta, symmetryType,
                    Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)));
            if( ddepth == CV_32F && sdepth == CV_32F )
                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
                    Cast<float, float>,SymmColumnSmallVec_32f>
                    (kernel, anchor, delta, symmetryType, Cast<float, float>(),
                    SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)));
        }
        if( ddepth == CV_8U && sdepth == CV_32S )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
                (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
                SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
        if( ddepth == CV_8U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, uchar>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_8U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, uchar>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, ushort>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, ushort>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16S && sdepth == CV_32S )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16S && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s>
                (kernel, anchor, delta, symmetryType, Cast<float, short>(),
                SymmColumnVec_32f16s(kernel, symmetryType, 0, delta)));
        if( ddepth == CV_16S && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_32F && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f>
                (kernel, anchor, delta, symmetryType, Cast<float, float>(),
                SymmColumnVec_32f(kernel, symmetryType, 0, delta)));
        if( ddepth == CV_64F && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, double>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
    }

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
        bufType, dstType));

    return Ptr<BaseColumnFilter>(0);
}
cv::Ptr<cv::FilterEngine> cv::createSeparableLinearFilter(
    int _srcType, int _dstType,
    InputArray __rowKernel, InputArray __columnKernel,
    Point _anchor, double _delta,
    int _rowBorderType, int _columnBorderType,
    const Scalar& _borderValue )
{
    Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat();
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );
    int rsize = _rowKernel.rows + _rowKernel.cols - 1;
    int csize = _columnKernel.rows + _columnKernel.cols - 1;
    if( _anchor.x < 0 )
        _anchor.x = rsize/2;
    if( _anchor.y < 0 )
        _anchor.y = csize/2;
    int rtype = getKernelType(_rowKernel,
        _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
    int ctype = getKernelType(_columnKernel,
        _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
    Mat rowKernel, columnKernel;

    int bdepth = std::max(CV_32F,std::max(sdepth, ddepth));
    int bits = 0;

    if( sdepth == CV_8U &&
        ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
          ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
          ddepth == CV_8U) ||
         ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
          (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
          (rtype & ctype & KERNEL_INTEGER) &&
          ddepth == CV_16S)) )
    {
        bdepth = CV_32S;
        bits = ddepth == CV_8U ? 8 : 0;
        _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
        _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
        bits *= 2;
        _delta *= (1 << bits);
    }
    else
    {
        if( _rowKernel.type() != bdepth )
            _rowKernel.convertTo( rowKernel, bdepth );
        else
            rowKernel = _rowKernel;
        if( _columnKernel.type() != bdepth )
            _columnKernel.convertTo( columnKernel, bdepth );
        else
            columnKernel = _columnKernel;
    }

    int _bufType = CV_MAKETYPE(bdepth, cn);
    Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
        _srcType, _bufType, rowKernel, _anchor.x, rtype);
    Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
        _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );

    return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(0), _rowFilter, _columnFilter,
        _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
}
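
/*
    Typical use of the separable pipeline (illustrative sketch; the 3x3
    Sobel-style kernels below are spelled out by hand rather than taken from
    getDerivKernels, and the input file name is assumed):

        Mat src = imread("image.png", 0), dst;
        Mat kx = (Mat_<float>(1,3) << -1, 0, 1);    // derivative along rows
        Mat ky = (Mat_<float>(3,1) << 1, 2, 1);     // smoothing along columns
        Ptr<FilterEngine> fe = createSeparableLinearFilter(
            CV_8U, CV_32F, kx, ky, Point(-1,-1), 0, BORDER_DEFAULT );
        dst.create(src.size(), CV_32F);
        fe->apply(src, dst);
*/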
/****************************************************************************************\
*                               Non-separable linear filter                              *
\****************************************************************************************/
namespace cv
{

void preprocess2DKernel( const Mat& kernel, vector<Point>& coords, vector<uchar>& coeffs )
{
    int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
    if( nz == 0 )
        nz = 1;
    CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
    coords.resize(nz);
    coeffs.resize(nz*getElemSize(ktype));
    uchar* _coeffs = &coeffs[0];

    for( i = k = 0; i < kernel.rows; i++ )
    {
        const uchar* krow = kernel.data + kernel.step*i;
        for( j = 0; j < kernel.cols; j++ )
        {
            if( ktype == CV_8U )
            {
                uchar val = krow[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                _coeffs[k++] = val;
            }
            else if( ktype == CV_32S )
            {
                int val = ((const int*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((int*)_coeffs)[k++] = val;
            }
            else if( ktype == CV_32F )
            {
                float val = ((const float*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((float*)_coeffs)[k++] = val;
            }
            else
            {
                double val = ((const double*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((double*)_coeffs)[k++] = val;
            }
        }
    }
}
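
/*
    preprocess2DKernel flattens a 2D kernel into the list of its non-zero taps.
    For the cross-shaped kernel

        0 1 0
        1 1 1        (CV_8U)
        0 1 0

    it produces coords = {(1,0), (0,1), (1,1), (2,1), (1,2)} (in (x,y) order)
    and five matching entries in coeffs, so Filter2D and the FilterVec_*
    helpers only touch the 5 contributing pixels instead of all 9.
*/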
template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
{
    typedef typename CastOp::type1 KT;
    typedef typename CastOp::rtype DT;

    Filter2D( const Mat& _kernel, Point _anchor,
        double _delta, const CastOp& _castOp=CastOp(),
        const VecOp& _vecOp=VecOp() )
    {
        anchor = _anchor;
        ksize = _kernel.size();
        delta = saturate_cast<KT>(_delta);
        castOp0 = _castOp;
        vecOp = _vecOp;
        CV_Assert( _kernel.type() == DataType<KT>::type );
        preprocess2DKernel( _kernel, coords, coeffs );
        ptrs.resize( coords.size() );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
    {
        KT _delta = delta;
        const Point* pt = &coords[0];
        const KT* kf = (const KT*)&coeffs[0];
        const ST** kp = (const ST**)&ptrs[0];
        int i, k, nz = (int)coords.size();
        CastOp castOp = castOp0;

        width *= cn;
        for( ; count > 0; count--, dst += dststep, src++ )
        {
            DT* D = (DT*)dst;

            for( k = 0; k < nz; k++ )
                kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;

            i = vecOp((const uchar**)kp, dst, width);
            #if CV_ENABLE_UNROLLED
            for( ; i <= width - 4; i += 4 )
            {
                KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

                for( k = 0; k < nz; k++ )
                {
                    const ST* sptr = kp[k] + i;
                    KT f = kf[k];
                    s0 += f*sptr[0];
                    s1 += f*sptr[1];
                    s2 += f*sptr[2];
                    s3 += f*sptr[3];
                }

                D[i] = castOp(s0); D[i+1] = castOp(s1);
                D[i+2] = castOp(s2); D[i+3] = castOp(s3);
            }
            #endif
            for( ; i < width; i++ )
            {
                KT s0 = _delta;
                for( k = 0; k < nz; k++ )
                    s0 += kf[k]*kp[k][i];
                D[i] = castOp(s0);
            }
        }
    }

    vector<Point> coords;
    vector<uchar> coeffs;
    vector<uchar*> ptrs;
    KT delta;
    CastOp castOp0;
    VecOp vecOp;
};
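
/*
    Filter2D evaluates the full 2D correlation row by row: for every output row
    it first gathers one source pointer per non-zero tap (kp[k] points into the
    tap's row, already offset by its x coordinate times cn), then accumulates
    nz multiply-adds per output pixel. The cost is O(nz * width) per row, which
    is why large kernels are routed to the DFT-based path in cv::filter2D
    below.
*/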
}

cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
                                            InputArray filter_kernel, Point anchor,
                                            double delta, int bits)
{
    Mat _kernel = filter_kernel.getMat();
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
    CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );

    anchor = normalizeAnchor(anchor, _kernel.size());

    /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
        return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u>
            (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
            FilterVec_8u(_kernel, bits, delta)));
    if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
        return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s>
            (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
            FilterVec_8u16s(_kernel, bits, delta)));*/

    kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
    Mat kernel;
    if( _kernel.type() == kdepth )
        kernel = _kernel;
    else
        _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);

    if( sdepth == CV_8U && ddepth == CV_8U )
        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, uchar>, FilterVec_8u>
            (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta)));
    if( sdepth == CV_8U && ddepth == CV_16U )
        return Ptr<BaseFilter>(new Filter2D<uchar,
            Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_8U && ddepth == CV_16S )
        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, short>, FilterVec_8u16s>
            (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta)));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<uchar,
            Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_8U && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<uchar,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    if( sdepth == CV_16U && ddepth == CV_16U )
        return Ptr<BaseFilter>(new Filter2D<ushort,
            Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16U && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<ushort,
            Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16U && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<ushort,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    if( sdepth == CV_16S && ddepth == CV_16S )
        return Ptr<BaseFilter>(new Filter2D<short,
            Cast<float, short>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16S && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<short,
            Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16S && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<short,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    if( sdepth == CV_32F && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<float, Cast<float, float>, FilterVec_32f>
            (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta)));
    if( sdepth == CV_64F && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<double,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d), and destination format (=%d)",
        srcType, dstType));

    return Ptr<BaseFilter>(0);
}
cv::Ptr<cv::FilterEngine> cv::createLinearFilter( int _srcType, int _dstType,
                                                  InputArray filter_kernel,
                                                  Point _anchor, double _delta,
                                                  int _rowBorderType, int _columnBorderType,
                                                  const Scalar& _borderValue )
{
    Mat _kernel = filter_kernel.getMat();
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );

    Mat kernel = _kernel;
    int bits = 0;

    /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
    if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
        _kernel.rows*_kernel.cols <= (1 << 10) )
    {
        bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
        _kernel.convertTo(kernel, CV_32S, 1 << bits);
    }*/

    Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
        kernel, _anchor, _delta, bits);

    return Ptr<FilterEngine>(new FilterEngine(_filter2D, Ptr<BaseRowFilter>(0),
        Ptr<BaseColumnFilter>(0), _srcType, _dstType, _srcType,
        _rowBorderType, _columnBorderType, _borderValue ));
}
void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
                   InputArray _kernel, Point anchor,
                   double delta, int borderType )
{
    Mat src = _src.getMat(), kernel = _kernel.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

#if CV_SSE2
    int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
        (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3)? 130 : 50;
#else
    int dft_filter_size = 50;
#endif

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();
    anchor = normalizeAnchor(anchor, kernel.size());

#ifdef HAVE_TEGRA_OPTIMIZATION
    if( tegra::filter2D(src, dst, kernel, anchor, delta, borderType) )
        return;
#endif

    if( kernel.cols*kernel.rows >= dft_filter_size )
    {
        Mat temp;
        if( src.data != dst.data )
            temp = dst;
        else
            temp.create(dst.size(), dst.type());
        crossCorr( src, kernel, temp, src.size(),
                   CV_MAKETYPE(ddepth, src.channels()),
                   anchor, delta, borderType );
        if( temp.data != dst.data )
            temp.copyTo(dst);
        return;
    }

    Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
                                             anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}
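
/*
    cv::filter2D is the public entry point: small kernels go through the
    FilterEngine pipeline above, large ones (kernel area >= dft_filter_size)
    through the DFT-based crossCorr. A typical call (illustrative only, file
    name assumed):

        Mat src = imread("input.png"), dst;
        Mat k = Mat::ones(3, 3, CV_32F) / 9.f;   // 3x3 box blur
        filter2D(src, dst, -1, k);               // ddepth = -1: keep src depth
*/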
void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                      InputArray _kernelX, InputArray _kernelY, Point anchor,
                      double delta, int borderType )
{
    Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();

    Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
        dst.type(), kernelX, kernelY, anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}
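
/*
    Separable filtering applies kernelX along the rows and kernelY along the
    columns, reducing an NxN kernel to 2N multiplies per pixel. An illustrative
    call using a Gaussian built with getGaussianKernel (parameter values are
    assumptions, not fixed by this file):

        Mat src = imread("image.png"), dst;
        Mat g = getGaussianKernel(5, 1.5, CV_32F);   // 5x1 column vector
        sepFilter2D(src, dst, -1, g.t(), g);         // row pass, then column pass
*/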
CV_IMPL void
cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat kernel = cv::cvarrToMat(_kernel);

    CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );

    cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
}

/* End of file. */