// modules/imgproc/src/filter.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

/****************************************************************************************\
                                    Base Image Filter
\****************************************************************************************/

/*
 Various border types, image boundaries are denoted with '|'

 * BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
 * BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
 * BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
 * BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
 * BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii  with some specified 'i'
 */
int cv::borderInterpolate( int p, int len, int borderType )
{
    if( (unsigned)p < (unsigned)len )
        ;
    else if( borderType == BORDER_REPLICATE )
        p = p < 0 ? 0 : len - 1;
    else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
    {
        int delta = borderType == BORDER_REFLECT_101;
        if( len == 1 )
            return 0;
        do
        {
            if( p < 0 )
                p = -p - 1 + delta;
            else
                p = len - 1 - (p - len) - delta;
        }
        while( (unsigned)p >= (unsigned)len );
    }
    else if( borderType == BORDER_WRAP )
    {
        if( p < 0 )
            p -= ((p-len+1)/len)*len;
        if( p >= len )
            p %= len;
    }
    else if( borderType == BORDER_CONSTANT )
        p = -1;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
    return p;
}
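
/*
 Usage sketch (illustrative only, not part of the library): borderInterpolate
 maps a possibly out-of-range coordinate p onto [0, len), or returns -1 for
 BORDER_CONSTANT when p is outside the image:

     int a = cv::borderInterpolate( -3, 10, cv::BORDER_REFLECT_101 ); // a == 3
     int b = cv::borderInterpolate( 12, 10, cv::BORDER_REPLICATE );   // b == 9
     int c = cv::borderInterpolate( -1, 10, cv::BORDER_WRAP );        // c == 9
     int d = cv::borderInterpolate(  5, 10, cv::BORDER_CONSTANT );    // d == 5 (in range, returned as-is)
*/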


namespace cv
{

BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
BaseRowFilter::~BaseRowFilter() {}

BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
BaseColumnFilter::~BaseColumnFilter() {}
void BaseColumnFilter::reset() {}

BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
BaseFilter::~BaseFilter() {}
void BaseFilter::reset() {}

FilterEngine::FilterEngine()
{
    srcType = dstType = bufType = -1;
    rowBorderType = columnBorderType = BORDER_REPLICATE;
    bufStep = startY = startY0 = endY = rowCount = dstY = 0;
    maxWidth = 0;

    wholeSize = Size(-1,-1);
}


FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
                            const Ptr<BaseRowFilter>& _rowFilter,
                            const Ptr<BaseColumnFilter>& _columnFilter,
                            int _srcType, int _dstType, int _bufType,
                            int _rowBorderType, int _columnBorderType,
                            const Scalar& _borderValue )
{
    init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
         _rowBorderType, _columnBorderType, _borderValue);
}

FilterEngine::~FilterEngine()
{
}


void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
                         const Ptr<BaseRowFilter>& _rowFilter,
                         const Ptr<BaseColumnFilter>& _columnFilter,
                         int _srcType, int _dstType, int _bufType,
                         int _rowBorderType, int _columnBorderType,
                         const Scalar& _borderValue )
{
    _srcType = CV_MAT_TYPE(_srcType);
    _bufType = CV_MAT_TYPE(_bufType);
    _dstType = CV_MAT_TYPE(_dstType);

    srcType = _srcType;
    int srcElemSize = (int)getElemSize(srcType);
    dstType = _dstType;
    bufType = _bufType;

    filter2D = _filter2D;
    rowFilter = _rowFilter;
    columnFilter = _columnFilter;

    if( _columnBorderType < 0 )
        _columnBorderType = _rowBorderType;

    rowBorderType = _rowBorderType;
    columnBorderType = _columnBorderType;

    CV_Assert( columnBorderType != BORDER_WRAP );

    if( isSeparable() )
    {
        CV_Assert( !rowFilter.empty() && !columnFilter.empty() );
        ksize = Size(rowFilter->ksize, columnFilter->ksize);
        anchor = Point(rowFilter->anchor, columnFilter->anchor);
    }
    else
    {
        CV_Assert( bufType == srcType );
        ksize = filter2D->ksize;
        anchor = filter2D->anchor;
    }

    CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
               0 <= anchor.y && anchor.y < ksize.height );

    borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
    int borderLength = std::max(ksize.width - 1, 1);
    borderTab.resize(borderLength*borderElemSize);

    maxWidth = bufStep = 0;
    constBorderRow.clear();

    if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
    {
        constBorderValue.resize(srcElemSize*borderLength);
        scalarToRawData(_borderValue, &constBorderValue[0], srcType,
                        borderLength*CV_MAT_CN(srcType));
    }

    wholeSize = Size(-1,-1);
}

static const int VEC_ALIGN = CV_MALLOC_ALIGN;

int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
{
    int i, j;

    wholeSize = _wholeSize;
    roi = _roi;
    CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
        roi.x + roi.width <= wholeSize.width &&
        roi.y + roi.height <= wholeSize.height );

    int esz = (int)getElemSize(srcType);
    int bufElemSize = (int)getElemSize(bufType);
    const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;

    if( _maxBufRows < 0 )
        _maxBufRows = ksize.height + 3;
    _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);

    if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
    {
        rows.resize(_maxBufRows);
        maxWidth = std::max(maxWidth, roi.width);
        int cn = CV_MAT_CN(srcType);
        srcRow.resize(esz*(maxWidth + ksize.width - 1));
        if( columnBorderType == BORDER_CONSTANT )
        {
            constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN));
            uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
            int n = (int)constBorderValue.size(), N;
            N = (maxWidth + ksize.width - 1)*esz;
            tdst = isSeparable() ? &srcRow[0] : dst;

            for( i = 0; i < N; i += n )
            {
                n = std::min( n, N - i );
                for(j = 0; j < n; j++)
                    tdst[i+j] = constVal[j];
            }

            if( isSeparable() )
                (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
        }

        int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
            (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
        ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
    }

    // adjust bufstep so that the used part of the ring buffer stays compact in memory
    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);

    dx1 = std::max(anchor.x - roi.x, 0);
    dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);

    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_CONSTANT )
        {
            int nr = isSeparable() ? 1 : (int)rows.size();
            for( i = 0; i < nr; i++ )
            {
                uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
                memcpy( dst, constVal, dx1*esz );
                memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
            }
        }
        else
        {
            int xofs1 = std::min(roi.x, anchor.x) - roi.x;

            int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
            int* btab = (int*)&borderTab[0];

            for( i = 0; i < dx1; i++ )
            {
                int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[i*btab_esz + j] = p0 + j;
            }

            for( i = 0; i < dx2; i++ )
            {
                int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[(i + dx1)*btab_esz + j] = p0 + j;
            }
        }
    }

    rowCount = dstY = 0;
    startY = startY0 = std::max(roi.y - anchor.y, 0);
    endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
    if( !columnFilter.empty() )
        columnFilter->reset();
    if( !filter2D.empty() )
        filter2D->reset();

    return startY;
}
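
/*
 Worked example of the startY/endY bookkeeping above (numbers are illustrative):
 for a 5x5 kernel with anchor (2,2) filtering roi = Rect(0,0,100,100) inside a
 100x100 image, startY = max(0 - 2, 0) = 0 and
 endY = min(0 + 100 + 5 - 2 - 1, 100) = 100, i.e. all 100 input rows must be fed
 to proceed(); the rows above/below the image are synthesized from the border.
*/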


int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
                        bool isolated, int maxBufRows)
{
    Rect srcRoi = _srcRoi;

    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
        srcRoi.width >= 0 && srcRoi.height >= 0 &&
        srcRoi.x + srcRoi.width <= src.cols &&
        srcRoi.y + srcRoi.height <= src.rows );

    Point ofs;
    Size wholeSize(src.cols, src.rows);
    if( !isolated )
        src.locateROI( wholeSize, ofs );
    start( wholeSize, srcRoi + ofs, maxBufRows );

    return startY - ofs.y;
}


int FilterEngine::remainingInputRows() const
{
    return endY - startY - rowCount;
}

int FilterEngine::remainingOutputRows() const
{
    return roi.height - dstY;
}

int FilterEngine::proceed( const uchar* src, int srcstep, int count,
                           uchar* dst, int dststep )
{
    CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );

    const int *btab = &borderTab[0];
    int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
    uchar** brows = &rows[0];
    int bufRows = (int)rows.size();
    int cn = CV_MAT_CN(bufType);
    int width = roi.width, kwidth = ksize.width;
    int kheight = ksize.height, ay = anchor.y;
    int _dx1 = dx1, _dx2 = dx2;
    int width1 = roi.width + kwidth - 1;
    int xofs1 = std::min(roi.x, anchor.x);
    bool isSep = isSeparable();
    bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
    int dy = 0, i = 0;

    src -= xofs1*esz;
    count = std::min(count, remainingInputRows());

    CV_Assert( src && dst && count > 0 );

    for(;; dst += dststep*i, dy += i)
    {
        int dcount = bufRows - ay - startY - rowCount + roi.y;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcstep )
        {
            int bi = (startY - startY0 + rowCount) % bufRows;
            uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            uchar* row = isSep ? &srcRow[0] : brow;

            if( ++rowCount > bufRows )
            {
                --rowCount;
                ++startY;
            }

            memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );

            if( makeBorder )
            {
                if( btab_esz*(int)sizeof(int) == esz )
                {
                    const int* isrc = (const int*)src;
                    int* irow = (int*)row;

                    for( i = 0; i < _dx1*btab_esz; i++ )
                        irow[i] = isrc[btab[i]];
                    for( i = 0; i < _dx2*btab_esz; i++ )
                        irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
                }
                else
                {
                    for( i = 0; i < _dx1*esz; i++ )
                        row[i] = src[btab[i]];
                    for( i = 0; i < _dx2*esz; i++ )
                        row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
                }
            }

            if( isSep )
                (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
        }

        int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
                            wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
            else
            {
                CV_Assert( srcY >= startY );
                if( srcY >= startY + rowCount )
                    break;
                int bi = (srcY - startY0) % bufRows;
                brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            }
        }
        if( i < kheight )
            break;
        i -= kheight - 1;
        if( isSeparable() )
            (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
        else
            (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
    }

    dstY += dy;
    CV_Assert( dstY <= roi.height );
    return dy;
}


void FilterEngine::apply(const Mat& src, Mat& dst,
    const Rect& _srcRoi, Point dstOfs, bool isolated)
{
    CV_Assert( src.type() == srcType && dst.type() == dstType );

    Rect srcRoi = _srcRoi;
    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    if( srcRoi.area() == 0 )
        return;

    CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
        dstOfs.x + srcRoi.width <= dst.cols &&
        dstOfs.y + srcRoi.height <= dst.rows );

    int y = start(src, srcRoi, isolated);
    proceed( src.data + y*src.step, (int)src.step, endY - startY,
             dst.data + dstOfs.y*dst.step + dstOfs.x*dst.elemSize(), (int)dst.step );
}

}
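
/*
 End-to-end usage sketch for the engine (assuming the createSeparableLinearFilter
 factory declared alongside this code in imgproc; illustrative, not normative):

     cv::Mat src(480, 640, CV_8UC1), dst(src.size(), src.type());
     cv::Mat k = cv::getGaussianKernel(5, -1, CV_32F);
     cv::Ptr<cv::FilterEngine> f =
         cv::createSeparableLinearFilter(src.type(), src.type(), k, k);
     f->apply(src, dst);  // runs start() + proceed() over the whole image
*/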

/****************************************************************************************\
*                                 Separable linear filter                                *
\****************************************************************************************/

int cv::getKernelType(InputArray filter_kernel, Point anchor)
{
    Mat _kernel = filter_kernel.getMat();
    CV_Assert( _kernel.channels() == 1 );
    int i, sz = _kernel.rows*_kernel.cols;

    Mat kernel;
    _kernel.convertTo(kernel, CV_64F);

    const double* coeffs = (double*)kernel.data;
    double sum = 0;
    int type = KERNEL_SMOOTH + KERNEL_INTEGER;
    if( (_kernel.rows == 1 || _kernel.cols == 1) &&
        anchor.x*2 + 1 == _kernel.cols &&
        anchor.y*2 + 1 == _kernel.rows )
        type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);

    for( i = 0; i < sz; i++ )
    {
        double a = coeffs[i], b = coeffs[sz - i - 1];
        if( a != b )
            type &= ~KERNEL_SYMMETRICAL;
        if( a != -b )
            type &= ~KERNEL_ASYMMETRICAL;
        if( a < 0 )
            type &= ~KERNEL_SMOOTH;
        if( a != saturate_cast<int>(a) )
            type &= ~KERNEL_INTEGER;
        sum += a;
    }

    if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
        type &= ~KERNEL_SMOOTH;
    return type;
}
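
/*
 Example (illustrative): for the 1x3 smoothing kernel (0.25, 0.5, 0.25) with
 anchor (1,0), the loop above keeps KERNEL_SYMMETRICAL (coeffs[i] == coeffs[sz-1-i]),
 clears KERNEL_ASYMMETRICAL and KERNEL_INTEGER, and keeps KERNEL_SMOOTH since all
 coefficients are non-negative and sum to 1, so the result is
 KERNEL_SMOOTH | KERNEL_SYMMETRICAL.
*/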


namespace cv
{

struct RowNoVec
{
    RowNoVec() {}
    RowNoVec(const Mat&) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct ColumnNoVec
{
    ColumnNoVec() {}
    ColumnNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct SymmRowSmallNoVec
{
    SymmRowSmallNoVec() {}
    SymmRowSmallNoVec(const Mat&, int) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct SymmColumnSmallNoVec
{
    SymmColumnSmallNoVec() {}
    SymmColumnSmallNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct FilterNoVec
{
    FilterNoVec() {}
    FilterNoVec(const Mat&, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};


#if CV_SSE2

///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////

struct RowVec_8u32s
{
    RowVec_8u32s() { smallValues = false; }
    RowVec_8u32s( const Mat& _kernel )
    {
        kernel = _kernel;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = ((const int*)kernel.data)[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        const int* _kx = (const int*)kernel.data;
        width *= cn;

        if( smallValues )
        {
            for( ; i <= width - 16; i += 16 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
                __m128i x0, x1, x2, x3;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_loadu_si128((const __m128i*)src);
                    x2 = _mm_unpackhi_epi8(x0, z);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x3 = _mm_mulhi_epi16(x2, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    x2 = _mm_mullo_epi16(x2, f);

                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                    s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
                    s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
                    s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
                }

                _mm_store_si128((__m128i*)(dst + i), s0);
                _mm_store_si128((__m128i*)(dst + i + 4), s1);
                _mm_store_si128((__m128i*)(dst + i + 8), s2);
                _mm_store_si128((__m128i*)(dst + i + 12), s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_cvtsi32_si128(*(const int*)src);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                }
                _mm_store_si128((__m128i*)(dst + i), s0);
            }
        }
        return i;
    }

    Mat kernel;
    bool smallValues;
};
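
/*
 Note on the scheme above: when all kernel values fit into 16 bits
 ("smallValues"), each signed 16x16-bit product is formed as a
 _mm_mullo_epi16 / _mm_mulhi_epi16 pair (low and high halves of the 32-bit
 product), and _mm_unpacklo_epi16/_mm_unpackhi_epi16 interleave the halves
 back into full 32-bit terms that are accumulated with _mm_add_epi32.
*/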


struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = ((const int*)kernel.data)[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = (const int*)kernel.data + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        __m128i z = _mm_setzero_si128();
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
                        y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
                        _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                            k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);
                    k2 = _mm_packs_epi32(k2, k2);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        t1 = _mm_mulhi_epi16(y0, k2);
                        t0 = _mm_mullo_epi16(y0, k2);
                        y0 = _mm_mullo_epi16(y1, k2);
                        y1 = _mm_mulhi_epi16(y1, k2);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0, y1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        x1 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(x0, x1);
                        z1 = _mm_unpackhi_epi16(x0, x1);

                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(y0, y1);
                        z3 = _mm_unpackhi_epi16(y0, y1);
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                        k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                        k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                k0 = _mm_packs_epi32(k0, k0);
                k1 = _mm_packs_epi32(k1, k1);
                k2 = _mm_packs_epi32(k2, k2);

                for( ; i <= width - 16; i += 16, src += 16 )
                {
                    __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                    x0 = _mm_loadu_si128((__m128i*)(src + cn));
                    x2 = _mm_loadu_si128((__m128i*)(src - cn));
                    y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                    x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));

                    x2 = _mm_mulhi_epi16(x0, k1);
                    x0 = _mm_mullo_epi16(x0, k1);
                    z0 = _mm_unpacklo_epi16(x0, x2);
                    z1 = _mm_unpackhi_epi16(x0, x2);
                    y1 = _mm_mulhi_epi16(y0, k1);
                    y0 = _mm_mullo_epi16(y0, k1);
                    z2 = _mm_unpacklo_epi16(y0, y1);
                    z3 = _mm_unpackhi_epi16(y0, y1);

                    x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
                    x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
                    y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                    y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                    t1 = _mm_mulhi_epi16(y0, k2);
                    t0 = _mm_mullo_epi16(y0, k2);
                    y0 = _mm_mullo_epi16(y1, k2);
                    y1 = _mm_mulhi_epi16(y1, k2);
                    z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                    z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                    z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                    z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                    _mm_store_si128((__m128i*)(dst + i), z0);
                    _mm_store_si128((__m128i*)(dst + i + 4), z1);
                    _mm_store_si128((__m128i*)(dst + i + 8), z2);
                    _mm_store_si128((__m128i*)(dst + i + 12), z3);
                }
            }
        }

        src -= (_ksize/2)*cn;
        kx -= _ksize/2;
        for( ; i <= width - 4; i += 4, src += 4 )
        {
            __m128i f, s0 = z, x0, x1;

            for( k = j = 0; k < _ksize; k++, j += cn )
            {
                f = _mm_cvtsi32_si128(kx[k]);
                f = _mm_shuffle_epi32(f, 0);
                f = _mm_packs_epi32(f, f);

                x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
                x0 = _mm_unpacklo_epi8(x0, z);
                x1 = _mm_mulhi_epi16(x0, f);
                x0 = _mm_mullo_epi16(x0, f);
                s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
            }
            _mm_store_si128((__m128i*)(dst + i), s0);
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};
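
/*
 The branches above special-case the small kernels that dominate in practice:
 the symmetric 3- and 5-tap kernels (1,2,1), (1,-2,1) and (1,0,-2,0,1) are
 computed with pure integer adds/subtracts, the anti-symmetric (-1,0,1) reduces
 to a single difference, and everything else falls through to the generic
 multiply-accumulate loop at the end.
*/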


struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const __m128i *S, *S2;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128i x0, x1;
                S = (const __m128i*)(src[0] + i);
                s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
                s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
                s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128i x0;
                __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128i x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, s0 = d4;
                __m128i x0;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
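
/*
 Fixed-point convention implied by the constructor above: the int32 row sums are
 assumed to have been produced with kernel values scaled by 2^_bits, so the
 column stage folds the 1/(1 << _bits) normalization and the delta offset into a
 float kernel; a single float multiply-add per tap then both filters and
 rescales before the result is packed back to 8 bits with saturation.
*/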


struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        __m128 df4 = _mm_set1_ps(delta);
        __m128i d4 = _mm_cvtps_epi32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1;
                    s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
                    s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
                    __m128i x0, x1;
                    x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3;
                    s0 = _mm_load_si128((__m128i*)(S2 + i));
                    s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S0 + i));
                    s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
                    s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = df4, s1 = df4;
                    __m128i x0, x1;
                    x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
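
/*
 As in the row case, this small-kernel column filter hard-codes the common
 3-tap cases: (1,2,1) and (1,-2,1) run entirely in int32 arithmetic, the
 anti-symmetric (-1,0,1)/(1,0,-1) pair (ky[1] == -ky[-1], |ky[1]| == 1)
 reduces to a single difference, and other 3-tap kernels take the float
 multiply-add path.
*/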


/////////////////////////////////////// 16s //////////////////////////////////

struct RowVec_16s32f
{
    RowVec_16s32f() {}
    RowVec_16s32f( const Mat& _kernel )
    {
        kernel = _kernel;
        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !sse2_supported )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* _kx = (const float*)kernel.data;
        width *= cn;

        for( ; i <= width - 8; i += 8 )
        {
            const short* src = (const short*)_src + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                f = _mm_load_ss(_kx+k);
                f = _mm_shuffle_ps(f, f, 0);

                __m128i x0i = _mm_loadu_si128((const __m128i*)src);
                __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
                x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
                x0 = _mm_cvtepi32_ps(x0i);
                x1 = _mm_cvtepi32_ps(x1i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }
        return i;
    }

    Mat kernel;
    bool sse2_supported;
};


struct SymmColumnVec_32f16s
{
    SymmColumnVec_32f16s() { symmetryType=0; }
    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !sse2_supported )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        short* dst = (short*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                __m128i s1i = _mm_cvtps_epi32(s1);
                __m128i s2i = _mm_cvtps_epi32(s2);
                __m128i s3i = _mm_cvtps_epi32(s3);

                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
1369                     S = src[k] + i;
1370                     S2 = src[-k] + i;
1371                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
1372                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1373                 }
1374                 
1375                 __m128i s0i = _mm_cvtps_epi32(s0);
1376                 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
1377             }
1378         }
1379         else
1380         {
1381             for( ; i <= width - 16; i += 16 )
1382             {
1383                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1384                 __m128 x0, x1;
1386                 
1387                 for( k = 1; k <= ksize2; k++ )
1388                 {
1389                     S = src[k] + i;
1390                     S2 = src[-k] + i;
1391                     f = _mm_load_ss(ky+k);
1392                     f = _mm_shuffle_ps(f, f, 0);
1393                     x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
1394                     x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
1395                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1396                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1397                     x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
1398                     x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
1399                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
1400                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
1401                 }
1402                 
1403                 __m128i s0i = _mm_cvtps_epi32(s0);
1404                 __m128i s1i = _mm_cvtps_epi32(s1);
1405                 __m128i s2i = _mm_cvtps_epi32(s2);
1406                 __m128i s3i = _mm_cvtps_epi32(s3);
1407                 
1408                 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
1409                 _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
1410             }
1411             
1412             for( ; i <= width - 4; i += 4 )
1413             {
1414                 __m128 f, x0, s0 = d4;
1415                 
1416                 for( k = 1; k <= ksize2; k++ )
1417                 {
1418                     f = _mm_load_ss(ky+k);
1419                     f = _mm_shuffle_ps(f, f, 0);
1420                     x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
1421                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1422                 }
1423                 
1424                 __m128i s0i = _mm_cvtps_epi32(s0);
1425                 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
1426             }
1427         }
1428         
1429         return i;
1430     }
1431     
1432     int symmetryType;
1433     float delta;
1434     Mat kernel;
1435     bool sse2_supported;
1436 };    
1437     
1438
1439 /////////////////////////////////////// 32f //////////////////////////////////
1440
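/* Generic single-precision row convolution: one broadcast tap per step and
   two 4-float accumulators per 8-element block. Needs only SSE1. */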
1441 struct RowVec_32f
1442 {
1443     RowVec_32f() {}
1444     RowVec_32f( const Mat& _kernel )
1445     {
1446         kernel = _kernel;
1447     }
1448
1449     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
1450     {
1451         if( !checkHardwareSupport(CV_CPU_SSE) )
1452             return 0;
1453         
1454         int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
1455         float* dst = (float*)_dst;
1456         const float* _kx = (const float*)kernel.data;
1457         width *= cn;
1458
1459         for( ; i <= width - 8; i += 8 )
1460         {
1461             const float* src = (const float*)_src + i;
1462             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
1463             for( k = 0; k < _ksize; k++, src += cn )
1464             {
1465                 f = _mm_load_ss(_kx+k);
1466                 f = _mm_shuffle_ps(f, f, 0);
1467
1468                 x0 = _mm_loadu_ps(src);
1469                 x1 = _mm_loadu_ps(src + 4);
1470                 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1471                 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1472             }
1473             _mm_store_ps(dst + i, s0);
1474             _mm_store_ps(dst + i + 4, s1);
1475         }
1476         return i;
1477     }
1478
1479     Mat kernel;
1480 };
1481
1482
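/* Row convolution specialized for small (3- and 5-tap) symmetric or
   asymmetric float kernels. The common derivative/smoothing coefficient
   sets ([1 2 1], [1 -2 1], [-1 0 1], [1 0 -2 0 1]) get multiplication-free
   paths built from adds and subtracts; anything else falls back to
   broadcast multiplies. */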
1483 struct SymmRowSmallVec_32f
1484 {
1485     SymmRowSmallVec_32f() {}
1486     SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
1487     {
1488         kernel = _kernel;
1489         symmetryType = _symmetryType;
1490     }
1491
1492     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
1493     {
1494         if( !checkHardwareSupport(CV_CPU_SSE) )
1495             return 0;
1496         
1497         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
1498         float* dst = (float*)_dst;
1499         const float* src = (const float*)_src + (_ksize/2)*cn;
1500         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1501         const float* kx = (const float*)kernel.data + _ksize/2;
1502         width *= cn;
1503
1504         if( symmetrical )
1505         {
1506             if( _ksize == 1 )
1507                 return 0;
1508             if( _ksize == 3 )
1509             {
1510                 if( kx[0] == 2 && kx[1] == 1 )
1511                     for( ; i <= width - 8; i += 8, src += 8 )
1512                     {
1513                         __m128 x0, x1, x2, y0, y1, y2;
1514                         x0 = _mm_loadu_ps(src - cn);
1515                         x1 = _mm_loadu_ps(src);
1516                         x2 = _mm_loadu_ps(src + cn);
1517                         y0 = _mm_loadu_ps(src - cn + 4);
1518                         y1 = _mm_loadu_ps(src + 4);
1519                         y2 = _mm_loadu_ps(src + cn + 4);
1520                         x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
1521                         y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
1522                         _mm_store_ps(dst + i, x0);
1523                         _mm_store_ps(dst + i + 4, y0);
1524                     }
1525                 else if( kx[0] == -2 && kx[1] == 1 )
1526                     for( ; i <= width - 8; i += 8, src += 8 )
1527                     {
1528                         __m128 x0, x1, x2, y0, y1, y2;
1529                         x0 = _mm_loadu_ps(src - cn);
1530                         x1 = _mm_loadu_ps(src);
1531                         x2 = _mm_loadu_ps(src + cn);
1532                         y0 = _mm_loadu_ps(src - cn + 4);
1533                         y1 = _mm_loadu_ps(src + 4);
1534                         y2 = _mm_loadu_ps(src + cn + 4);
1535                         x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
1536                         y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
1537                         _mm_store_ps(dst + i, x0);
1538                         _mm_store_ps(dst + i + 4, y0);
1539                     }
1540                 else
1541                 {
1542                     __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
1543                     for( ; i <= width - 8; i += 8, src += 8 )
1544                     {
1545                         __m128 x0, x1, x2, y0, y1, y2;
1546                         x0 = _mm_loadu_ps(src - cn);
1547                         x1 = _mm_loadu_ps(src);
1548                         x2 = _mm_loadu_ps(src + cn);
1549                         y0 = _mm_loadu_ps(src - cn + 4);
1550                         y1 = _mm_loadu_ps(src + 4);
1551                         y2 = _mm_loadu_ps(src + cn + 4);
1552
1553                         x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
1554                         y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
1555                         x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
1556                         y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
1557                         _mm_store_ps(dst + i, x0);
1558                         _mm_store_ps(dst + i + 4, y0);
1559                     }
1560                 }
1561             }
1562             else if( _ksize == 5 )
1563             {
1564                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
1565                     for( ; i <= width - 8; i += 8, src += 8 )
1566                     {
1567                         __m128 x0, x1, x2, y0, y1, y2;
1568                         x0 = _mm_loadu_ps(src - cn*2);
1569                         x1 = _mm_loadu_ps(src);
1570                         x2 = _mm_loadu_ps(src + cn*2);
1571                         y0 = _mm_loadu_ps(src - cn*2 + 4);
1572                         y1 = _mm_loadu_ps(src + 4);
1573                         y2 = _mm_loadu_ps(src + cn*2 + 4);
1574                         x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
1575                         y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
1576                         _mm_store_ps(dst + i, x0);
1577                         _mm_store_ps(dst + i + 4, y0);
1578                     }
1579                 else
1580                 {
1581                     __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
1582                     for( ; i <= width - 8; i += 8, src += 8 )
1583                     {
1584                         __m128 x0, x1, x2, y0, y1, y2;
1585                         x0 = _mm_loadu_ps(src - cn);
1586                         x1 = _mm_loadu_ps(src);
1587                         x2 = _mm_loadu_ps(src + cn);
1588                         y0 = _mm_loadu_ps(src - cn + 4);
1589                         y1 = _mm_loadu_ps(src + 4);
1590                         y2 = _mm_loadu_ps(src + cn + 4);
1591
1592                         x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
1593                         y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
1594                         x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
1595                         y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
1596                         
1597                         x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
1598                         y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
1599                         x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
1600                         y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
1601                         
1602                         _mm_store_ps(dst + i, x0);
1603                         _mm_store_ps(dst + i + 4, y0);
1604                     }
1605                 }
1606             }
1607         }
1608         else
1609         {
1610             if( _ksize == 3 )
1611             {
1612                 if( kx[0] == 0 && kx[1] == 1 )
1613                     for( ; i <= width - 8; i += 8, src += 8 )
1614                     {
1615                         __m128 x0, x2, y0, y2;
1616                         x0 = _mm_loadu_ps(src + cn);
1617                         x2 = _mm_loadu_ps(src - cn);
1618                         y0 = _mm_loadu_ps(src + cn + 4);
1619                         y2 = _mm_loadu_ps(src - cn + 4);
1620                         x0 = _mm_sub_ps(x0, x2);
1621                         y0 = _mm_sub_ps(y0, y2);
1622                         _mm_store_ps(dst + i, x0);
1623                         _mm_store_ps(dst + i + 4, y0);
1624                     }
1625                 else
1626                 {
1627                     __m128 k1 = _mm_set1_ps(kx[1]);
1628                     for( ; i <= width - 8; i += 8, src += 8 )
1629                     {
1630                         __m128 x0, x2, y0, y2;
1631                         x0 = _mm_loadu_ps(src + cn);
1632                         x2 = _mm_loadu_ps(src - cn);
1633                         y0 = _mm_loadu_ps(src + cn + 4);
1634                         y2 = _mm_loadu_ps(src - cn + 4);
1635
1636                         x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
1637                         y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
1638                         _mm_store_ps(dst + i, x0);
1639                         _mm_store_ps(dst + i + 4, y0);
1640                     }
1641                 }
1642             }
1643             else if( _ksize == 5 )
1644             {
1645                 __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
1646                 for( ; i <= width - 8; i += 8, src += 8 )
1647                 {
1648                     __m128 x0, x2, y0, y2;
1649                     x0 = _mm_loadu_ps(src + cn);
1650                     x2 = _mm_loadu_ps(src - cn);
1651                     y0 = _mm_loadu_ps(src + cn + 4);
1652                     y2 = _mm_loadu_ps(src - cn + 4);
1653
1654                     x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
1655                     y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
1656                     
1657                     x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
1658                     y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
1659                     x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
1660                     y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
1661                     
1662                     _mm_store_ps(dst + i, x0);
1663                     _mm_store_ps(dst + i + 4, y0);
1664                 }
1665             }
1666         }
1667
1668         return i;
1669     }
1670
1671     Mat kernel;
1672     int symmetryType;
1673 };
1674
1675
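/* Float-to-float column convolution. Same +k/-k row-pairing scheme as
   SymmColumnVec_32f16s above, but the sums (including the constant 'delta'
   term) are stored directly as floats, 16 elements per main iteration and
   4 per tail iteration. */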
1676 struct SymmColumnVec_32f
1677 {
1678     SymmColumnVec_32f() { symmetryType=0; }
1679     SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
1680     {
1681         symmetryType = _symmetryType;
1682         kernel = _kernel;
1683         delta = (float)_delta;
1684         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
1685     }
1686
1687     int operator()(const uchar** _src, uchar* _dst, int width) const
1688     {
1689         if( !checkHardwareSupport(CV_CPU_SSE) )
1690             return 0;
1691         
1692         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
1693         const float* ky = (const float*)kernel.data + ksize2;
1694         int i = 0, k;
1695         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1696         const float** src = (const float**)_src;
1697         const float *S, *S2;
1698         float* dst = (float*)_dst;
1699         __m128 d4 = _mm_set1_ps(delta);
1700
1701         if( symmetrical )
1702         {
1703             for( ; i <= width - 16; i += 16 )
1704             {
1705                 __m128 f = _mm_load_ss(ky);
1706                 f = _mm_shuffle_ps(f, f, 0);
1707                 __m128 s0, s1, s2, s3;
1708                 __m128 x0, x1;
1709                 S = src[0] + i;
1710                 s0 = _mm_load_ps(S);
1711                 s1 = _mm_load_ps(S+4);
1712                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
1713                 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
1714                 s2 = _mm_load_ps(S+8);
1715                 s3 = _mm_load_ps(S+12);
1716                 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
1717                 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
1718
1719                 for( k = 1; k <= ksize2; k++ )
1720                 {
1721                     S = src[k] + i;
1722                     S2 = src[-k] + i;
1723                     f = _mm_load_ss(ky+k);
1724                     f = _mm_shuffle_ps(f, f, 0);
1725                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
1726                     x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
1727                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1728                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1729                     x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
1730                     x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
1731                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
1732                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
1733                 }
1734
1735                 _mm_storeu_ps(dst + i, s0);
1736                 _mm_storeu_ps(dst + i + 4, s1);
1737                 _mm_storeu_ps(dst + i + 8, s2);
1738                 _mm_storeu_ps(dst + i + 12, s3);
1739             }
1740
1741             for( ; i <= width - 4; i += 4 )
1742             {
1743                 __m128 f = _mm_load_ss(ky);
1744                 f = _mm_shuffle_ps(f, f, 0);
1745                 __m128 x0, s0 = _mm_load_ps(src[0] + i);
1746                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
1747
1748                 for( k = 1; k <= ksize2; k++ )
1749                 {
1750                     f = _mm_load_ss(ky+k);
1751                     f = _mm_shuffle_ps(f, f, 0);
1752                     S = src[k] + i;
1753                     S2 = src[-k] + i;
1754                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
1755                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1756                 }
1757
1758                 _mm_storeu_ps(dst + i, s0);
1759             }
1760         }
1761         else
1762         {
1763             for( ; i <= width - 16; i += 16 )
1764             {
1765                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1766                 __m128 x0, x1;
1768
1769                 for( k = 1; k <= ksize2; k++ )
1770                 {
1771                     S = src[k] + i;
1772                     S2 = src[-k] + i;
1773                     f = _mm_load_ss(ky+k);
1774                     f = _mm_shuffle_ps(f, f, 0);
1775                     x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
1776                     x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
1777                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1778                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1779                     x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
1780                     x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
1781                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
1782                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
1783                 }
1784
1785                 _mm_storeu_ps(dst + i, s0);
1786                 _mm_storeu_ps(dst + i + 4, s1);
1787                 _mm_storeu_ps(dst + i + 8, s2);
1788                 _mm_storeu_ps(dst + i + 12, s3);
1789             }
1790
1791             for( ; i <= width - 4; i += 4 )
1792             {
1793                 __m128 f, x0, s0 = d4;
1794
1795                 for( k = 1; k <= ksize2; k++ )
1796                 {
1797                     f = _mm_load_ss(ky+k);
1798                     f = _mm_shuffle_ps(f, f, 0);
1799                     x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
1800                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1801                 }
1802
1803                 _mm_storeu_ps(dst + i, s0);
1804             }
1805         }
1806
1807         return i;
1808     }
1809
1810     int symmetryType;
1811     float delta;
1812     Mat kernel;
1813 };
1814
1815
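/* 3-tap column convolution over float rows: S0/S1/S2 point at the rows
   above, at and below the destination row. The [1 2 1], [1 -2 1] and
   [-1 0 1] kernels are handled without multiplications. */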
1816 struct SymmColumnSmallVec_32f
1817 {
1818     SymmColumnSmallVec_32f() { symmetryType=0; }
1819     SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
1820     {
1821         symmetryType = _symmetryType;
1822         kernel = _kernel;
1823         delta = (float)_delta;
1824         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
1825     }
1826
1827     int operator()(const uchar** _src, uchar* _dst, int width) const
1828     {
1829         if( !checkHardwareSupport(CV_CPU_SSE) )
1830             return 0;
1831         
1832         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
1833         const float* ky = (const float*)kernel.data + ksize2;
1834         int i = 0;
1835         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1836         const float** src = (const float**)_src;
1837         const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
1838         float* dst = (float*)_dst;
1839         __m128 d4 = _mm_set1_ps(delta);
1840
1841         if( symmetrical )
1842         {
1843             if( ky[0] == 2 && ky[1] == 1 )
1844             {
1845                 for( ; i <= width - 8; i += 8 )
1846                 {
1847                     __m128 s0, s1, s2, s3, s4, s5;
1848                     s0 = _mm_load_ps(S0 + i);
1849                     s1 = _mm_load_ps(S0 + i + 4);
1850                     s2 = _mm_load_ps(S1 + i);
1851                     s3 = _mm_load_ps(S1 + i + 4);
1852                     s4 = _mm_load_ps(S2 + i);
1853                     s5 = _mm_load_ps(S2 + i + 4);
1854                     s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
1855                     s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
1856                     s0 = _mm_add_ps(s0, d4);
1857                     s1 = _mm_add_ps(s1, d4);
1858                     _mm_storeu_ps(dst + i, s0);
1859                     _mm_storeu_ps(dst + i + 4, s1);
1860                 }
1861             }
1862             else if( ky[0] == -2 && ky[1] == 1 )
1863             {
1864                 for( ; i <= width - 8; i += 8 )
1865                 {
1866                     __m128 s0, s1, s2, s3, s4, s5;
1867                     s0 = _mm_load_ps(S0 + i);
1868                     s1 = _mm_load_ps(S0 + i + 4);
1869                     s2 = _mm_load_ps(S1 + i);
1870                     s3 = _mm_load_ps(S1 + i + 4);
1871                     s4 = _mm_load_ps(S2 + i);
1872                     s5 = _mm_load_ps(S2 + i + 4);
1873                     s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
1874                     s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
1875                     s0 = _mm_add_ps(s0, d4);
1876                     s1 = _mm_add_ps(s1, d4);
1877                     _mm_storeu_ps(dst + i, s0);
1878                     _mm_storeu_ps(dst + i + 4, s1);
1879                 }
1880             }
1881             else
1882             {
1883                 __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
1884                 for( ; i <= width - 8; i += 8 )
1885                 {
1886                     __m128 s0, s1, x0, x1;
1887                     s0 = _mm_load_ps(S1 + i);
1888                     s1 = _mm_load_ps(S1 + i + 4);
1889                     s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
1890                     s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
1891                     x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
1892                     x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
1893                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
1894                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
1895                     _mm_storeu_ps(dst + i, s0);
1896                     _mm_storeu_ps(dst + i + 4, s1);
1897                 }
1898             }
1899         }
1900         else
1901         {
1902             if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
1903             {
1904                 if( ky[1] < 0 )
1905                     std::swap(S0, S2);
1906                 for( ; i <= width - 8; i += 8 )
1907                 {
1908                     __m128 s0, s1, s2, s3;
1909                     s0 = _mm_load_ps(S2 + i);
1910                     s1 = _mm_load_ps(S2 + i + 4);
1911                     s2 = _mm_load_ps(S0 + i);
1912                     s3 = _mm_load_ps(S0 + i + 4);
1913                     s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
1914                     s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
1915                     _mm_storeu_ps(dst + i, s0);
1916                     _mm_storeu_ps(dst + i + 4, s1);
1917                 }
1918             }
1919             else
1920             {
1921                 __m128 k1 = _mm_set1_ps(ky[1]);
1922                 for( ; i <= width - 8; i += 8 )
1923                 {
1924                     __m128 s0 = d4, s1 = d4, x0, x1;
1925                     x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
1926                     x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
1927                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
1928                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
1929                     _mm_storeu_ps(dst + i, s0);
1930                     _mm_storeu_ps(dst + i + 4, s1);
1931                 }
1932             }
1933         }
1934
1935         return i;
1936     }
1937
1938     int symmetryType;
1939     float delta;
1940     Mat kernel;
1941 };
1942
1943
1944 /////////////////////////////// non-separable filters ///////////////////////////////
1945
1946 ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
1947
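/* Non-separable 2D filtering. preprocess2DKernel() flattens the kernel into
   its non-zero taps: 'coords' receives each tap's position inside the kernel
   window and 'coeffs' the corresponding float weights (here pre-scaled by
   2^-bits so the fixed-point kernel can be applied in floating point). The
   caller is expected to pass src[k] already shifted to tap k's pixel, which
   reduces the 2D convolution to a flat dot product over 'nz' taps. */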
1948 struct FilterVec_8u
1949 {
1950     FilterVec_8u() {}
1951     FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
1952     {
1953         Mat kernel;
1954         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
1955         delta = (float)(_delta/(1 << _bits));
1956         vector<Point> coords;
1957         preprocess2DKernel(kernel, coords, coeffs);
1958         _nz = (int)coords.size();
1959     }
1960
1961     int operator()(const uchar** src, uchar* dst, int width) const
1962     {
1963         if( !checkHardwareSupport(CV_CPU_SSE2) )
1964             return 0;
1965         
1966         const float* kf = (const float*)&coeffs[0];
1967         int i = 0, k, nz = _nz;
1968         __m128 d4 = _mm_set1_ps(delta);
1969
1970         for( ; i <= width - 16; i += 16 )
1971         {
1972             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1973             __m128i x0, x1, z = _mm_setzero_si128();
1974
1975             for( k = 0; k < nz; k++ )
1976             {
1977                 __m128 f = _mm_load_ss(kf+k), t0, t1;
1978                 f = _mm_shuffle_ps(f, f, 0);
1979
1980                 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
1981                 x1 = _mm_unpackhi_epi8(x0, z);
1982                 x0 = _mm_unpacklo_epi8(x0, z);
1983
1984                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
1985                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
1986                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1987                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
1988
1989                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
1990                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
1991                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
1992                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
1993             }
1994
1995             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1996             x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
1997             x0 = _mm_packus_epi16(x0, x1);
1998             _mm_storeu_si128((__m128i*)(dst + i), x0);
1999         }
2000
2001         for( ; i <= width - 4; i += 4 )
2002         {
2003             __m128 s0 = d4;
2004             __m128i x0, z = _mm_setzero_si128();
2005
2006             for( k = 0; k < nz; k++ )
2007             {
2008                 __m128 f = _mm_load_ss(kf+k), t0;
2009                 f = _mm_shuffle_ps(f, f, 0);
2010
2011                 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
2012                 x0 = _mm_unpacklo_epi8(x0, z);
2013                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
2014                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
2015             }
2016
2017             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
2018             x0 = _mm_packus_epi16(x0, x0);
2019             *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
2020         }
2021
2022         return i;
2023     }
2024
2025     int _nz;
2026     vector<uchar> coeffs;
2027     float delta;
2028 };
2029
2030
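/* Same scheme as FilterVec_8u, but the rounded int32 sums are packed once,
   to signed 16-bit output, instead of twice down to 8 bits. */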
2031 struct FilterVec_8u16s
2032 {
2033     FilterVec_8u16s() {}
2034     FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
2035     {
2036         Mat kernel;
2037         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
2038         delta = (float)(_delta/(1 << _bits));
2039         vector<Point> coords;
2040         preprocess2DKernel(kernel, coords, coeffs);
2041         _nz = (int)coords.size();
2042     }
2043
2044     int operator()(const uchar** src, uchar* _dst, int width) const
2045     {
2046         if( !checkHardwareSupport(CV_CPU_SSE2) )
2047             return 0;
2048         
2049         const float* kf = (const float*)&coeffs[0];
2050         short* dst = (short*)_dst;
2051         int i = 0, k, nz = _nz;
2052         __m128 d4 = _mm_set1_ps(delta);
2053
2054         for( ; i <= width - 16; i += 16 )
2055         {
2056             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
2057             __m128i x0, x1, z = _mm_setzero_si128();
2058
2059             for( k = 0; k < nz; k++ )
2060             {
2061                 __m128 f = _mm_load_ss(kf+k), t0, t1;
2062                 f = _mm_shuffle_ps(f, f, 0);
2063
2064                 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
2065                 x1 = _mm_unpackhi_epi8(x0, z);
2066                 x0 = _mm_unpacklo_epi8(x0, z);
2067
2068                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
2069                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
2070                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
2071                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
2072
2073                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
2074                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
2075                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
2076                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
2077             }
2078
2079             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
2080             x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
2081             _mm_storeu_si128((__m128i*)(dst + i), x0);
2082             _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
2083         }
2084
2085         for( ; i <= width - 4; i += 4 )
2086         {
2087             __m128 s0 = d4;
2088             __m128i x0, z = _mm_setzero_si128();
2089
2090             for( k = 0; k < nz; k++ )
2091             {
2092                 __m128 f = _mm_load_ss(kf+k), t0;
2093                 f = _mm_shuffle_ps(f, f, 0);
2094
2095                 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
2096                 x0 = _mm_unpacklo_epi8(x0, z);
2097                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
2098                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
2099             }
2100
2101             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
2102             _mm_storel_epi64((__m128i*)(dst + i), x0);
2103         }
2104
2105         return i;
2106     }
2107
2108     int _nz;
2109     vector<uchar> coeffs;
2110     float delta;
2111 };
2112
2113
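/* Float variant of the flattened 2D convolution: no fixed-point scaling,
   'delta' is added in full precision and the results are stored as floats. */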
2114 struct FilterVec_32f
2115 {
2116     FilterVec_32f() {}
2117     FilterVec_32f(const Mat& _kernel, int, double _delta)
2118     {
2119         delta = (float)_delta;
2120         vector<Point> coords;
2121         preprocess2DKernel(_kernel, coords, coeffs);
2122         _nz = (int)coords.size();
2123     }
2124
2125     int operator()(const uchar** _src, uchar* _dst, int width) const
2126     {
2127         if( !checkHardwareSupport(CV_CPU_SSE) )
2128             return 0;
2129         
2130         const float* kf = (const float*)&coeffs[0];
2131         const float** src = (const float**)_src;
2132         float* dst = (float*)_dst;
2133         int i = 0, k, nz = _nz;
2134         __m128 d4 = _mm_set1_ps(delta);
2135
2136         for( ; i <= width - 16; i += 16 )
2137         {
2138             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
2139
2140             for( k = 0; k < nz; k++ )
2141             {
2142                 __m128 f = _mm_load_ss(kf+k), t0, t1;
2143                 f = _mm_shuffle_ps(f, f, 0);
2144                 const float* S = src[k] + i;
2145
2146                 t0 = _mm_loadu_ps(S);
2147                 t1 = _mm_loadu_ps(S + 4);
2148                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
2149                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
2150
2151                 t0 = _mm_loadu_ps(S + 8);
2152                 t1 = _mm_loadu_ps(S + 12);
2153                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
2154                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
2155             }
2156
2157             _mm_storeu_ps(dst + i, s0);
2158             _mm_storeu_ps(dst + i + 4, s1);
2159             _mm_storeu_ps(dst + i + 8, s2);
2160             _mm_storeu_ps(dst + i + 12, s3);
2161         }
2162
2163         for( ; i <= width - 4; i += 4 )
2164         {
2165             __m128 s0 = d4;
2166
2167             for( k = 0; k < nz; k++ )
2168             {
2169                 __m128 f = _mm_load_ss(kf+k), t0;
2170                 f = _mm_shuffle_ps(f, f, 0);
2171                 t0 = _mm_loadu_ps(src[k] + i);
2172                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
2173             }
2174             _mm_storeu_ps(dst + i, s0);
2175         }
2176
2177         return i;
2178     }
2179
2180     int _nz;
2181     vector<uchar> coeffs;
2182     float delta;
2183 };
2184
2185
2186 #else
2187
2188 typedef RowNoVec RowVec_8u32s;
2189 typedef RowNoVec RowVec_16s32f;
2190 typedef RowNoVec RowVec_32f;
2191 typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
2192 typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
2193 typedef ColumnNoVec SymmColumnVec_32s8u;
2194 typedef ColumnNoVec SymmColumnVec_32f16s;
2195 typedef ColumnNoVec SymmColumnVec_32f;
2196 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
2197 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
2198 typedef FilterNoVec FilterVec_8u;
2199 typedef FilterNoVec FilterVec_8u16s;
2200 typedef FilterNoVec FilterVec_32f;
2201
2202 #endif
2203
2204
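/* Generic row filter: computes D[i] = sum_k kx[k]*S[i + k*cn] for each
   output element. vecOp (one of the SIMD structs above, or RowNoVec) handles
   the leading vector-sized portion and returns how far it got; the scalar
   loops below, optionally 4-way unrolled, complete the row. */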
2205 template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
2206 {
2207     RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
2208     {
2209         if( _kernel.isContinuous() )
2210             kernel = _kernel;
2211         else
2212             _kernel.copyTo(kernel);
2213         anchor = _anchor;
2214         ksize = kernel.rows + kernel.cols - 1;
2215         CV_Assert( kernel.type() == DataType<DT>::type &&
2216                    (kernel.rows == 1 || kernel.cols == 1));
2217         vecOp = _vecOp;
2218     }
2219     
2220     void operator()(const uchar* src, uchar* dst, int width, int cn)
2221     {
2222         int _ksize = ksize;
2223         const DT* kx = (const DT*)kernel.data;
2224         const ST* S;
2225         DT* D = (DT*)dst;
2226         int i, k;
2227
2228         i = vecOp(src, dst, width, cn);
2229         width *= cn;
2230         #if CV_ENABLE_UNROLLED
2231         for( ; i <= width - 4; i += 4 )
2232         {
2233             S = (const ST*)src + i;
2234             DT f = kx[0];
2235             DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];
2236
2237             for( k = 1; k < _ksize; k++ )
2238             {
2239                 S += cn;
2240                 f = kx[k];
2241                 s0 += f*S[0]; s1 += f*S[1];
2242                 s2 += f*S[2]; s3 += f*S[3];
2243             }
2244             
2245             D[i] = s0; D[i+1] = s1;
2246             D[i+2] = s2; D[i+3] = s3;
2247         }
2248         #endif
2249         for( ; i < width; i++ )
2250         {
2251             S = (const ST*)src + i;
2252             DT s0 = kx[0]*S[0];
2253             for( k = 1; k < _ksize; k++ )
2254             {
2255                 S += cn;
2256                 s0 += kx[k]*S[0];
2257             }
2258             D[i] = s0;
2259         }
2260     }
2261
2262     Mat kernel;
2263     VecOp vecOp;
2264 };
2265
2266
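/* Row filter specialization for symmetric/asymmetric kernels of at most
   5 taps. kx points at the central tap, so the mirrored taps fold into
   s0 = kx[0]*S[0] + sum_k kx[k]*(S[k*cn] +/- S[-k*cn]), with dedicated
   branches for the hard-coded 3- and 5-tap coefficient sets. */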
2267 template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
2268     public RowFilter<ST, DT, VecOp>
2269 {
2270     SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
2271                         const VecOp& _vecOp = VecOp())
2272         : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
2273     {
2274         symmetryType = _symmetryType;
2275         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
2276     }
2277     
2278     void operator()(const uchar* src, uchar* dst, int width, int cn)
2279     {
2280         int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
2281         const DT* kx = (const DT*)this->kernel.data + ksize2;
2282         bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
2283         DT* D = (DT*)dst;
2284         int i = this->vecOp(src, dst, width, cn), j, k;
2285         const ST* S = (const ST*)src + i + ksize2n;
2286         width *= cn;
2287
2288         if( symmetrical )
2289         {
2290             if( this->ksize == 1 && kx[0] == 1 )
2291             {
2292                 for( ; i <= width - 2; i += 2 )
2293                 {
2294                     DT s0 = S[i], s1 = S[i+1];
2295                     D[i] = s0; D[i+1] = s1;
2296                 }
2297                 S += i;
2298             }
2299             else if( this->ksize == 3 )
2300             {
2301                 if( kx[0] == 2 && kx[1] == 1 )
2302                     for( ; i <= width - 2; i += 2, S += 2 )
2303                     {
2304                         DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
2305                         D[i] = s0; D[i+1] = s1;
2306                     }
2307                 else if( kx[0] == -2 && kx[1] == 1 )
2308                     for( ; i <= width - 2; i += 2, S += 2 )
2309                     {
2310                         DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
2311                         D[i] = s0; D[i+1] = s1;
2312                     }
2313                 else
2314                 {
2315                     DT k0 = kx[0], k1 = kx[1];
2316                     for( ; i <= width - 2; i += 2, S += 2 )
2317                     {
2318                         DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
2319                         D[i] = s0; D[i+1] = s1;
2320                     }
2321                 }
2322             }
2323             else if( this->ksize == 5 )
2324             {
2325                 DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
2326                 if( k0 == -2 && k1 == 0 && k2 == 1 )
2327                     for( ; i <= width - 2; i += 2, S += 2 )
2328                     {
2329                         DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
2330                         DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
2331                         D[i] = s0; D[i+1] = s1;
2332                     }
2333                 else
2334                     for( ; i <= width - 2; i += 2, S += 2 )
2335                     {
2336                         DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
2337                         DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
2338                         D[i] = s0; D[i+1] = s1;
2339                     }
2340             }
2341
2342             for( ; i < width; i++, S++ )
2343             {
2344                 DT s0 = kx[0]*S[0];
2345                 for( k = 1, j = cn; k <= ksize2; k++, j += cn )
2346                     s0 += kx[k]*(S[j] + S[-j]);
2347                 D[i] = s0;
2348             }
2349         }
2350         else
2351         {
2352             if( this->ksize == 3 )
2353             {
2354                 if( kx[0] == 0 && kx[1] == 1 )
2355                     for( ; i <= width - 2; i += 2, S += 2 )
2356                     {
2357                         DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
2358                         D[i] = s0; D[i+1] = s1;
2359                     }
2360                 else
2361                 {
2362                     DT k1 = kx[1];
2363                     for( ; i <= width - 2; i += 2, S += 2 )
2364                     {
2365                         DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
2366                         D[i] = s0; D[i+1] = s1;
2367                     }
2368                 }
2369             }
2370             else if( this->ksize == 5 )
2371             {
2372                 DT k1 = kx[1], k2 = kx[2];
2373                 for( ; i <= width - 2; i += 2, S += 2 )
2374                 {
2375                     DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
2376                     DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
2377                     D[i] = s0; D[i+1] = s1;
2378                 }
2379             }
2380
2381             for( ; i < width; i++, S++ )
2382             {
2383                 DT s0 = kx[0]*S[0];
2384                 for( k = 1, j = cn; k <= ksize2; k++, j += cn )
2385                     s0 += kx[k]*(S[j] - S[-j]);
2386                 D[i] = s0;
2387             }
2388         }
2389     }
2390
2391     int symmetryType;
2392 };
2393
2394
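/* Generic column filter: 'src' is an array of pointers to 'ksize'
   consecutive source rows. Each destination row element is
   D[i] = castOp(delta + sum_k ky[k]*src[k][i]), where CastOp performs the
   saturating (and, for integer pipelines, fixed-point descaling) conversion
   to the destination type. */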
2395 template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
2396 {
2397     typedef typename CastOp::type1 ST;
2398     typedef typename CastOp::rtype DT;
2399     
2400     ColumnFilter( const Mat& _kernel, int _anchor,
2401         double _delta, const CastOp& _castOp=CastOp(),
2402         const VecOp& _vecOp=VecOp() )
2403     {
2404         if( _kernel.isContinuous() )
2405             kernel = _kernel;
2406         else
2407             _kernel.copyTo(kernel);
2408         anchor = _anchor;
2409         ksize = kernel.rows + kernel.cols - 1;
2410         delta = saturate_cast<ST>(_delta);
2411         castOp0 = _castOp;
2412         vecOp = _vecOp;
2413         CV_Assert( kernel.type() == DataType<ST>::type &&
2414                    (kernel.rows == 1 || kernel.cols == 1));
2415     }
2416
2417     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2418     {
2419         const ST* ky = (const ST*)kernel.data;
2420         ST _delta = delta;
2421         int _ksize = ksize;
2422         int i, k;
2423         CastOp castOp = castOp0;
2424
2425         for( ; count--; dst += dststep, src++ )
2426         {
2427             DT* D = (DT*)dst;
2428             i = vecOp(src, dst, width);
2429             #if CV_ENABLE_UNROLLED
2430             for( ; i <= width - 4; i += 4 )
2431             {
2432                 ST f = ky[0];
2433                 const ST* S = (const ST*)src[0] + i;
2434                 ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
2435                     s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
2436
2437                 for( k = 1; k < _ksize; k++ )
2438                 {
2439                     S = (const ST*)src[k] + i; f = ky[k];
2440                     s0 += f*S[0]; s1 += f*S[1];
2441                     s2 += f*S[2]; s3 += f*S[3];
2442                 }
2443
2444                 D[i] = castOp(s0); D[i+1] = castOp(s1);
2445                 D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2446             }
2447             #endif
2448             for( ; i < width; i++ )
2449             {
2450                 ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
2451                 for( k = 1; k < _ksize; k++ )
2452                     s0 += ky[k]*((const ST*)src[k])[i];
2453                 D[i] = castOp(s0);
2454             }
2455         }
2456     }
2457
2458     Mat kernel;
2459     CastOp castOp0;
2460     VecOp vecOp;
2461     ST delta;
2462 };
2463
2464
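/* Column filter exploiting kernel symmetry. With ky pointing at the central
   tap, the symmetrical case computes delta + ky[0]*src[0][i] +
   sum_k ky[k]*(src[k][i] + src[-k][i]) and the asymmetrical case
   delta + sum_k ky[k]*(src[k][i] - src[-k][i]), halving the multiplications. */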
2465 template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
2466 {
2467     typedef typename CastOp::type1 ST;
2468     typedef typename CastOp::rtype DT;
2469
2470     SymmColumnFilter( const Mat& _kernel, int _anchor,
2471         double _delta, int _symmetryType,
2472         const CastOp& _castOp=CastOp(),
2473         const VecOp& _vecOp=VecOp())
2474         : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
2475     {
2476         symmetryType = _symmetryType;
2477         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
2478     }
2479
2480     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2481     {
2482         int ksize2 = this->ksize/2;
2483         const ST* ky = (const ST*)this->kernel.data + ksize2;
2484         int i, k;
2485         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
2486         ST _delta = this->delta;
2487         CastOp castOp = this->castOp0;
2488         src += ksize2;
2489
2490         if( symmetrical )
2491         {
2492             for( ; count--; dst += dststep, src++ )
2493             {
2494                 DT* D = (DT*)dst;
2495                 i = (this->vecOp)(src, dst, width);
2496                 #if CV_ENABLE_UNROLLED
2497                 for( ; i <= width - 4; i += 4 )
2498                 {
2499                     ST f = ky[0];
2500                     const ST* S = (const ST*)src[0] + i, *S2;
2501                     ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
2502                         s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
2503
2504                     for( k = 1; k <= ksize2; k++ )
2505                     {
2506                         S = (const ST*)src[k] + i;
2507                         S2 = (const ST*)src[-k] + i;
2508                         f = ky[k];
2509                         s0 += f*(S[0] + S2[0]);
2510                         s1 += f*(S[1] + S2[1]);
2511                         s2 += f*(S[2] + S2[2]);
2512                         s3 += f*(S[3] + S2[3]);
2513                     }
2514
2515                     D[i] = castOp(s0); D[i+1] = castOp(s1);
2516                     D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2517                 }
2518                 #endif
2519                 for( ; i < width; i++ )
2520                 {
2521                     ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
2522                     for( k = 1; k <= ksize2; k++ )
2523                         s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
2524                     D[i] = castOp(s0);
2525                 }
2526             }
2527         }
2528         else
2529         {
2530             for( ; count--; dst += dststep, src++ )
2531             {
2532                 DT* D = (DT*)dst;
2533                 i = this->vecOp(src, dst, width);
2534                 #if CV_ENABLE_UNROLLED
2535                 for( ; i <= width - 4; i += 4 )
2536                 {
2537                     ST f = ky[0];
2538                     const ST *S, *S2;
2539                     ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
2540
2541                     for( k = 1; k <= ksize2; k++ )
2542                     {
2543                         S = (const ST*)src[k] + i;
2544                         S2 = (const ST*)src[-k] + i;
2545                         f = ky[k];
2546                         s0 += f*(S[0] - S2[0]);
2547                         s1 += f*(S[1] - S2[1]);
2548                         s2 += f*(S[2] - S2[2]);
2549                         s3 += f*(S[3] - S2[3]);
2550                     }
2551
2552                     D[i] = castOp(s0); D[i+1] = castOp(s1);
2553                     D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2554                 }
2555                 #endif
2556                 for( ; i < width; i++ )
2557                 {
2558                     ST s0 = _delta;
2559                     for( k = 1; k <= ksize2; k++ )
2560                         s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
2561                     D[i] = castOp(s0);
2562                 }
2563             }
2564         }
2565     }
2566
2567     int symmetryType;
2568 };
2569
2570
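/* 3-tap column filter with multiplication-free fast paths for the [1;2;1],
   [1;-2;1] and [-1;0;1] kernels used by Sobel- and Laplacian-style
   derivative operators. */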
2571 template<class CastOp, class VecOp>
2572 struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
2573 {
2574     typedef typename CastOp::type1 ST;
2575     typedef typename CastOp::rtype DT;
2576     
2577     SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
2578                            double _delta, int _symmetryType,
2579                            const CastOp& _castOp=CastOp(),
2580                            const VecOp& _vecOp=VecOp())
2581         : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
2582     {
2583         CV_Assert( this->ksize == 3 );
2584     }
2585
2586     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2587     {
2588         int ksize2 = this->ksize/2;
2589         const ST* ky = (const ST*)this->kernel.data + ksize2;
2590         int i;
2591         bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
2592         bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;    // ky points at the central tap,
2593         bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;  // matching the fast paths below
2594         bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1);
2595         ST f0 = ky[0], f1 = ky[1];
2596         ST _delta = this->delta;
2597         CastOp castOp = this->castOp0;
2598         src += ksize2;
2599
2600         for( ; count--; dst += dststep, src++ )
2601         {
2602             DT* D = (DT*)dst;
2603             i = (this->vecOp)(src, dst, width);
2604             const ST* S0 = (const ST*)src[-1];
2605             const ST* S1 = (const ST*)src[0];
2606             const ST* S2 = (const ST*)src[1];
2607
2608             if( symmetrical )
2609             {
2610                 if( is_1_2_1 )
2611                 {
2612                     #if CV_ENABLE_UNROLLED
2613                     for( ; i <= width - 4; i += 4 )
2614                     {
2615                         ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
2616                         ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
2617                         D[i] = castOp(s0);
2618                         D[i+1] = castOp(s1);
2619
2620                         s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
2621                         s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
2622                         D[i+2] = castOp(s0);
2623                         D[i+3] = castOp(s1);
2624                     }
2625                     #else
2626                     for( ; i < width; i++ )
2627                     {
2628                         ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
2629                         D[i] = castOp(s0);
2630                     }
2631                     #endif
2632                 }
2633                 else if( is_1_m2_1 )
2634                 {
2635                     #if CV_ENABLE_UNROLLED
2636                     for( ; i <= width - 4; i += 4 )
2637                     {
2638                         ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
2639                         ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
2640                         D[i] = castOp(s0);
2641                         D[i+1] = castOp(s1);
2642
2643                         s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
2644                         s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
2645                         D[i+2] = castOp(s0);
2646                         D[i+3] = castOp(s1);
2647                     }
2648                     #else
2649                     for( ; i < width; i++ )
2650                     {
2651                         ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
2652                         D[i] = castOp(s0);
2653                     }
2654                     #endif
2655                 }
2656                 else
2657                 {
2658                    #if CV_ENABLE_UNROLLED
2659                     for( ; i <= width - 4; i += 4 )
2660                     {
2661                         ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
2662                         ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
2663                         D[i] = castOp(s0);
2664                         D[i+1] = castOp(s1);
2665
2666                         s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
2667                         s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
2668                         D[i+2] = castOp(s0);
2669                         D[i+3] = castOp(s1);
2670                     }
2671                     #else
2672                     for( ; i < width; i ++ )
2673                     {
2674                         ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
2675                         D[i] = castOp(s0);
2676                     }
2677                     #endif
2678                 }
2679                 for( ; i < width; i++ )
2680                     D[i] = castOp((S0[i] + S2[i])*f1 + S1[i]*f0 + _delta);
2681             }
2682             else
2683             {
2684                 if( is_m1_0_1 )
2685                 {
2686                     if( f1 < 0 )
2687                         std::swap(S0, S2);
2688                    #if CV_ENABLE_UNROLLED
2689                     for( ; i <= width - 4; i += 4 )
2690                     {
2691                         ST s0 = S2[i] - S0[i] + _delta;
2692                         ST s1 = S2[i+1] - S0[i+1] + _delta;
2693                         D[i] = castOp(s0);
2694                         D[i+1] = castOp(s1);
2695
2696                         s0 = S2[i+2] - S0[i+2] + _delta;
2697                         s1 = S2[i+3] - S0[i+3] + _delta;
2698                         D[i+2] = castOp(s0);
2699                         D[i+3] = castOp(s1);
2700                     }
2701                     #else
2702                     for( ; i < width; i++ )
2703                     {
2704                         ST s0 = S2[i] - S0[i] + _delta;
2705                         D[i] = castOp(s0);
2706                     }
2707                     #endif
2708                     if( f1 < 0 )
2709                         std::swap(S0, S2);
2710                 }
2711                 else
2712                 {
2713                    #if CV_ENABLE_UNROLLED
2714                     for( ; i <= width - 4; i += 4 )
2715                     {
2716                         ST s0 = (S2[i] - S0[i])*f1 + _delta;
2717                         ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
2718                         D[i] = castOp(s0);
2719                         D[i+1] = castOp(s1);
2720
2721                         s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
2722                         s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
2723                         D[i+2] = castOp(s0);
2724                         D[i+3] = castOp(s1);
2725                     }
2726                     #endif
2727                 }
2728
2729                 for( ; i < width; i++ )
2730                     D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
2731             }
2732         }
2733     }
2734 };
2735
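/*
 Cast helpers used by the filter templates in this file. "Cast" performs a
 plain saturating conversion from the intermediate (accumulator) type to the
 destination type.
*/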
template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

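/*
 FixedPtCast descales a fixed-point accumulator: it adds DELTA = 2^(bits-1)
 for round-to-nearest behavior and shifts right by "bits" before saturating.
 For example, with bits = 8 the value 1000 maps to (1000 + 128) >> 8 = 4.
*/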
template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA) >> SHIFT); }
};

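/*
 FixedPtCastEx is the run-time counterpart of FixedPtCast: the shift is a
 constructor argument instead of a template parameter, and bits == 0 turns
 it into a plain saturating cast (SHIFT = 0, DELTA = 0).
*/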
template<typename ST, typename DT> struct FixedPtCastEx
{
    typedef ST type1;
    typedef DT rtype;

    FixedPtCastEx() : SHIFT(0), DELTA(0) {}
    FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA) >> SHIFT); }
    int SHIFT, DELTA;
};

}

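/*
 Factory for the horizontal (row) pass of a separable filter. Symmetric or
 asymmetric kernels with ksize <= 5 get the specialized SymmRowSmallFilter;
 everything else falls back to the generic RowFilter, with vectorized
 implementations plugged in for the depth pairs that have them. Note the
 contract checked below: the buffer depth must be at least CV_32S and the
 kernel must already be stored at the buffer depth.

 A minimal usage sketch (illustrative values only; the fixed-point kernel
 conversion mirrors what createSeparableLinearFilter does internally):

     Mat kf = getGaussianKernel(5, -1, CV_32F), k;
     kf.convertTo(k, CV_32S, 1 << 8);   // fixed point, 8 fractional bits
     Ptr<BaseRowFilter> rowFilter = getLinearRowFilter(
         CV_8UC1, CV_32SC1, k, 2, KERNEL_SMOOTH + KERNEL_SYMMETRICAL);
*/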
cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
                                                   InputArray _kernel, int anchor,
                                                   int symmetryType )
{
    Mat kernel = _kernel.getMat();
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
    int cn = CV_MAT_CN(srcType);
    CV_Assert( cn == CV_MAT_CN(bufType) &&
        ddepth >= std::max(sdepth, CV_32S) &&
        kernel.type() == ddepth );
    int ksize = kernel.rows + kernel.cols - 1;

    if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
    {
        if( sdepth == CV_8U && ddepth == CV_32S )
            return Ptr<BaseRowFilter>(new SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s>
                (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)));
        if( sdepth == CV_32F && ddepth == CV_32F )
            return Ptr<BaseRowFilter>(new SymmRowSmallFilter<float, float, SymmRowSmallVec_32f>
                (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)));
    }

    if( sdepth == CV_8U && ddepth == CV_32S )
        return Ptr<BaseRowFilter>(new RowFilter<uchar, int, RowVec_8u32s>
            (kernel, anchor, RowVec_8u32s(kernel)));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<uchar, float, RowNoVec>(kernel, anchor));
    if( sdepth == CV_8U && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<uchar, double, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16U && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<ushort, float, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16U && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16S && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<short, float, RowVec_16s32f>
                                  (kernel, anchor, RowVec_16s32f(kernel)));
    if( sdepth == CV_16S && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
    if( sdepth == CV_32F && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<float, float, RowVec_32f>
            (kernel, anchor, RowVec_32f(kernel)));
    if( sdepth == CV_64F && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<double, double, RowNoVec>(kernel, anchor));

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d) and buffer format (=%d)",
        srcType, bufType));

    return Ptr<BaseRowFilter>(0);
}


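/*
 Factory for the vertical (column) pass. General (non-symmetric) kernels use
 ColumnFilter with a plain or fixed-point cast; symmetric/asymmetric kernels
 use SymmColumnFilter, with a further SymmColumnSmallFilter specialization
 for the common 3-tap case. For the CV_32S -> CV_8U path the "bits" argument
 selects the fixed-point descaling shift applied by FixedPtCastEx.
*/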
cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType,
                                             InputArray _kernel, int anchor,
                                             int symmetryType, double delta,
                                             int bits )
{
    Mat kernel = _kernel.getMat();
    int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(dstType);
    CV_Assert( cn == CV_MAT_CN(bufType) &&
        sdepth >= std::max(ddepth, CV_32S) &&
        kernel.type() == sdepth );

    if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
    {
        if( ddepth == CV_8U && sdepth == CV_32S )
            return Ptr<BaseColumnFilter>(new ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec>
            (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits)));
        if( ddepth == CV_8U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, uchar>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_8U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, uchar>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, ushort>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, ushort>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16S && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, short>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16S && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, short>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_32F && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, float>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_64F && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, double>, ColumnNoVec>(kernel, anchor, delta));
    }
    else
    {
        int ksize = kernel.rows + kernel.cols - 1;
        if( ksize == 3 )
        {
            if( ddepth == CV_8U && sdepth == CV_32S )
                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
                    FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
                    (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
                    SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
            if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<Cast<int, short>,
                    SymmColumnSmallVec_32s16s>(kernel, anchor, delta, symmetryType,
                        Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)));
            if( ddepth == CV_32F && sdepth == CV_32F )
                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
                    Cast<float, float>, SymmColumnSmallVec_32f>
                    (kernel, anchor, delta, symmetryType, Cast<float, float>(),
                    SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)));
        }
        if( ddepth == CV_8U && sdepth == CV_32S )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
                (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
                SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
        if( ddepth == CV_8U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, uchar>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_8U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, uchar>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, ushort>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, ushort>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16S && sdepth == CV_32S )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16S && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s>
                 (kernel, anchor, delta, symmetryType, Cast<float, short>(),
                  SymmColumnVec_32f16s(kernel, symmetryType, 0, delta)));
        if( ddepth == CV_16S && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_32F && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f>
                (kernel, anchor, delta, symmetryType, Cast<float, float>(),
                SymmColumnVec_32f(kernel, symmetryType, 0, delta)));
        if( ddepth == CV_64F && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, double>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
    }

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of buffer format (=%d) and destination format (=%d)",
        bufType, dstType));

    return Ptr<BaseColumnFilter>(0);
}


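/*
 Builds a complete separable filter engine from a row and a column kernel.
 For 8-bit sources the kernels may be converted to CV_32S fixed point: when
 the destination is CV_8U, each 1D kernel is scaled by 2^8, so the two
 passes together scale the result by 2^16; "bits" is doubled accordingly and
 delta is pre-scaled by the same factor so the final FixedPtCastEx shift
 brings everything back. Integer kernels targeting CV_16S are converted to
 CV_32S without scaling (bits stays 0).
*/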
cv::Ptr<cv::FilterEngine> cv::createSeparableLinearFilter(
    int _srcType, int _dstType,
    InputArray __rowKernel, InputArray __columnKernel,
    Point _anchor, double _delta,
    int _rowBorderType, int _columnBorderType,
    const Scalar& _borderValue )
{
    Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat();
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );
    int rsize = _rowKernel.rows + _rowKernel.cols - 1;
    int csize = _columnKernel.rows + _columnKernel.cols - 1;
    if( _anchor.x < 0 )
        _anchor.x = rsize/2;
    if( _anchor.y < 0 )
        _anchor.y = csize/2;
    int rtype = getKernelType(_rowKernel,
        _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
    int ctype = getKernelType(_columnKernel,
        _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
    Mat rowKernel, columnKernel;

    int bdepth = std::max(CV_32F, std::max(sdepth, ddepth));
    int bits = 0;

    if( sdepth == CV_8U &&
        ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
          ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
          ddepth == CV_8U) ||
         ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
          (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
          (rtype & ctype & KERNEL_INTEGER) &&
          ddepth == CV_16S)) )
    {
        bdepth = CV_32S;
        bits = ddepth == CV_8U ? 8 : 0;
        _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
        _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
        bits *= 2;
        _delta *= (1 << bits);
    }
    else
    {
        if( _rowKernel.type() != bdepth )
            _rowKernel.convertTo( rowKernel, bdepth );
        else
            rowKernel = _rowKernel;
        if( _columnKernel.type() != bdepth )
            _columnKernel.convertTo( columnKernel, bdepth );
        else
            columnKernel = _columnKernel;
    }

    int _bufType = CV_MAKETYPE(bdepth, cn);
    Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
        _srcType, _bufType, rowKernel, _anchor.x, rtype);
    Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
        _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );

    return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(0), _rowFilter, _columnFilter,
        _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
}


/****************************************************************************************\
*                               Non-separable linear filter                              *
\****************************************************************************************/

namespace cv
{

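/*
 Converts a dense 2D kernel into a sparse representation: "coords" receives
 the positions of all non-zero taps and "coeffs" their values, stored as raw
 bytes of the kernel's element type. Filter2D then iterates only over the
 non-zero taps, which pays off for sparse kernels such as derivative masks.
*/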
void preprocess2DKernel( const Mat& kernel, vector<Point>& coords, vector<uchar>& coeffs )
{
    int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
    if( nz == 0 )
        nz = 1;
    CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
    coords.resize(nz);
    coeffs.resize(nz*getElemSize(ktype));
    uchar* _coeffs = &coeffs[0];

    for( i = k = 0; i < kernel.rows; i++ )
    {
        const uchar* krow = kernel.data + kernel.step*i;
        for( j = 0; j < kernel.cols; j++ )
        {
            if( ktype == CV_8U )
            {
                uchar val = krow[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                _coeffs[k++] = val;
            }
            else if( ktype == CV_32S )
            {
                int val = ((const int*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((int*)_coeffs)[k++] = val;
            }
            else if( ktype == CV_32F )
            {
                float val = ((const float*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((float*)_coeffs)[k++] = val;
            }
            else
            {
                double val = ((const double*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((double*)_coeffs)[k++] = val;
            }
        }
    }
}


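/*
 Generic non-separable 2D correlation. For each output row the per-tap
 source pointers are gathered into "ptrs", a vectorized prefix is handled by
 VecOp, and the scalar remainder accumulates kernel_value * source_value
 (plus delta), with 4x manual unrolling when CV_ENABLE_UNROLLED is set.
*/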
template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
{
    typedef typename CastOp::type1 KT;
    typedef typename CastOp::rtype DT;

    Filter2D( const Mat& _kernel, Point _anchor,
        double _delta, const CastOp& _castOp=CastOp(),
        const VecOp& _vecOp=VecOp() )
    {
        anchor = _anchor;
        ksize = _kernel.size();
        delta = saturate_cast<KT>(_delta);
        castOp0 = _castOp;
        vecOp = _vecOp;
        CV_Assert( _kernel.type() == DataType<KT>::type );
        preprocess2DKernel( _kernel, coords, coeffs );
        ptrs.resize( coords.size() );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
    {
        KT _delta = delta;
        const Point* pt = &coords[0];
        const KT* kf = (const KT*)&coeffs[0];
        const ST** kp = (const ST**)&ptrs[0];
        int i, k, nz = (int)coords.size();
        CastOp castOp = castOp0;

        width *= cn;
        for( ; count > 0; count--, dst += dststep, src++ )
        {
            DT* D = (DT*)dst;

            for( k = 0; k < nz; k++ )
                kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;

            i = vecOp((const uchar**)kp, dst, width);
            #if CV_ENABLE_UNROLLED
            for( ; i <= width - 4; i += 4 )
            {
                KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

                for( k = 0; k < nz; k++ )
                {
                    const ST* sptr = kp[k] + i;
                    KT f = kf[k];
                    s0 += f*sptr[0];
                    s1 += f*sptr[1];
                    s2 += f*sptr[2];
                    s3 += f*sptr[3];
                }

                D[i] = castOp(s0); D[i+1] = castOp(s1);
                D[i+2] = castOp(s2); D[i+3] = castOp(s3);
            }
            #endif
            for( ; i < width; i++ )
            {
                KT s0 = _delta;
                for( k = 0; k < nz; k++ )
                    s0 += kf[k]*kp[k][i];
                D[i] = castOp(s0);
            }
        }
    }

    vector<Point> coords;
    vector<uchar> coeffs;
    vector<uchar*> ptrs;
    KT delta;
    CastOp castOp0;
    VecOp vecOp;
};

}

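/*
 Factory for non-separable 2D filters. The fixed-point 8-bit paths are
 currently disabled (see the commented block below); instead the kernel is
 converted to a floating-point type (CV_32F, or CV_64F when either side is
 double), undoing any 2^bits scaling of integer kernels, and the accumulator
 type follows the kernel type.
*/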
cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
                                InputArray filter_kernel, Point anchor,
                                double delta, int bits)
{
    Mat _kernel = filter_kernel.getMat();
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
    CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );

    anchor = normalizeAnchor(anchor, _kernel.size());

    /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
        return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u>
            (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
            FilterVec_8u(_kernel, bits, delta)));
    if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
        return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s>
            (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
            FilterVec_8u16s(_kernel, bits, delta)));*/

    kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
    Mat kernel;
    if( _kernel.type() == kdepth )
        kernel = _kernel;
    else
        _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);

    if( sdepth == CV_8U && ddepth == CV_8U )
        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, uchar>, FilterVec_8u>
            (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta)));
    if( sdepth == CV_8U && ddepth == CV_16U )
        return Ptr<BaseFilter>(new Filter2D<uchar,
            Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_8U && ddepth == CV_16S )
        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, short>, FilterVec_8u16s>
            (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta)));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<uchar,
            Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_8U && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<uchar,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    if( sdepth == CV_16U && ddepth == CV_16U )
        return Ptr<BaseFilter>(new Filter2D<ushort,
            Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16U && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<ushort,
            Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16U && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<ushort,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    if( sdepth == CV_16S && ddepth == CV_16S )
        return Ptr<BaseFilter>(new Filter2D<short,
            Cast<float, short>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16S && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<short,
            Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16S && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<short,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    if( sdepth == CV_32F && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<float, Cast<float, float>, FilterVec_32f>
            (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta)));
    if( sdepth == CV_64F && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<double,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d) and destination format (=%d)",
        srcType, dstType));

    return Ptr<BaseFilter>(0);
}


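/*
 Wraps getLinearFilter in a FilterEngine with the requested border handling.
 The fixed-point 8-bit shortcut is disabled here as well (see the commented
 block below), so "bits" stays 0 and the kernel is passed through unchanged.
*/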
cv::Ptr<cv::FilterEngine> cv::createLinearFilter( int _srcType, int _dstType,
                                              InputArray filter_kernel,
                                              Point _anchor, double _delta,
                                              int _rowBorderType, int _columnBorderType,
                                              const Scalar& _borderValue )
{
    Mat _kernel = filter_kernel.getMat();
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );

    Mat kernel = _kernel;
    int bits = 0;

    /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
    if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
        _kernel.rows*_kernel.cols <= (1 << 10) )
    {
        bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
        _kernel.convertTo(kernel, CV_32S, 1 << bits);
    }*/

    Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
        kernel, _anchor, _delta, bits);

    return Ptr<FilterEngine>(new FilterEngine(_filter2D, Ptr<BaseRowFilter>(0),
        Ptr<BaseColumnFilter>(0), _srcType, _dstType, _srcType,
        _rowBorderType, _columnBorderType, _borderValue ));
}


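/*
 Main 2D filtering entry point. Small kernels are convolved directly via
 createLinearFilter; once kernel.cols*kernel.rows reaches dft_filter_size,
 the DFT-based crossCorr path is used instead. The threshold is raised from
 50 to 130 elements when SSE3 is available for the 8u->8u/16s and 32f->32f
 cases, where the direct path is vectorized.

 A minimal usage sketch (illustrative 3x3 sharpening kernel):

     Mat kernel = (Mat_<float>(3,3) <<  0, -1,  0,
                                       -1,  5, -1,
                                        0, -1,  0);
     filter2D(src, dst, -1, kernel);   // ddepth = -1: same depth as src
*/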
void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
                   InputArray _kernel, Point anchor,
                   double delta, int borderType )
{
    Mat src = _src.getMat(), kernel = _kernel.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

#if CV_SSE2
    int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
        (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3) ? 130 : 50;
#else
    int dft_filter_size = 50;
#endif

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();
    anchor = normalizeAnchor(anchor, kernel.size());

#ifdef HAVE_TEGRA_OPTIMIZATION
    if( tegra::filter2D(src, dst, kernel, anchor, delta, borderType) )
        return;
#endif

    if( kernel.cols*kernel.rows >= dft_filter_size )
    {
        Mat temp;
        if( src.data != dst.data )
            temp = dst;
        else
            temp.create(dst.size(), dst.type());
        crossCorr( src, kernel, temp, src.size(),
                   CV_MAKETYPE(ddepth, src.channels()),
                   anchor, delta, borderType );
        if( temp.data != dst.data )
            temp.copyTo(dst);
        return;
    }

    Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
                                             anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}


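/*
 Separable filtering entry point: the 2D response is computed as a row pass
 with kernelX followed by a column pass with kernelY.

 A minimal usage sketch (illustrative 1D box kernels):

     Mat k = Mat::ones(1, 3, CV_32F) / 3.f;
     sepFilter2D(src, dst, -1, k, k);  // equivalent to a 3x3 box filter
*/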
void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                      InputArray _kernelX, InputArray _kernelY, Point anchor,
                      double delta, int borderType )
{
    Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();

    Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
        dst.type(), kernelX, kernelY, anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}


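/*
 Legacy C API wrapper: keeps the destination depth, applies no delta and
 uses replicated borders, matching the historical cvFilter2D behavior.
*/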
CV_IMPL void
cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat kernel = cv::cvarrToMat(_kernel);

    CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );

    cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
}

/* End of file. */