/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                        License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

#ifdef HAVE_IPP
#include "ippversion.h"
#endif

namespace cv
{

/****************************************************************************************\
*                                         GEMM                                           *
\****************************************************************************************/
static void
GEMM_CopyBlock( const uchar* src, size_t src_step,
                uchar* dst, size_t dst_step,
                Size size, size_t pix_size )
{
    int j;
    size.width *= (int)(pix_size / sizeof(int));

    for( ; size.height--; src += src_step, dst += dst_step )
    {
        j = 0;
        #if CV_ENABLE_UNROLLED
        for( ; j <= size.width - 4; j += 4 )
        {
            int t0 = ((const int*)src)[j];
            int t1 = ((const int*)src)[j+1];
            ((int*)dst)[j] = t0;
            ((int*)dst)[j+1] = t1;
            t0 = ((const int*)src)[j+2];
            t1 = ((const int*)src)[j+3];
            ((int*)dst)[j+2] = t0;
            ((int*)dst)[j+3] = t1;
        }
        #endif
        for( ; j < size.width; j++ )
            ((int*)dst)[j] = ((const int*)src)[j];
    }
}
static void
GEMM_TransposeBlock( const uchar* src, size_t src_step,
                     uchar* dst, size_t dst_step,
                     Size size, size_t pix_size )
{
    int i, j;
    for( i = 0; i < size.width; i++, dst += dst_step, src += pix_size )
    {
        const uchar* _src = src;
        switch( pix_size )
        {
        case sizeof(int):
            for( j = 0; j < size.height; j++, _src += src_step )
                ((int*)dst)[j] = ((int*)_src)[0];
            break;
        case sizeof(int)*2:
            for( j = 0; j < size.height*2; j += 2, _src += src_step )
            {
                int t0 = ((int*)_src)[0];
                int t1 = ((int*)_src)[1];
                ((int*)dst)[j] = t0;
                ((int*)dst)[j+1] = t1;
            }
            break;
        case sizeof(int)*4:
            for( j = 0; j < size.height*4; j += 4, _src += src_step )
            {
                int t0 = ((int*)_src)[0];
                int t1 = ((int*)_src)[1];
                ((int*)dst)[j] = t0;
                ((int*)dst)[j+1] = t1;
                t0 = ((int*)_src)[2];
                t1 = ((int*)_src)[3];
                ((int*)dst)[j+2] = t0;
                ((int*)dst)[j+3] = t1;
            }
            break;
        default:
            assert(0);
            return;
        }
    }
}
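
/* Note: the two helpers above serve the blocked GEMM path in cv::gemm() below.
   GEMM_CopyBlock packs a strided sub-block of an operand into a contiguous
   scratch buffer; GEMM_TransposeBlock does the same while transposing, so the
   inner-product loops can always traverse both operands with a fixed stride. */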
template<typename T, typename WT> static void
GEMMSingleMul( const T* a_data, size_t a_step,
               const T* b_data, size_t b_step,
               const T* c_data, size_t c_step,
               T* d_data, size_t d_step,
               Size a_size, Size d_size,
               double alpha, double beta, int flags )
{
    int i, j, k, n = a_size.width, m = d_size.width, drows = d_size.height;
    const T *_a_data = a_data, *_b_data = b_data, *_c_data = c_data;
    cv::AutoBuffer<T> _a_buf;
    T* a_buf = 0;
    size_t a_step0, a_step1, c_step0, c_step1, t_step;

    a_step /= sizeof(a_data[0]);
    b_step /= sizeof(b_data[0]);
    c_step /= sizeof(c_data[0]);
    d_step /= sizeof(d_data[0]);
    a_step0 = a_step;
    a_step1 = 1;

    if( !c_data )
        c_step0 = c_step1 = 0;
    else if( !(flags & GEMM_3_T) )
        c_step0 = c_step, c_step1 = 1;
    else
        c_step0 = 1, c_step1 = c_step;

    if( flags & GEMM_1_T )
    {
        CV_SWAP( a_step0, a_step1, t_step );
        n = a_size.height;
        if( a_step > 1 && n > 1 )
        {
            _a_buf.allocate(n);
            a_buf = _a_buf;
        }
    }

    if( n == 1 ) /* external product */
    {
        cv::AutoBuffer<T> _b_buf;
        T* b_buf = 0;

        if( a_step > 1 && a_size.height > 1 )
        {
            _a_buf.allocate(drows);
            a_buf = _a_buf;
            for( k = 0; k < drows; k++ )
                a_buf[k] = a_data[a_step*k];
            a_data = a_buf;
        }

        if( b_step > 1 )
        {
            _b_buf.allocate(d_size.width);
            b_buf = _b_buf;
            for( j = 0; j < d_size.width; j++ )
                b_buf[j] = b_data[j*b_step];
            b_data = b_buf;
        }

        for( i = 0; i < drows; i++, _c_data += c_step0, d_data += d_step )
        {
            WT al = WT(a_data[i])*alpha;
            c_data = _c_data;
            for( j = 0; j <= d_size.width - 2; j += 2, c_data += 2*c_step1 )
            {
                WT s0 = al*WT(b_data[j]);
                WT s1 = al*WT(b_data[j+1]);
                if( !c_data )
                {
                    d_data[j] = T(s0);
                    d_data[j+1] = T(s1);
                }
                else
                {
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
                    d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta);
                }
            }

            for( ; j < d_size.width; j++, c_data += c_step1 )
            {
                WT s0 = al*WT(b_data[j]);
                if( !c_data )
                    d_data[j] = T(s0);
                else
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
            }
        }
    }
    else if( flags & GEMM_2_T ) /* A * Bt */
    {
        for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step )
        {
            a_data = _a_data;
            b_data = _b_data;
            c_data = _c_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j < d_size.width; j++, b_data += b_step,
                                               c_data += c_step1 )
            {
                WT s0(0), s1(0), s2(0), s3(0);
                k = 0;
                #if CV_ENABLE_UNROLLED
                for( ; k <= n - 4; k += 4 )
                {
                    s0 += WT(a_data[k])*WT(b_data[k]);
                    s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                    s2 += WT(a_data[k+2])*WT(b_data[k+2]);
                    s3 += WT(a_data[k+3])*WT(b_data[k+3]);
                }
                #endif
                for( ; k < n; k++ )
                    s0 += WT(a_data[k])*WT(b_data[k]);
                s0 = (s0+s1+s2+s3)*alpha;

                if( !c_data )
                    d_data[j] = T(s0);
                else
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
            }
        }
    }
    else if( d_size.width*sizeof(d_data[0]) <= 1600 )
    {
        for( i = 0; i < drows; i++, _a_data += a_step0,
                                    _c_data += c_step0,
                                    d_data += d_step )
        {
            a_data = _a_data, c_data = _c_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 )
            {
                const T* b = _b_data + j;
                WT s0(0), s1(0), s2(0), s3(0);

                for( k = 0; k < n; k++, b += b_step )
                {
                    WT a(a_data[k]);
                    s0 += a * WT(b[0]); s1 += a * WT(b[1]);
                    s2 += a * WT(b[2]); s3 += a * WT(b[3]);
                }

                if( !c_data )
                {
                    d_data[j] = T(s0*alpha);
                    d_data[j+1] = T(s1*alpha);
                    d_data[j+2] = T(s2*alpha);
                    d_data[j+3] = T(s3*alpha);
                }
                else
                {
                    s0 = s0*alpha; s1 = s1*alpha;
                    s2 = s2*alpha; s3 = s3*alpha;
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
                    d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta);
                    d_data[j+2] = T(s2 + WT(c_data[c_step1*2])*beta);
                    d_data[j+3] = T(s3 + WT(c_data[c_step1*3])*beta);
                }
            }

            for( ; j < m; j++, c_data += c_step1 )
            {
                const T* b = _b_data + j;
                WT s0(0);

                for( k = 0; k < n; k++, b += b_step )
                    s0 += WT(a_data[k]) * WT(b[0]);

                s0 = s0*alpha;
                if( !c_data )
                    d_data[j] = T(s0);
                else
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
            }
        }
    }
    else
    {
        cv::AutoBuffer<WT> _d_buf(m);
        WT* d_buf = _d_buf;

        for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step )
        {
            a_data = _a_data;
            b_data = _b_data;
            c_data = _c_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = _a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j < m; j++ )
                d_buf[j] = WT(0);

            for( k = 0; k < n; k++, b_data += b_step )
            {
                WT al(a_data[k]);
                j = 0;
                #if CV_ENABLE_UNROLLED
                for(; j <= m - 4; j += 4 )
                {
                    WT t0 = d_buf[j] + WT(b_data[j])*al;
                    WT t1 = d_buf[j+1] + WT(b_data[j+1])*al;
                    d_buf[j] = t0; d_buf[j+1] = t1;
                    t0 = d_buf[j+2] + WT(b_data[j+2])*al;
                    t1 = d_buf[j+3] + WT(b_data[j+3])*al;
                    d_buf[j+2] = t0; d_buf[j+3] = t1;
                }
                #endif
                for( ; j < m; j++ )
                    d_buf[j] += WT(b_data[j])*al;
            }

            if( !c_data )
                for( j = 0; j < m; j++ )
                    d_data[j] = T(d_buf[j]*alpha);
            else
                for( j = 0; j < m; j++, c_data += c_step1 )
                {
                    WT t = d_buf[j]*alpha;
                    d_data[j] = T(t + WT(c_data[0])*beta);
                }
        }
    }
}
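
/* GEMMSingleMul computes D = alpha*op(A)*op(B) + beta*op(C) in a single pass.
   It picks one of four inner strategies: an outer-product loop when the
   reduction length is 1 (n == 1), a dot-product loop for A*B^T, a
   four-columns-at-a-time loop while a row of D fits in ~1600 bytes, and an
   accumulate-into-d_buf loop for wider rows of D. */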
template<typename T, typename WT> static void
GEMMBlockMul( const T* a_data, size_t a_step,
              const T* b_data, size_t b_step,
              WT* d_data, size_t d_step,
              Size a_size, Size d_size, int flags )
{
    int i, j, k, n = a_size.width, m = d_size.width;
    const T *_a_data = a_data, *_b_data = b_data;
    cv::AutoBuffer<T> _a_buf;
    T* a_buf = 0;
    size_t a_step0, a_step1, t_step;
    int do_acc = flags & 16;

    a_step /= sizeof(a_data[0]);
    b_step /= sizeof(b_data[0]);
    d_step /= sizeof(d_data[0]);

    a_step0 = a_step;
    a_step1 = 1;

    if( flags & GEMM_1_T )
    {
        CV_SWAP( a_step0, a_step1, t_step );
        n = a_size.height;
        _a_buf.allocate(n);
        a_buf = _a_buf;
    }

    if( flags & GEMM_2_T )
    {
        /* second operand is transposed */
        for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step )
        {
            a_data = _a_data; b_data = _b_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j < d_size.width; j++, b_data += b_step )
            {
                WT s0 = do_acc ? d_data[j]:WT(0), s1(0);
                for( k = 0; k <= n - 2; k += 2 )
                {
                    s0 += WT(a_data[k])*WT(b_data[k]);
                    s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                }

                for( ; k < n; k++ )
                    s0 += WT(a_data[k])*WT(b_data[k]);

                d_data[j] = s0 + s1;
            }
        }
    }
    else
    {
        for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step )
        {
            a_data = _a_data, b_data = _b_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j <= m - 4; j += 4 )
            {
                WT s0, s1, s2, s3;
                const T* b = b_data + j;

                if( do_acc )
                {
                    s0 = d_data[j]; s1 = d_data[j+1];
                    s2 = d_data[j+2]; s3 = d_data[j+3];
                }
                else
                    s0 = s1 = s2 = s3 = WT(0);

                for( k = 0; k < n; k++, b += b_step )
                {
                    WT a(a_data[k]);
                    s0 += a * WT(b[0]); s1 += a * WT(b[1]);
                    s2 += a * WT(b[2]); s3 += a * WT(b[3]);
                }

                d_data[j] = s0; d_data[j+1] = s1;
                d_data[j+2] = s2; d_data[j+3] = s3;
            }

            for( ; j < m; j++ )
            {
                const T* b = b_data + j;
                WT s0 = do_acc ? d_data[j] : WT(0);

                for( k = 0; k < n; k++, b += b_step )
                    s0 += WT(a_data[k]) * WT(b[0]);

                d_data[j] = s0;
            }
        }
    }
}
template<typename T, typename WT> static void
GEMMStore( const T* c_data, size_t c_step,
           const WT* d_buf, size_t d_buf_step,
           T* d_data, size_t d_step, Size d_size,
           double alpha, double beta, int flags )
{
    const T* _c_data = c_data;
    int j;
    size_t c_step0, c_step1;

    c_step /= sizeof(c_data[0]);
    d_buf_step /= sizeof(d_buf[0]);
    d_step /= sizeof(d_data[0]);

    if( !c_data )
        c_step0 = c_step1 = 0;
    else if( !(flags & GEMM_3_T) )
        c_step0 = c_step, c_step1 = 1;
    else
        c_step0 = 1, c_step1 = c_step;

    for( ; d_size.height--; _c_data += c_step0, d_buf += d_buf_step, d_data += d_step )
    {
        if( _c_data )
        {
            c_data = _c_data;
            j = 0;
            #if CV_ENABLE_UNROLLED
            for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
            {
                WT t0 = alpha*d_buf[j];
                WT t1 = alpha*d_buf[j+1];
                t0 += beta*WT(c_data[0]);
                t1 += beta*WT(c_data[c_step1]);
                d_data[j] = T(t0);
                d_data[j+1] = T(t1);
                t0 = alpha*d_buf[j+2];
                t1 = alpha*d_buf[j+3];
                t0 += beta*WT(c_data[c_step1*2]);
                t1 += beta*WT(c_data[c_step1*3]);
                d_data[j+2] = T(t0);
                d_data[j+3] = T(t1);
            }
            #endif
            for( ; j < d_size.width; j++, c_data += c_step1 )
            {
                WT t0 = alpha*d_buf[j];
                d_data[j] = T(t0 + WT(c_data[0])*beta);
            }
        }
        else
        {
            j = 0;
            #if CV_ENABLE_UNROLLED
            for( ; j <= d_size.width - 4; j += 4 )
            {
                WT t0 = alpha*d_buf[j];
                WT t1 = alpha*d_buf[j+1];
                d_data[j] = T(t0);
                d_data[j+1] = T(t1);
                t0 = alpha*d_buf[j+2];
                t1 = alpha*d_buf[j+3];
                d_data[j+2] = T(t0);
                d_data[j+3] = T(t1);
            }
            #endif
            for( ; j < d_size.width; j++ )
                d_data[j] = T(alpha*d_buf[j]);
        }
    }
}
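
/* GEMMStore converts the wide-type accumulator d_buf back to the destination
   type, scaling by alpha and, when a C matrix is present, adding beta*C with
   the element strides implied by the GEMM_3_T flag. */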
typedef void (*GEMMSingleMulFunc)( const void* src1, size_t step1,
                   const void* src2, size_t step2, const void* src3, size_t step3,
                   void* dst, size_t dststep, Size srcsize, Size dstsize,
                   double alpha, double beta, int flags );

typedef void (*GEMMBlockMulFunc)( const void* src1, size_t step1,
                   const void* src2, size_t step2, void* dst, size_t dststep,
                   Size srcsize, Size dstsize, int flags );

typedef void (*GEMMStoreFunc)( const void* src1, size_t step1,
                   const void* src2, size_t step2, void* dst, size_t dststep,
                   Size dstsize, double alpha, double beta, int flags );
static void GEMMSingleMul_32f( const float* a_data, size_t a_step,
                               const float* b_data, size_t b_step,
                               const float* c_data, size_t c_step,
                               float* d_data, size_t d_step,
                               Size a_size, Size d_size,
                               double alpha, double beta, int flags )
{
    GEMMSingleMul<float,double>(a_data, a_step, b_data, b_step, c_data,
                                c_step, d_data, d_step, a_size, d_size,
                                alpha, beta, flags);
}

static void GEMMSingleMul_64f( const double* a_data, size_t a_step,
                               const double* b_data, size_t b_step,
                               const double* c_data, size_t c_step,
                               double* d_data, size_t d_step,
                               Size a_size, Size d_size,
                               double alpha, double beta, int flags )
{
    GEMMSingleMul<double,double>(a_data, a_step, b_data, b_step, c_data,
                                 c_step, d_data, d_step, a_size, d_size,
                                 alpha, beta, flags);
}

static void GEMMSingleMul_32fc( const Complexf* a_data, size_t a_step,
                                const Complexf* b_data, size_t b_step,
                                const Complexf* c_data, size_t c_step,
                                Complexf* d_data, size_t d_step,
                                Size a_size, Size d_size,
                                double alpha, double beta, int flags )
{
    GEMMSingleMul<Complexf,Complexd>(a_data, a_step, b_data, b_step, c_data,
                                     c_step, d_data, d_step, a_size, d_size,
                                     alpha, beta, flags);
}

static void GEMMSingleMul_64fc( const Complexd* a_data, size_t a_step,
                                const Complexd* b_data, size_t b_step,
                                const Complexd* c_data, size_t c_step,
                                Complexd* d_data, size_t d_step,
                                Size a_size, Size d_size,
                                double alpha, double beta, int flags )
{
    GEMMSingleMul<Complexd,Complexd>(a_data, a_step, b_data, b_step, c_data,
                                     c_step, d_data, d_step, a_size, d_size,
                                     alpha, beta, flags);
}

static void GEMMBlockMul_32f( const float* a_data, size_t a_step,
                              const float* b_data, size_t b_step,
                              double* d_data, size_t d_step,
                              Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}

static void GEMMBlockMul_64f( const double* a_data, size_t a_step,
                              const double* b_data, size_t b_step,
                              double* d_data, size_t d_step,
                              Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}

static void GEMMBlockMul_32fc( const Complexf* a_data, size_t a_step,
                               const Complexf* b_data, size_t b_step,
                               Complexd* d_data, size_t d_step,
                               Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}

static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step,
                               const Complexd* b_data, size_t b_step,
                               Complexd* d_data, size_t d_step,
                               Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}

static void GEMMStore_32f( const float* c_data, size_t c_step,
                           const double* d_buf, size_t d_buf_step,
                           float* d_data, size_t d_step, Size d_size,
                           double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}

static void GEMMStore_64f( const double* c_data, size_t c_step,
                           const double* d_buf, size_t d_buf_step,
                           double* d_data, size_t d_step, Size d_size,
                           double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}

static void GEMMStore_32fc( const Complexf* c_data, size_t c_step,
                            const Complexd* d_buf, size_t d_buf_step,
                            Complexf* d_data, size_t d_step, Size d_size,
                            double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}

static void GEMMStore_64fc( const Complexd* c_data, size_t c_step,
                            const Complexd* d_buf, size_t d_buf_step,
                            Complexd* d_data, size_t d_step, Size d_size,
                            double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}

}
void cv::gemm( InputArray matA, InputArray matB, double alpha,
               InputArray matC, double beta, OutputArray _matD, int flags )
{
    const int block_lin_size = 128;
    const int block_size = block_lin_size * block_lin_size;

    static double zero[] = {0,0,0,0};
    static float zerof[] = {0,0,0,0};

    Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0 ? matC.getMat() : Mat();
    Size a_size = A.size(), d_size;
    int i, len = 0, type = A.type();

    CV_Assert( type == B.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );

    switch( flags & (GEMM_1_T|GEMM_2_T) )
    {
    case 0:
        d_size = Size( B.cols, a_size.height );
        len = B.rows;
        CV_Assert( a_size.width == len );
        break;
    case 1:
        d_size = Size( B.cols, a_size.width );
        len = B.rows;
        CV_Assert( a_size.height == len );
        break;
    case 2:
        d_size = Size( B.rows, a_size.height );
        len = B.cols;
        CV_Assert( a_size.width == len );
        break;
    case 3:
        d_size = Size( B.rows, a_size.width );
        len = B.cols;
        CV_Assert( a_size.height == len );
        break;
    }

    if( C.data )
    {
        CV_Assert( C.type() == type &&
            (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) ||
             ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height)));
    }

    _matD.create( d_size.height, d_size.width, type );
    Mat D = _matD.getMat();
    if( (flags & GEMM_3_T) != 0 && C.data == D.data )
    {
        transpose( C, C );
        flags &= ~GEMM_3_T;
    }
    if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) )
    {
        if( type == CV_32F )
        {
            float* d = (float*)D.data;
            const float *a = (const float*)A.data,
                        *b = (const float*)B.data,
                        *c = (const float*)C.data;
            size_t d_step = D.step/sizeof(d[0]),
                   a_step = A.step/sizeof(a[0]),
                   b_step = B.step/sizeof(b[0]),
                   c_step = C.data ? C.step/sizeof(c[0]) : 0;

            if( !c )
                c = zerof;

            switch( len )
            {
            case 2:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step];
                        float t1 = a[0]*b[1] + a[1]*b[b_step+1];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[1] = (float)(t1*alpha + c[1]*beta);
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zerof )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step];
                        float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[d_step] = (float)(t1*alpha + c[c_step]*beta);
                    }
                }
                else
                    break;
                return;
            case 3:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
                        float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[1] = (float)(t1*alpha + c[1]*beta);
                        d[2] = (float)(t2*alpha + c[2]*beta);
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zerof )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
                        float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];

                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[d_step] = (float)(t1*alpha + c[c_step]*beta);
                        d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
                    }
                }
                else
                    break;
                return;
            case 4:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
                        float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
                        float t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[1] = (float)(t1*alpha + c[1]*beta);
                        d[2] = (float)(t2*alpha + c[2]*beta);
                        d[3] = (float)(t3*alpha + c[3]*beta);
                    }
                }
                else if( len <= 16 && a != d )
                {
                    int c_step0 = 1;
                    if( c == zerof )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
                                   a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
                        float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
                                   a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
                        float t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
                                   a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[d_step] = (float)(t1*alpha + c[c_step]*beta);
                        d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
                        d[d_step*3] = (float)(t3*alpha + c[c_step*3]*beta);
                    }
                }
                else
                    break;
                return;
            }
        }
        else if( type == CV_64F )
        {
            double* d = (double*)D.data;
            const double *a = (const double*)A.data,
                         *b = (const double*)B.data,
                         *c = (const double*)C.data;
            size_t d_step = D.step/sizeof(d[0]),
                   a_step = A.step/sizeof(a[0]),
                   b_step = B.step/sizeof(b[0]),
                   c_step = C.data ? C.step/sizeof(c[0]) : 0;

            if( !c )
                c = zero;

            switch( len )
            {
            case 2:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step];
                        double t1 = a[0]*b[1] + a[1]*b[b_step+1];
                        d[0] = t0*alpha + c[0]*beta;
                        d[1] = t1*alpha + c[1]*beta;
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zero )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step];
                        double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
                        d[0] = t0*alpha + c[0]*beta;
                        d[d_step] = t1*alpha + c[c_step]*beta;
                    }
                }
                else
                    break;
                return;
            case 3:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
                        double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
                        d[0] = t0*alpha + c[0]*beta;
                        d[1] = t1*alpha + c[1]*beta;
                        d[2] = t2*alpha + c[2]*beta;
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zero )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
                        double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];

                        d[0] = t0*alpha + c[0]*beta;
                        d[d_step] = t1*alpha + c[c_step]*beta;
                        d[d_step*2] = t2*alpha + c[c_step*2]*beta;
                    }
                }
                else
                    break;
                return;
            case 4:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
                        double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
                        double t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
                        d[0] = t0*alpha + c[0]*beta;
                        d[1] = t1*alpha + c[1]*beta;
                        d[2] = t2*alpha + c[2]*beta;
                        d[3] = t3*alpha + c[3]*beta;
                    }
                }
                else if( d_size.width <= 16 && a != d )
                {
                    int c_step0 = 1;
                    if( c == zero )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
                                    a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
                        double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
                                    a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
                        double t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
                                    a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
                        d[0] = t0*alpha + c[0]*beta;
                        d[d_step] = t1*alpha + c[c_step]*beta;
                        d[d_step*2] = t2*alpha + c[c_step*2]*beta;
                        d[d_step*3] = t3*alpha + c[c_step*3]*beta;
                    }
                }
                else
                    break;
                return;
            }
        }
    }
    {
    size_t b_step = B.step;
    GEMMSingleMulFunc singleMulFunc;
    GEMMBlockMulFunc blockMulFunc;
    GEMMStoreFunc storeFunc;
    Mat *matD = &D, tmat;
    const uchar* Cdata = C.data;
    size_t Cstep = C.data ? (size_t)C.step : 0;
    AutoBuffer<uchar> buf;

    if( type == CV_32FC1 )
    {
        singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32f;
        blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32f;
        storeFunc = (GEMMStoreFunc)GEMMStore_32f;
    }
    else if( type == CV_64FC1 )
    {
        singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64f;
        blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64f;
        storeFunc = (GEMMStoreFunc)GEMMStore_64f;
    }
    else if( type == CV_32FC2 )
    {
        singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32fc;
        blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32fc;
        storeFunc = (GEMMStoreFunc)GEMMStore_32fc;
    }
    else
    {
        CV_Assert( type == CV_64FC2 );
        singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64fc;
        blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64fc;
        storeFunc = (GEMMStoreFunc)GEMMStore_64fc;
    }

    if( D.data == A.data || D.data == B.data )
    {
        buf.allocate(d_size.width*d_size.height*CV_ELEM_SIZE(type));
        tmat = Mat(d_size.height, d_size.width, type, (uchar*)buf );
        matD = &tmat;
    }

    if( (d_size.width == 1 || len == 1) && !(flags & GEMM_2_T) && B.isContinuous() )
    {
        b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type);
        flags |= GEMM_2_T;
    }
    /*if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 )
    {
        blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p :
                    type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p :
                    type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p :
                    type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0;
    }

    if( blas_func )
    {
        const char* transa = flags & GEMM_1_T ? "t" : "n";
        const char* transb = flags & GEMM_2_T ? "t" : "n";
        int lda, ldb, ldd;

        if( C->data.ptr )
        {
            if( C->data.ptr != D->data.ptr )
            {
                if( !(flags & GEMM_3_T) )
                    cvCopy( C, D );
                else
                    cvTranspose( C, D );
            }
        }

        if( CV_MAT_DEPTH(type) == CV_32F )
        {
            Complex32f _alpha, _beta;

            lda = A->step/sizeof(float);
            ldb = b_step/sizeof(float);
            ldd = D->step/sizeof(float);
            _alpha.re = (float)alpha;
            _alpha.im = 0;
            _beta.re = C->data.ptr ? (float)beta : 0;
            _beta.im = 0;
            if( CV_MAT_CN(type) == 2 )
                lda /= 2, ldb /= 2, ldd /= 2;

            blas_func( transb, transa, &d_size.width, &d_size.height, &len,
                       &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
                       &_beta, D->data.ptr, &ldd );
        }
        else
        {
            CvComplex64f _alpha, _beta;

            lda = A->step/sizeof(double);
            ldb = b_step/sizeof(double);
            ldd = D->step/sizeof(double);
            _alpha.re = alpha;
            _alpha.im = 0;
            _beta.re = C->data.ptr ? beta : 0;
            _beta.im = 0;
            if( CV_MAT_CN(type) == 2 )
                lda /= 2, ldb /= 2, ldd /= 2;

            blas_func( transb, transa, &d_size.width, &d_size.height, &len,
                       &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
                       &_beta, D->data.ptr, &ldd );
        }
    }
    else*/ if( ((d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2) &&
        len <= 10000) || len <= 10 ||
        (d_size.width <= block_lin_size &&
        d_size.height <= block_lin_size && len <= block_lin_size) )
    {
        singleMulFunc( A.data, A.step, B.data, b_step, Cdata, Cstep,
                       matD->data, matD->step, a_size, d_size, alpha, beta, flags );
    }
    else
    {
        int is_a_t = flags & GEMM_1_T;
        int is_b_t = flags & GEMM_2_T;
        int elem_size = CV_ELEM_SIZE(type);
        int dk0_1, dk0_2;
        int a_buf_size = 0, b_buf_size, d_buf_size;
        uchar* a_buf = 0;
        uchar* b_buf = 0;
        uchar* d_buf = 0;
        int j, k, di = 0, dj = 0, dk = 0;
        int dm0, dn0, dk0;
        size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1;
        int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0);

        if( !is_a_t )
            a_step0 = A.step, a_step1 = elem_size;
        else
            a_step0 = elem_size, a_step1 = A.step;

        if( !is_b_t )
            b_step0 = b_step, b_step1 = elem_size;
        else
            b_step0 = elem_size, b_step1 = b_step;

        if( !C.data )
        {
            c_step0 = c_step1 = 0;
            flags &= ~GEMM_3_T;
        }
        else if( !(flags & GEMM_3_T) )
            c_step0 = C.step, c_step1 = elem_size;
        else
            c_step0 = elem_size, c_step1 = C.step;

        dm0 = std::min( block_lin_size, d_size.height );
        dn0 = std::min( block_lin_size, d_size.width );
        dk0_1 = block_size / dm0;
        dk0_2 = block_size / dn0;
        dk0 = std::min( dk0_1, dk0_2 );
        dk0 = std::min( dk0, len );
        if( dk0*dm0 > block_size )
            dm0 = block_size / dk0;
        if( dk0*dn0 > block_size )
            dn0 = block_size / dk0;

        dk0_1 = (dn0+dn0/8+2) & -2;
        b_buf_size = (dk0+dk0/8+1)*dk0_1*elem_size;
        d_buf_size = (dk0+dk0/8+1)*dk0_1*work_elem_size;

        if( is_a_t )
        {
            a_buf_size = (dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size;
            flags &= ~GEMM_1_T;
        }

        buf.allocate(a_buf_size + b_buf_size + d_buf_size);
        d_buf = (uchar*)buf;
        b_buf = d_buf + d_buf_size;

        if( is_a_t )
            a_buf = b_buf + b_buf_size;
        for( i = 0; i < d_size.height; i += di )
        {
            di = dm0;
            if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height )
                di = d_size.height - i;

            for( j = 0; j < d_size.width; j += dj )
            {
                uchar* _d = matD->data + i*matD->step + j*elem_size;
                const uchar* _c = Cdata + i*c_step0 + j*c_step1;
                size_t _d_step = matD->step;
                dj = dn0;

                if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width )
                    dj = d_size.width - j;

                flags &= 15;
                if( dk0 < len )
                {
                    _d = d_buf;
                    _d_step = dj*work_elem_size;
                }

                for( k = 0; k < len; k += dk )
                {
                    const uchar* _a = A.data + i*a_step0 + k*a_step1;
                    size_t _a_step = A.step;
                    const uchar* _b = B.data + k*b_step0 + j*b_step1;
                    size_t _b_step = b_step;
                    Size a_bl_size;

                    dk = dk0;
                    if( k + dk >= len || 8*(k + dk) + dk > 8*len )
                        dk = len - k;

                    if( !is_a_t )
                        a_bl_size.width = dk, a_bl_size.height = di;
                    else
                        a_bl_size.width = di, a_bl_size.height = dk;

                    if( a_buf && is_a_t )
                    {
                        _a_step = dk*elem_size;
                        GEMM_TransposeBlock( _a, A.step, a_buf, _a_step, a_bl_size, elem_size );
                        std::swap( a_bl_size.width, a_bl_size.height );
                        _a = a_buf;
                    }

                    if( dj < d_size.width )
                    {
                        Size b_size;
                        if( !is_b_t )
                            b_size.width = dj, b_size.height = dk;
                        else
                            b_size.width = dk, b_size.height = dj;

                        _b_step = b_size.width*elem_size;
                        GEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size );
                        _b = b_buf;
                    }

                    if( dk0 < len )
                        blockMulFunc( _a, _a_step, _b, _b_step, _d, _d_step,
                                      a_bl_size, Size(dj,di), flags );
                    else
                        singleMulFunc( _a, _a_step, _b, _b_step, _c, Cstep,
                                       _d, _d_step, a_bl_size, Size(dj,di), alpha, beta, flags );
                    flags |= 16;
                }

                if( dk0 < len )
                    storeFunc( _c, Cstep, _d, _d_step,
                               matD->data + i*matD->step + j*elem_size,
                               matD->step, Size(dj,di), alpha, beta, flags );
            }
        }
    }

    if( matD != &D )
        matD->copyTo(D);
    }
}
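
/* A minimal usage sketch of the generalized matrix product above (illustrative
   only; the example function is compiled out with #if 0 and assumes the public
   OpenCV 2.x C++ API declared in core.hpp). */
#if 0
static void gemmUsageExample()
{
    cv::Mat A = cv::Mat::eye(3, 3, CV_32F);
    cv::Mat B = cv::Mat::ones(4, 3, CV_32F);
    cv::Mat D;
    // D = 1.0*A*B^T + 0.0*C, i.e. a plain product against the transposed B
    cv::gemm(A, B, 1.0, cv::Mat(), 0.0, D, cv::GEMM_2_T);   // D is 3x4
}
#endif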
/****************************************************************************************\
*                                       Transform                                        *
\****************************************************************************************/

namespace cv
{
template<typename T, typename WT> static void
transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
{
    int x;

    if( scn == 2 && dcn == 2 )
    {
        for( x = 0; x < len*2; x += 2 )
        {
            WT v0 = src[x], v1 = src[x+1];
            T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]);
            T t1 = saturate_cast<T>(m[3]*v0 + m[4]*v1 + m[5]);
            dst[x] = t0; dst[x+1] = t1;
        }
    }
    else if( scn == 3 && dcn == 3 )
    {
        for( x = 0; x < len*3; x += 3 )
        {
            WT v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
            T t1 = saturate_cast<T>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
            T t2 = saturate_cast<T>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
    }
    else if( scn == 3 && dcn == 1 )
    {
        for( x = 0; x < len; x++, src += 3 )
            dst[x] = saturate_cast<T>(m[0]*src[0] + m[1]*src[1] + m[2]*src[2] + m[3]);
    }
    else if( scn == 4 && dcn == 4 )
    {
        for( x = 0; x < len*4; x += 4 )
        {
            WT v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3];
            T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]*v3 + m[4]);
            T t1 = saturate_cast<T>(m[5]*v0 + m[6]*v1 + m[7]*v2 + m[8]*v3 + m[9]);
            dst[x] = t0; dst[x+1] = t1;
            t0 = saturate_cast<T>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]);
            t1 = saturate_cast<T>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]);
            dst[x+2] = t0; dst[x+3] = t1;
        }
    }
    else
    {
        for( x = 0; x < len; x++, src += scn, dst += dcn )
        {
            const WT* _m = m;
            int j, k;
            for( j = 0; j < dcn; j++, _m += scn + 1 )
            {
                WT s = _m[scn];
                for( k = 0; k < scn; k++ )
                    s += _m[k]*src[k];
                dst[j] = saturate_cast<T>(s);
            }
        }
    }
}
#if CV_SSE2

static inline void
load3x3Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3 )
{
    m0 = _mm_setr_ps(m[0], m[4], m[8], 0);
    m1 = _mm_setr_ps(m[1], m[5], m[9], 0);
    m2 = _mm_setr_ps(m[2], m[6], m[10], 0);
    m3 = _mm_setr_ps(m[3], m[7], m[11], 0);
}

static inline void
load4x4Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3, __m128& m4 )
{
    m0 = _mm_setr_ps(m[0], m[5], m[10], m[15]);
    m1 = _mm_setr_ps(m[1], m[6], m[11], m[16]);
    m2 = _mm_setr_ps(m[2], m[7], m[12], m[17]);
    m3 = _mm_setr_ps(m[3], m[8], m[13], m[18]);
    m4 = _mm_setr_ps(m[4], m[9], m[14], m[19]);
}

#endif
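
/* Each loader above places one column of the row-major 3x4 (or 4x5) transform
   matrix in an XMM register, so the per-pixel product reduces to broadcasting
   each source channel and doing multiply-adds against whole matrix columns
   (see transform_16u and transform_32f below). */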
static void
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SSE2
    const int BITS = 10, SCALE = 1 << BITS;
    const float MAX_M = (float)(1 << (15 - BITS));

    if( USE_SSE2 && scn == 3 && dcn == 3 &&
        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 &&
        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 &&
        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 )
    {
        // faster fixed-point transformation
        short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE),
              m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE),
              m11 = saturate_cast<short>(m[5]*SCALE), m12 = saturate_cast<short>(m[6]*SCALE),
              m20 = saturate_cast<short>(m[8]*SCALE), m21 = saturate_cast<short>(m[9]*SCALE),
              m22 = saturate_cast<short>(m[10]*SCALE);
        int m03 = saturate_cast<int>((m[3]+0.5f)*SCALE), m13 = saturate_cast<int>((m[7]+0.5f)*SCALE ),
            m23 = saturate_cast<int>((m[11]+0.5f)*SCALE);

        __m128i m0 = _mm_setr_epi16(0, m00, m01, m02, m00, m01, m02, 0);
        __m128i m1 = _mm_setr_epi16(0, m10, m11, m12, m10, m11, m12, 0);
        __m128i m2 = _mm_setr_epi16(0, m20, m21, m22, m20, m21, m22, 0);
        __m128i m3 = _mm_setr_epi32(m03, m13, m23, 0);
        int x = 0;

        for( ; x <= (len - 8)*3; x += 8*3 )
        {
            __m128i z = _mm_setzero_si128(), t0, t1, t2, r0, r1;
            __m128i v0 = _mm_loadl_epi64((const __m128i*)(src + x));
            __m128i v1 = _mm_loadl_epi64((const __m128i*)(src + x + 8));
            __m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 16)), v3;
            v0 = _mm_unpacklo_epi8(v0, z); // b0 g0 r0 b1 g1 r1 b2 g2
            v1 = _mm_unpacklo_epi8(v1, z); // r2 b3 g3 r3 b4 g4 r4 b5
            v2 = _mm_unpacklo_epi8(v2, z); // g5 r5 b6 g6 r6 b7 g7 r7

            v3 = _mm_srli_si128(v2, 2); // ? b6 g6 r6 b7 g7 r7 0
            v2 = _mm_or_si128(_mm_slli_si128(v2, 10), _mm_srli_si128(v1, 6)); // ? b4 g4 r4 b5 g5 r5 ?
            v1 = _mm_or_si128(_mm_slli_si128(v1, 6), _mm_srli_si128(v0, 10)); // ? b2 g2 r2 b3 g3 r3 ?
            v0 = _mm_slli_si128(v0, 2); // 0 b0 g0 r0 b1 g1 r1 ?

            // process pixels 0 & 1
            t0 = _mm_madd_epi16(v0, m0); // a0 b0 a1 b1
            t1 = _mm_madd_epi16(v0, m1); // c0 d0 c1 d1
            t2 = _mm_madd_epi16(v0, m2); // e0 f0 e1 f1
            v0 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
            t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
            t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
            t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
            r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v0, t1), _mm_unpackhi_epi64(v0,t1)), m3); // B0 G0 R0 0
            r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B1 G1 R1 0
            r0 = _mm_srai_epi32(r0, BITS);
            r1 = _mm_srai_epi32(r1, BITS);
            v0 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B0 G0 R0 B1 G1 R1 0

            // process pixels 2 & 3
            t0 = _mm_madd_epi16(v1, m0); // a0 b0 a1 b1
            t1 = _mm_madd_epi16(v1, m1); // c0 d0 c1 d1
            t2 = _mm_madd_epi16(v1, m2); // e0 f0 e1 f1
            v1 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
            t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
            t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
            t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
            r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v1, t1), _mm_unpackhi_epi64(v1,t1)), m3); // B2 G2 R2 0
            r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B3 G3 R3 0
            r0 = _mm_srai_epi32(r0, BITS);
            r1 = _mm_srai_epi32(r1, BITS);
            v1 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B2 G2 R2 B3 G3 R3 0

            // process pixels 4 & 5
            t0 = _mm_madd_epi16(v2, m0); // a0 b0 a1 b1
            t1 = _mm_madd_epi16(v2, m1); // c0 d0 c1 d1
            t2 = _mm_madd_epi16(v2, m2); // e0 f0 e1 f1
            v2 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
            t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
            t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
            t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
            r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v2, t1), _mm_unpackhi_epi64(v2,t1)), m3); // B4 G4 R4 0
            r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B5 G5 R5 0
            r0 = _mm_srai_epi32(r0, BITS);
            r1 = _mm_srai_epi32(r1, BITS);
            v2 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B4 G4 R4 B5 G5 R5 0

            // process pixels 6 & 7
            t0 = _mm_madd_epi16(v3, m0); // a0 b0 a1 b1
            t1 = _mm_madd_epi16(v3, m1); // c0 d0 c1 d1
            t2 = _mm_madd_epi16(v3, m2); // e0 f0 e1 f1
            v3 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
            t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
            t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
            t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
            r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v3, t1), _mm_unpackhi_epi64(v3,t1)), m3); // B6 G6 R6 0
            r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B7 G7 R7 0
            r0 = _mm_srai_epi32(r0, BITS);
            r1 = _mm_srai_epi32(r1, BITS);
            v3 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B6 G6 R6 B7 G7 R7 0

            v0 = _mm_or_si128(_mm_srli_si128(v0, 1), _mm_slli_si128(v1, 5));
            v1 = _mm_or_si128(_mm_srli_si128(v1, 3), _mm_slli_si128(v2, 3));
            v2 = _mm_or_si128(_mm_srli_si128(v2, 5), _mm_slli_si128(v3, 1));
            _mm_storel_epi64((__m128i*)(dst + x), v0);
            _mm_storel_epi64((__m128i*)(dst + x + 8), v1);
            _mm_storel_epi64((__m128i*)(dst + x + 16), v2);
        }

        for( ; x < len*3; x += 3 )
        {
            int v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03)>>BITS);
            uchar t1 = saturate_cast<uchar>((m10*v0 + m11*v1 + m12*v2 + m13)>>BITS);
            uchar t2 = saturate_cast<uchar>((m20*v0 + m21*v1 + m22*v2 + m23)>>BITS);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
        return;
    }
#endif

    transform_(src, dst, m, len, scn, dcn);
}
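
/* The SSE2 branch above works in 10-bit fixed point: every float coefficient
   is pre-scaled by 2^10 (SCALE) and rounded into a short, pixel products are
   accumulated with _mm_madd_epi16, and the final >> BITS undoes the scaling.
   For example, a coefficient of 0.299 becomes cvRound(0.299*1024) = 306, and
   (306*v) >> 10 approximates 0.299*v to within the 8-bit output precision. */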
static void
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SSE2
    if( USE_SSE2 && scn == 3 && dcn == 3 )
    {
        __m128 m0, m1, m2, m3;
        __m128i delta = _mm_setr_epi16(0,-32768,-32768,-32768,-32768,-32768,-32768,0);
        load3x3Matrix(m, m0, m1, m2, m3);
        m3 = _mm_sub_ps(m3, _mm_setr_ps(32768.f, 32768.f, 32768.f, 0.f));

        int x = 0;
        for( ; x <= (len - 4)*3; x += 4*3 )
        {
            __m128i z = _mm_setzero_si128();
            __m128i v0 = _mm_loadu_si128((const __m128i*)(src + x)), v1;
            __m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 8)), v3;
            v1 = _mm_unpacklo_epi16(_mm_srli_si128(v0, 6), z); // b1 g1 r1
            v3 = _mm_unpacklo_epi16(_mm_srli_si128(v2, 2), z); // b3 g3 r3
            v2 = _mm_or_si128(_mm_srli_si128(v0, 12), _mm_slli_si128(v2, 4));
            v0 = _mm_unpacklo_epi16(v0, z); // b0 g0 r0
            v2 = _mm_unpacklo_epi16(v2, z); // b2 g2 r2
            __m128 x0 = _mm_cvtepi32_ps(v0), x1 = _mm_cvtepi32_ps(v1);
            __m128 x2 = _mm_cvtepi32_ps(v2), x3 = _mm_cvtepi32_ps(v3);
            __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                        _mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))),
                        _mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))),
                        _mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), m3);
            __m128 y1 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                        _mm_mul_ps(m0, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(0,0,0,0))),
                        _mm_mul_ps(m1, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(1,1,1,1)))),
                        _mm_mul_ps(m2, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(2,2,2,2)))), m3);
            __m128 y2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                        _mm_mul_ps(m0, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(0,0,0,0))),
                        _mm_mul_ps(m1, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(1,1,1,1)))),
                        _mm_mul_ps(m2, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(2,2,2,2)))), m3);
            __m128 y3 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                        _mm_mul_ps(m0, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(0,0,0,0))),
                        _mm_mul_ps(m1, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(1,1,1,1)))),
                        _mm_mul_ps(m2, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(2,2,2,2)))), m3);
            v0 = _mm_cvtps_epi32(y0); v1 = _mm_cvtps_epi32(y1);
            v2 = _mm_cvtps_epi32(y2); v3 = _mm_cvtps_epi32(y3);

            v0 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v0,4), v1), delta); // 0 b0 g0 r0 b1 g1 r1 0
            v2 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v2,4), v3), delta); // 0 b2 g2 r2 b3 g3 r3 0
            v1 = _mm_or_si128(_mm_srli_si128(v0,2), _mm_slli_si128(v2,10)); // b0 g0 r0 b1 g1 r1 b2 g2
            v2 = _mm_srli_si128(v2, 6); // r2 b3 g3 r3 0 0 0 0
            _mm_storeu_si128((__m128i*)(dst + x), v1);
            _mm_storel_epi64((__m128i*)(dst + x + 8), v2);
        }

        for( ; x < len*3; x += 3 )
        {
            float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            ushort t0 = saturate_cast<ushort>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
            ushort t1 = saturate_cast<ushort>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
            ushort t2 = saturate_cast<ushort>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
        return;
    }
#endif

    transform_(src, dst, m, len, scn, dcn);
}
static void
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SSE2
    if( USE_SSE2 )
    {
        int x = 0;
        if( scn == 3 && dcn == 3 )
        {
            __m128 m0, m1, m2, m3;
            load3x3Matrix(m, m0, m1, m2, m3);

            for( ; x < (len - 1)*3; x += 3 )
            {
                __m128 x0 = _mm_loadu_ps(src + x);
                __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                            _mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))),
                            _mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))),
                            _mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), m3);
                _mm_storel_pi((__m64*)(dst + x), y0);
                _mm_store_ss(dst + x + 2, _mm_movehl_ps(y0,y0));
            }

            for( ; x < len*3; x += 3 )
            {
                float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
                float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
                float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
                float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
                dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
            }
            return;
        }

        if( scn == 4 && dcn == 4 )
        {
            __m128 m0, m1, m2, m3, m4;
            load4x4Matrix(m, m0, m1, m2, m3, m4);

            for( ; x < len*4; x += 4 )
            {
                __m128 x0 = _mm_loadu_ps(src + x);
                __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(
                            _mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))),
                            _mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))),
                            _mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))),
                            _mm_mul_ps(m3, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(3,3,3,3)))), m4);
                _mm_storeu_ps(dst + x, y0);
            }
            return;
        }
    }
#endif

    transform_(src, dst, m, len, scn, dcn);
}
static void
transform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
{
    transform_(src, dst, m, len, scn, dcn);
}

static void
transform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
{
    transform_(src, dst, m, len, scn, dcn);
}

static void
transform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
{
    transform_(src, dst, m, len, scn, dcn);
}

static void
transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
    transform_(src, dst, m, len, scn, dcn);
}
template<typename T, typename WT> static void
diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int )
{
    int x;

    if( cn == 2 )
    {
        for( x = 0; x < len*2; x += 2 )
        {
            T t0 = saturate_cast<T>(m[0]*src[x] + m[2]);
            T t1 = saturate_cast<T>(m[4]*src[x+1] + m[5]);
            dst[x] = t0; dst[x+1] = t1;
        }
    }
    else if( cn == 3 )
    {
        for( x = 0; x < len*3; x += 3 )
        {
            T t0 = saturate_cast<T>(m[0]*src[x] + m[3]);
            T t1 = saturate_cast<T>(m[5]*src[x+1] + m[7]);
            T t2 = saturate_cast<T>(m[10]*src[x+2] + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
    }
    else if( cn == 4 )
    {
        for( x = 0; x < len*4; x += 4 )
        {
            T t0 = saturate_cast<T>(m[0]*src[x] + m[4]);
            T t1 = saturate_cast<T>(m[6]*src[x+1] + m[9]);
            dst[x] = t0; dst[x+1] = t1;
            t0 = saturate_cast<T>(m[12]*src[x+2] + m[14]);
            t1 = saturate_cast<T>(m[18]*src[x+3] + m[19]);
            dst[x+2] = t0; dst[x+3] = t1;
        }
    }
    else
    {
        for( x = 0; x < len; x++, src += cn, dst += cn )
        {
            const WT* _m = m;
            for( int j = 0; j < cn; j++, _m += cn + 1 )
                dst[j] = saturate_cast<T>(src[j]*_m[j] + _m[cn]);
        }
    }
}
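
/* diagtransform_ is the shortcut cv::transform() dispatches to when the
   mixing matrix is numerically diagonal: each output channel then depends
   only on the matching input channel, dst[j] = m[j][j]*src[j] + m[j][cn],
   so the full per-pixel dot product is skipped. */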
static void
diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}

static void
diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}

static void
diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}

static void
diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}

static void
diagtransform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}

static void
diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}

static void
diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}
typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int );

static TransformFunc transformTab[] =
{
    (TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u,
    (TransformFunc)transform_16s, (TransformFunc)transform_32s, (TransformFunc)transform_32f,
    (TransformFunc)transform_64f, 0
};

static TransformFunc diagTransformTab[] =
{
    (TransformFunc)diagtransform_8u, (TransformFunc)diagtransform_8s, (TransformFunc)diagtransform_16u,
    (TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f,
    (TransformFunc)diagtransform_64f, 0
};

}
void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx )
{
    Mat src = _src.getMat(), m = _mtx.getMat();
    int depth = src.depth(), scn = src.channels(), dcn = m.rows;
    CV_Assert( scn == m.cols || scn + 1 == m.cols );
    bool isDiag = false;

    _dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
    Mat dst = _dst.getMat();

    int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F;
    AutoBuffer<double> _mbuf;
    double* mbuf = _mbuf;

    if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 )
    {
        _mbuf.allocate(dcn*(scn+1));
        mbuf = (double*)_mbuf;
        Mat tmp(dcn, scn+1, mtype, mbuf);
        memset(tmp.data, 0, tmp.total()*tmp.elemSize());
        if( m.cols == scn+1 )
            m.convertTo(tmp, mtype);
        else
        {
            Mat tmppart = tmp.colRange(0, m.cols);
            m.convertTo(tmppart, mtype);
        }
        m = tmp;
    }
    else
        mbuf = (double*)m.data;

    if( scn == dcn )
    {
        int i, j;
        double eps = mtype == CV_32F ? FLT_EPSILON : DBL_EPSILON;

        if( scn == 1 )
        {
            double alpha, beta;
            if( mtype == CV_32F )
                alpha = m.at<float>(0), beta = m.at<float>(1);
            else
                alpha = m.at<double>(0), beta = m.at<double>(1);
            src.convertTo(dst, dst.type(), alpha, beta);
            return;
        }

        for( i = 0, isDiag = true; isDiag && i < scn; i++ )
        {
            for( j = 0; isDiag && j < scn; j++ )
            {
                double v = mtype == CV_32F ? m.at<float>(i, j) : m.at<double>(i, j);
                if( i != j && fabs(v) > eps )
                    isDiag = false;
            }
        }
    }

    TransformFunc func = isDiag ? diagTransformTab[depth] : transformTab[depth];
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    size_t i, total = it.size;

    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
}
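
/* Usage sketch for cv::transform (illustrative only; compiled out). A 3x4
   matrix applies an affine per-pixel channel mix, dst = M(:,0:2)*src + M(:,3). */
#if 0
static void transformUsageExample()
{
    cv::Mat src(64, 64, CV_8UC3), dst;
    cv::Matx34f M( 0.5f, 0.f,  0.f,  0.f,     // halve the first channel
                   0.f,  1.f,  0.f,  0.f,     // keep the second unchanged
                   0.f,  0.f,  1.f, 16.f );   // brighten the third by 16
    cv::transform(src, dst, cv::Mat(M));      // dst has the same type as src
}
#endif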
/****************************************************************************************\
*                                  Perspective Transform                                 *
\****************************************************************************************/

namespace cv
{
template<typename T> static void
perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, int dcn )
{
    const double eps = FLT_EPSILON;
    int i;

    if( scn == 2 && dcn == 2 )
    {
        for( i = 0; i < len*2; i += 2 )
        {
            T x = src[i], y = src[i + 1];
            double w = x*m[6] + y*m[7] + m[8];

            if( fabs(w) > eps )
            {
                w = 1./w;
                dst[i] = (T)((x*m[0] + y*m[1] + m[2])*w);
                dst[i+1] = (T)((x*m[3] + y*m[4] + m[5])*w);
            }
            else
                dst[i] = dst[i+1] = (T)0;
        }
    }
    else if( scn == 3 && dcn == 3 )
    {
        for( i = 0; i < len*3; i += 3 )
        {
            T x = src[i], y = src[i + 1], z = src[i + 2];
            double w = x*m[12] + y*m[13] + z*m[14] + m[15];

            if( fabs(w) > eps )
            {
                w = 1./w;
                dst[i] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3]) * w);
                dst[i+1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7]) * w);
                dst[i+2] = (T)((x*m[8] + y*m[9] + z*m[10] + m[11]) * w);
            }
            else
                dst[i] = dst[i+1] = dst[i+2] = (T)0;
        }
    }
    else if( scn == 3 && dcn == 2 )
    {
        for( i = 0; i < len; i++, src += 3, dst += 2 )
        {
            T x = src[0], y = src[1], z = src[2];
            double w = x*m[8] + y*m[9] + z*m[10] + m[11];

            if( fabs(w) > eps )
            {
                w = 1./w;
                dst[0] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3])*w);
                dst[1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7])*w);
            }
            else
                dst[0] = dst[1] = (T)0;
        }
    }
    else
    {
        for( i = 0; i < len; i++, src += scn, dst += dcn )
        {
            const double* _m = m + dcn*(scn + 1);
            double w = _m[scn];
            int j, k;
            for( k = 0; k < scn; k++ )
                w += _m[k]*src[k];
            if( fabs(w) > eps )
            {
                w = 1./w;
                _m = m;
                for( j = 0; j < dcn; j++, _m += scn + 1 )
                {
                    double s = _m[scn];
                    for( k = 0; k < scn; k++ )
                        s += _m[k]*src[k];
                    dst[j] = (T)(s*w);
                }
            }
            else
                for( j = 0; j < dcn; j++ )
                    dst[j] = 0;
        }
    }
}
static void
perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn)
{
    perspectiveTransform_(src, dst, m, len, scn, dcn);
}

static void
perspectiveTransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
    perspectiveTransform_(src, dst, m, len, scn, dcn);
}

}
void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mtx )
{
    Mat src = _src.getMat(), m = _mtx.getMat();
    int depth = src.depth(), scn = src.channels(), dcn = m.rows-1;
    CV_Assert( scn + 1 == m.cols && (depth == CV_32F || depth == CV_64F));

    _dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
    Mat dst = _dst.getMat();

    const int mtype = CV_64F;
    AutoBuffer<double> _mbuf;
    double* mbuf = _mbuf;

    if( !m.isContinuous() || m.type() != mtype )
    {
        _mbuf.allocate((dcn+1)*(scn+1));
        Mat tmp(dcn+1, scn+1, mtype, (double*)_mbuf);
        m.convertTo(tmp, mtype);
        m = tmp;
    }
    else
        mbuf = (double*)m.data;

    TransformFunc func = depth == CV_32F ?
        (TransformFunc)perspectiveTransform_32f :
        (TransformFunc)perspectiveTransform_64f;
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    size_t i, total = it.size;

    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
}
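
/* Usage sketch for cv::perspectiveTransform (illustrative only; compiled out).
   Each (x, y) is mapped through a 3x3 homography H and divided by the third
   ("w") component; points with |w| below epsilon map to zeros, as above. */
#if 0
static void perspectiveTransformUsageExample()
{
    std::vector<cv::Point2f> src(4), dst;
    cv::Mat H = cv::Mat::eye(3, 3, CV_64F);   // identity homography
    cv::perspectiveTransform(src, dst, H);    // dst == src in this case
}
#endif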
/****************************************************************************************\
*                                       ScaleAdd                                         *
\****************************************************************************************/

namespace cv
{
static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
                         int len, float* _alpha)
{
    float alpha = *_alpha;
    int i = 0;
#if CV_SSE2
    if( USE_SSE2 )
    {
        __m128 a4 = _mm_set1_ps(alpha);
        if( (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 )
            for( ; i <= len - 8; i += 8 )
            {
                __m128 x0, x1, y0, y1, t0, t1;
                x0 = _mm_load_ps(src1 + i); x1 = _mm_load_ps(src1 + i + 4);
                y0 = _mm_load_ps(src2 + i); y1 = _mm_load_ps(src2 + i + 4);
                t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0);
                t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1);
                _mm_store_ps(dst + i, t0);
                _mm_store_ps(dst + i + 4, t1);
            }
        else
            for( ; i <= len - 8; i += 8 )
            {
                __m128 x0, x1, y0, y1, t0, t1;
                x0 = _mm_loadu_ps(src1 + i); x1 = _mm_loadu_ps(src1 + i + 4);
                y0 = _mm_loadu_ps(src2 + i); y1 = _mm_loadu_ps(src2 + i + 4);
                t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0);
                t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1);
                _mm_storeu_ps(dst + i, t0);
                _mm_storeu_ps(dst + i + 4, t1);
            }
    }
    else
#endif
    //vz why do we need unroll here?
    for( ; i <= len - 4; i += 4 )
    {
        float t0, t1;
        t0 = src1[i]*alpha + src2[i];
        t1 = src1[i+1]*alpha + src2[i+1];
        dst[i] = t0; dst[i+1] = t1;
        t0 = src1[i+2]*alpha + src2[i+2];
        t1 = src1[i+3]*alpha + src2[i+3];
        dst[i+2] = t0; dst[i+3] = t1;
    }
    for(; i < len; i++ )
        dst[i] = src1[i]*alpha + src2[i];
}
static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
                         int len, double* _alpha)
{
    double alpha = *_alpha;
    int i = 0;
#if CV_SSE2
    if( USE_SSE2 && (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 )
    {
        __m128d a2 = _mm_set1_pd(alpha);
        for( ; i <= len - 4; i += 4 )
        {
            __m128d x0, x1, y0, y1, t0, t1;
            x0 = _mm_load_pd(src1 + i); x1 = _mm_load_pd(src1 + i + 2);
            y0 = _mm_load_pd(src2 + i); y1 = _mm_load_pd(src2 + i + 2);
            t0 = _mm_add_pd(_mm_mul_pd(x0, a2), y0);
            t1 = _mm_add_pd(_mm_mul_pd(x1, a2), y1);
            _mm_store_pd(dst + i, t0);
            _mm_store_pd(dst + i + 2, t1);
        }
    }
    else
#endif
    //vz why do we need unroll here?
    for( ; i <= len - 4; i += 4 )
    {
        double t0, t1;
        t0 = src1[i]*alpha + src2[i];
        t1 = src1[i+1]*alpha + src2[i+1];
        dst[i] = t0; dst[i+1] = t1;
        t0 = src1[i+2]*alpha + src2[i+2];
        t1 = src1[i+3]*alpha + src2[i+3];
        dst[i+2] = t0; dst[i+3] = t1;
    }
    for(; i < len; i++ )
        dst[i] = src1[i]*alpha + src2[i];
}
typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha);

}
void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst )
{
    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
    int depth = src1.depth(), cn = src1.channels();

    CV_Assert( src1.type() == src2.type() );
    if( depth < CV_32F )
    {
        addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
        return;
    }

    _dst.create(src1.dims, src1.size, src1.type());
    Mat dst = _dst.getMat();

    float falpha = (float)alpha;
    void* palpha = depth == CV_32F ? (void*)&falpha : (void*)&alpha;

    ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f;

    if( src1.isContinuous() && src2.isContinuous() && dst.isContinuous() )
    {
        size_t len = src1.total()*cn;
        func(src1.data, src2.data, dst.data, (int)len, palpha);
    }
    else
    {
        const Mat* arrays[] = {&src1, &src2, &dst, 0};
        uchar* ptrs[3];
        NAryMatIterator it(arrays, ptrs);
        size_t i, len = it.size*cn;

        for( i = 0; i < it.nplanes; i++, ++it )
            func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha );
    }
}
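
/* Usage sketch for cv::scaleAdd, a BLAS-like "axpy" (illustrative only;
   compiled out): dst = alpha*src1 + src2, elementwise, for CV_32F/CV_64F. */
#if 0
static void scaleAddUsageExample()
{
    cv::Mat X = cv::Mat::ones(2, 2, CV_32F);
    cv::Mat Y = cv::Mat::zeros(2, 2, CV_32F), Z;
    cv::scaleAdd(X, 3.0, Y, Z);   // every element of Z becomes 3.0
}
#endif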
/****************************************************************************************\
*                                   Covariance Matrix                                    *
\****************************************************************************************/

void cv::calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype )
{
    CV_Assert( data && nsamples > 0 );
    Size size = data[0].size();
    int sz = size.width * size.height, esz = (int)data[0].elemSize();
    int type = data[0].type();
    Mat mean;
    ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);

    if( (flags & CV_COVAR_USE_AVG) != 0 )
    {
        CV_Assert( _mean.size() == size );
        if( _mean.isContinuous() && _mean.type() == ctype )
            mean = _mean.reshape(1, 1);
        else
        {
            _mean.convertTo(mean, ctype);
            mean = mean.reshape(1, 1);
        }
    }

    Mat _data(nsamples, sz, type);

    for( int i = 0; i < nsamples; i++ )
    {
        CV_Assert( data[i].size() == size && data[i].type() == type );
        if( data[i].isContinuous() )
            memcpy( _data.ptr(i), data[i].data, sz*esz );
        else
        {
            Mat dataRow(size.height, size.width, type, _data.ptr(i));
            data[i].copyTo(dataRow);
        }
    }

    calcCovarMatrix( _data, covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );
    if( (flags & CV_COVAR_USE_AVG) == 0 )
        _mean = mean.reshape(1, size.height);
}
void cv::calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype )
{
    if(_src.kind() == _InputArray::STD_VECTOR_MAT)
    {
        std::vector<cv::Mat> src;
        _src.getMatVector(src);

        CV_Assert( src.size() > 0 );

        Size size = src[0].size();
        int type = src[0].type();

        ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);

        Mat _data(static_cast<int>(src.size()), size.area(), type);

        int i = 0;
        for(vector<cv::Mat>::iterator each = src.begin(); each != src.end(); each++, i++ )
        {
            CV_Assert( (*each).size() == size && (*each).type() == type );
            Mat dataRow(size.height, size.width, type, _data.ptr(i));
            (*each).copyTo(dataRow);
        }

        Mat mean;
        if( (flags & CV_COVAR_USE_AVG) != 0 )
        {
            CV_Assert( _mean.size() == size );

            if( mean.type() != ctype )
            {
                mean = _mean.getMat();
                _mean.create(mean.size(), ctype);
                Mat tmp = _mean.getMat();
                mean.convertTo(tmp, ctype);
            }

            mean = _mean.getMat().reshape(1, 1);
        }

        calcCovarMatrix( _data, _covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );

        if( (flags & CV_COVAR_USE_AVG) == 0 )
        {
            mean = mean.reshape(1, size.height);
            mean.copyTo(_mean);
        }
        return;
    }
2188 Mat data = _src.getMat(), mean;
2189 CV_Assert( ((flags & CV_COVAR_ROWS) != 0) ^ ((flags & CV_COVAR_COLS) != 0) );
2190 bool takeRows = (flags & CV_COVAR_ROWS) != 0;
2191 int type = data.type();
2192 int nsamples = takeRows ? data.rows : data.cols;
2193 CV_Assert( nsamples > 0 );
2194 Size size = takeRows ? Size(data.cols, 1) : Size(1, data.rows);
2196 if( (flags & CV_COVAR_USE_AVG) != 0 )
2198 mean = _mean.getMat();
2199 ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), mean.depth()), CV_32F);
2200 CV_Assert( mean.size() == size );
2201 if( mean.type() != ctype )
2203 _mean.create(mean.size(), ctype);
2204 Mat tmp = _mean.getMat();
2205 mean.convertTo(tmp, ctype);
2211 ctype = std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), CV_32F);
2212 reduce( _src, _mean, takeRows ? 0 : 1, CV_REDUCE_AVG, ctype );
2213 mean = _mean.getMat();
2216 mulTransposed( data, _covar, ((flags & CV_COVAR_NORMAL) == 0) ^ takeRows,
2217 mean, (flags & CV_COVAR_SCALE) != 0 ? 1./nsamples : 1, ctype );
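/* A minimal usage sketch for the function above (illustrative only, not part of
   this file). With CV_COVAR_ROWS each row of the hypothetical `samples` matrix is
   one observation; CV_COVAR_NORMAL requests the full cols x cols covariance and
   CV_COVAR_SCALE divides it by the number of samples:

       cv::Mat samples(100, 5, CV_32F);
       cv::randu(samples, cv::Scalar(0), cv::Scalar(1));
       cv::Mat covar, mean;
       cv::calcCovarMatrix(samples, covar, mean,
                           CV_COVAR_NORMAL | CV_COVAR_ROWS | CV_COVAR_SCALE, CV_32F);
       // covar is 5x5, mean is 1x5
*/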
/****************************************************************************************\
*                                      Mahalanobis                                       *
\****************************************************************************************/
double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
{
    Mat v1 = _v1.getMat(), v2 = _v2.getMat(), icovar = _icovar.getMat();
    int type = v1.type(), depth = v1.depth();
    Size sz = v1.size();
    int i, j, len = sz.width*sz.height*v1.channels();
    AutoBuffer<double> buf(len);
    double result = 0;

    CV_Assert( type == v2.type() && type == icovar.type() &&
        sz == v2.size() && len == icovar.rows && len == icovar.cols );

    sz.width *= v1.channels();
    if( v1.isContinuous() && v2.isContinuous() )
    {
        sz.width *= sz.height;
        sz.height = 1;
    }

    if( depth == CV_32F )
    {
        const float* src1 = (const float*)v1.data;
        const float* src2 = (const float*)v2.data;
        size_t step1 = v1.step/sizeof(src1[0]);
        size_t step2 = v2.step/sizeof(src2[0]);
        double* diff = buf;
        const float* mat = (const float*)icovar.data;
        size_t matstep = icovar.step/sizeof(mat[0]);

        // diff = v1 - v2
        for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width )
        {
            for( i = 0; i < sz.width; i++ )
                diff[i] = src1[i] - src2[i];
        }

        // result = diff' * icovar * diff
        diff = buf;
        for( i = 0; i < len; i++, mat += matstep )
        {
            double row_sum = 0;
            j = 0;
            #if CV_ENABLE_UNROLLED
            for(; j <= len - 4; j += 4 )
                row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                           diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
            #endif
            for( ; j < len; j++ )
                row_sum += diff[j]*mat[j];
            result += row_sum * diff[i];
        }
    }
    else if( depth == CV_64F )
    {
        const double* src1 = (const double*)v1.data;
        const double* src2 = (const double*)v2.data;
        size_t step1 = v1.step/sizeof(src1[0]);
        size_t step2 = v2.step/sizeof(src2[0]);
        double* diff = buf;
        const double* mat = (const double*)icovar.data;
        size_t matstep = icovar.step/sizeof(mat[0]);

        for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width )
        {
            for( i = 0; i < sz.width; i++ )
                diff[i] = src1[i] - src2[i];
        }

        diff = buf;
        for( i = 0; i < len; i++, mat += matstep )
        {
            double row_sum = 0;
            j = 0;
            #if CV_ENABLE_UNROLLED
            for(; j <= len - 4; j += 4 )
                row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                           diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
            #endif
            for( ; j < len; j++ )
                row_sum += diff[j]*mat[j];
            result += row_sum * diff[i];
        }
    }
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    return std::sqrt(result);
}
// backward-compatibility alias that preserves the historical misspelling
double cv::Mahalonobis( InputArray _v1, InputArray _v2, InputArray _icovar )
{
    return Mahalanobis(_v1, _v2, _icovar);
}
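/* Usage sketch (illustrative only, assuming a hypothetical N x 5 CV_64F `samples`
   matrix and two 1x5 CV_64F vectors `v1`, `v2`). The third argument is the
   *inverse* covariance matrix; DECOMP_SVD yields a pseudo-inverse, which also
   copes with a singular covariance estimate:

       cv::Mat covar, mean;
       cv::calcCovarMatrix(samples, covar, mean,
                           CV_COVAR_NORMAL | CV_COVAR_ROWS | CV_COVAR_SCALE, CV_64F);
       cv::Mat icovar;
       cv::invert(covar, icovar, cv::DECOMP_SVD);
       double d = cv::Mahalanobis(v1, v2, icovar);
*/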
/****************************************************************************************\
*                                     MulTransposed                                      *
\****************************************************************************************/
template<typename sT, typename dT> static void
MulTransposedR( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale )
{
    // computes dst = scale*(src - delta)'*(src - delta), i.e. the "A'*A" case
    int i, j, k;
    const sT* src = (const sT*)srcmat.data;
    dT* dst = (dT*)dstmat.data;
    const dT* delta = (const dT*)deltamat.data;
    size_t srcstep = srcmat.step/sizeof(src[0]);
    size_t dststep = dstmat.step/sizeof(dst[0]);
    size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0;
    int delta_cols = deltamat.cols;
    Size size = srcmat.size();
    dT* tdst = dst;
    dT* col_buf = 0;
    dT* delta_buf = 0;
    int buf_size = size.height*sizeof(dT);
    AutoBuffer<uchar> buf;

    if( delta && delta_cols < size.width )
    {
        assert( delta_cols == 1 );
        buf_size *= 5; // room for col_buf plus a 4-way replicated delta column
    }
    buf.allocate(buf_size);
    col_buf = (dT*)(uchar*)buf;

    if( delta && delta_cols < size.width )
    {
        delta_buf = col_buf + size.height;
        for( i = 0; i < size.height; i++ )
            delta_buf[i*4] = delta_buf[i*4+1] =
                delta_buf[i*4+2] = delta_buf[i*4+3] = delta[i*deltastep];
        deltastep = deltastep ? 4 : 0;
    }

    if( !delta )
        for( i = 0; i < size.width; i++, tdst += dststep )
        {
            // cache the i-th column, then form dot products with columns j..j+3
            for( k = 0; k < size.height; k++ )
                col_buf[k] = src[k*srcstep+i];

            for( j = i; j <= size.width - 4; j += 4 )
            {
                double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
                const sT *tsrc = src + j;

                for( k = 0; k < size.height; k++, tsrc += srcstep )
                {
                    double a = col_buf[k];
                    s0 += a * tsrc[0];
                    s1 += a * tsrc[1];
                    s2 += a * tsrc[2];
                    s3 += a * tsrc[3];
                }

                tdst[j] = (dT)(s0*scale);
                tdst[j+1] = (dT)(s1*scale);
                tdst[j+2] = (dT)(s2*scale);
                tdst[j+3] = (dT)(s3*scale);
            }

            for( ; j < size.width; j++ )
            {
                double s0 = 0;
                const sT *tsrc = src + j;

                for( k = 0; k < size.height; k++, tsrc += srcstep )
                    s0 += (double)col_buf[k] * tsrc[0];

                tdst[j] = (dT)(s0*scale);
            }
        }
    else
        for( i = 0; i < size.width; i++, tdst += dststep )
        {
            if( !delta_buf )
                for( k = 0; k < size.height; k++ )
                    col_buf[k] = src[k*srcstep+i] - delta[k*deltastep+i];
            else
                for( k = 0; k < size.height; k++ )
                    col_buf[k] = src[k*srcstep+i] - delta_buf[k*deltastep];

            for( j = i; j <= size.width - 4; j += 4 )
            {
                double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
                const sT *tsrc = src + j;
                const dT *d = delta_buf ? delta_buf : delta + j;

                for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
                {
                    double a = col_buf[k];
                    s0 += a * (tsrc[0] - d[0]);
                    s1 += a * (tsrc[1] - d[1]);
                    s2 += a * (tsrc[2] - d[2]);
                    s3 += a * (tsrc[3] - d[3]);
                }

                tdst[j] = (dT)(s0*scale);
                tdst[j+1] = (dT)(s1*scale);
                tdst[j+2] = (dT)(s2*scale);
                tdst[j+3] = (dT)(s3*scale);
            }

            for( ; j < size.width; j++ )
            {
                double s0 = 0;
                const sT *tsrc = src + j;
                const dT *d = delta_buf ? delta_buf : delta + j;

                for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
                    s0 += (double)col_buf[k] * (tsrc[0] - d[0]);

                tdst[j] = (dT)(s0*scale);
            }
        }
}
template<typename sT, typename dT> static void
MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale )
{
    // computes dst = scale*(src - delta)*(src - delta)', i.e. the "A*A'" case
    int i, j, k;
    const sT* src = (const sT*)srcmat.data;
    dT* dst = (dT*)dstmat.data;
    const dT* delta = (const dT*)deltamat.data;
    size_t srcstep = srcmat.step/sizeof(src[0]);
    size_t dststep = dstmat.step/sizeof(dst[0]);
    size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0;
    int delta_cols = deltamat.cols;
    Size size = srcmat.size();
    dT* tdst = dst;

    if( !delta )
        for( i = 0; i < size.height; i++, tdst += dststep )
            for( j = i; j < size.height; j++ )
            {
                double s = 0;
                const sT *tsrc1 = src + i*srcstep;
                const sT *tsrc2 = src + j*srcstep;

                for( k = 0; k <= size.width - 4; k += 4 )
                    s += (double)tsrc1[k]*tsrc2[k] + (double)tsrc1[k+1]*tsrc2[k+1] +
                         (double)tsrc1[k+2]*tsrc2[k+2] + (double)tsrc1[k+3]*tsrc2[k+3];
                for( ; k < size.width; k++ )
                    s += (double)tsrc1[k] * tsrc2[k];
                tdst[j] = (dT)(s*scale);
            }
    else
    {
        dT delta_buf[4];
        int delta_shift = delta_cols == size.width ? 4 : 0;
        AutoBuffer<uchar> buf(size.width*sizeof(dT));
        dT* row_buf = (dT*)(uchar*)buf;

        for( i = 0; i < size.height; i++, tdst += dststep )
        {
            const sT *tsrc1 = src + i*srcstep;
            const dT *tdelta1 = delta + i*deltastep;

            // cache the centered i-th row
            if( delta_cols < size.width )
                for( k = 0; k < size.width; k++ )
                    row_buf[k] = tsrc1[k] - tdelta1[0];
            else
                for( k = 0; k < size.width; k++ )
                    row_buf[k] = tsrc1[k] - tdelta1[k];

            for( j = i; j < size.height; j++ )
            {
                double s = 0;
                const sT *tsrc2 = src + j*srcstep;
                const dT *tdelta2 = delta + j*deltastep;
                if( delta_cols < size.width )
                {
                    delta_buf[0] = delta_buf[1] =
                        delta_buf[2] = delta_buf[3] = tdelta2[0];
                    tdelta2 = delta_buf;
                }
                for( k = 0; k <= size.width-4; k += 4, tdelta2 += delta_shift )
                    s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]) +
                         (double)row_buf[k+1]*(tsrc2[k+1] - tdelta2[1]) +
                         (double)row_buf[k+2]*(tsrc2[k+2] - tdelta2[2]) +
                         (double)row_buf[k+3]*(tsrc2[k+3] - tdelta2[3]);
                for( ; k < size.width; k++, tdelta2++ )
                    s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]);
                tdst[j] = (dT)(s*scale);
            }
        }
    }
}
typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale);
void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata,
                        InputArray _delta, double scale, int dtype )
{
    Mat src = _src.getMat(), delta = _delta.getMat();
    const int gemm_level = 100; // boundary above which GEMM is faster.
    int stype = src.type();
    dtype = std::max(std::max(CV_MAT_DEPTH(dtype >= 0 ? dtype : stype), delta.depth()), CV_32F);
    CV_Assert( src.channels() == 1 );

    if( delta.data )
    {
        CV_Assert( delta.channels() == 1 &&
            (delta.rows == src.rows || delta.rows == 1) &&
            (delta.cols == src.cols || delta.cols == 1));
        if( delta.type() != dtype )
            delta.convertTo(delta, dtype);
    }

    int dsize = ata ? src.cols : src.rows;
    _dst.create( dsize, dsize, dtype );
    Mat dst = _dst.getMat();

    if( src.data == dst.data || (stype == dtype &&
        (dst.cols >= gemm_level && dst.rows >= gemm_level &&
         src.cols >= gemm_level && src.rows >= gemm_level)))
    {
        // large matrices: subtract delta explicitly and let GEMM do the product
        Mat src2;
        const Mat* tsrc = &src;
        if( delta.data )
        {
            if( delta.size() == src.size() )
                subtract( src, delta, src2 );
            else
            {
                repeat(delta, src.rows/delta.rows, src.cols/delta.cols, src2);
                subtract( src, src2, src2 );
            }
            tsrc = &src2;
        }
        gemm( *tsrc, *tsrc, scale, Mat(), 0, dst, ata ? GEMM_1_T : GEMM_2_T );
    }
    else
    {
        MulTransposedFunc func = 0;
        if(stype == CV_8U && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<uchar,float>;
            else
                func = MulTransposedL<uchar,float>;
        }
        else if(stype == CV_8U && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<uchar,double>;
            else
                func = MulTransposedL<uchar,double>;
        }
        else if(stype == CV_16U && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<ushort,float>;
            else
                func = MulTransposedL<ushort,float>;
        }
        else if(stype == CV_16U && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<ushort,double>;
            else
                func = MulTransposedL<ushort,double>;
        }
        else if(stype == CV_16S && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<short,float>;
            else
                func = MulTransposedL<short,float>;
        }
        else if(stype == CV_16S && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<short,double>;
            else
                func = MulTransposedL<short,double>;
        }
        else if(stype == CV_32F && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<float,float>;
            else
                func = MulTransposedL<float,float>;
        }
        else if(stype == CV_32F && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<float,double>;
            else
                func = MulTransposedL<float,double>;
        }
        else if(stype == CV_64F && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<double,double>;
            else
                func = MulTransposedL<double,double>;
        }
        else
            CV_Error( CV_StsUnsupportedFormat, "" );

        func( src, dst, delta, scale );
        completeSymm( dst, false );
    }
}
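/* Usage sketch (illustrative only): with ata == true the result is
   scale*(A - delta)'*(A - delta); an empty delta means no subtraction:

       cv::Mat A(50, 10, CV_32F), dst, mean;
       cv::randu(A, cv::Scalar(0), cv::Scalar(1));
       cv::mulTransposed(A, dst, true);                   // dst = A'*A, 10x10
       cv::reduce(A, mean, 0, CV_REDUCE_AVG);             // 1x10 row of column means
       cv::mulTransposed(A, dst, true, mean, 1./A.rows);  // scaled scatter about the mean
*/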
/****************************************************************************************\
*                                      Dot Product                                       *
\****************************************************************************************/
template<typename T> double
dotProd_(const T* src1, const T* src2, int len)
{
    int i = 0;
    double result = 0;
    #if CV_ENABLE_UNROLLED
    for( ; i <= len - 4; i += 4 )
        result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] +
            (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
    #endif
    for( ; i < len; i++ )
        result += (double)src1[i]*src2[i];

    return result;
}
static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
{
    double r = 0;
#if ARITHM_USE_IPP
    ippiDotProd_8u64f_C1R(src1, (int)(len*sizeof(src1[0])),
                          src2, (int)(len*sizeof(src2[0])),
                          ippiSize(len, 1), &r);
    return r;
#else
    int i = 0;

#if CV_SSE2
    if( USE_SSE2 )
    {
        // process the data in blocks small enough to keep the 32-bit
        // intermediate sums from overflowing
        int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize;
        __m128i z = _mm_setzero_si128();
        while( i < len0 )
        {
            blockSize = std::min(len0 - i, blockSize0);
            __m128i s = _mm_setzero_si128();
            j = 0;
            for( ; j <= blockSize - 16; j += 16 )
            {
                __m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j));
                __m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j));
                __m128i s0, s1, s2, s3;
                // widen the 8-bit values to 16 bits, then multiply-add pairs
                s0 = _mm_unpacklo_epi8(b0, z);
                s2 = _mm_unpackhi_epi8(b0, z);
                s1 = _mm_unpacklo_epi8(b1, z);
                s3 = _mm_unpackhi_epi8(b1, z);
                s0 = _mm_madd_epi16(s0, s1);
                s2 = _mm_madd_epi16(s2, s3);
                s = _mm_add_epi32(s, s0);
                s = _mm_add_epi32(s, s2);
            }

            for( ; j < blockSize; j += 4 )
            {
                __m128i s0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + j)), z);
                __m128i s1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src2 + j)), z);
                s0 = _mm_madd_epi16(s0, s1);
                s = _mm_add_epi32(s, s0);
            }

            CV_DECL_ALIGNED(16) int buf[4];
            _mm_store_si128((__m128i*)buf, s);
            r += buf[0] + buf[1] + buf[2] + buf[3];

            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
    }
#endif
    return r + dotProd_(src1, src2, len - i);
#endif
}
static double dotProd_8s(const schar* src1, const schar* src2, int len)
{
    return dotProd_(src1, src2, len);
}

static double dotProd_16u(const ushort* src1, const ushort* src2, int len)
{
    double r = 0;
    IF_IPP(ippiDotProd_16u64f_C1R(src1, (int)(len*sizeof(src1[0])),
                                  src2, (int)(len*sizeof(src2[0])),
                                  ippiSize(len, 1), &r),
           r = dotProd_(src1, src2, len));
    return r;
}

static double dotProd_16s(const short* src1, const short* src2, int len)
{
    double r = 0;
    IF_IPP(ippiDotProd_16s64f_C1R(src1, (int)(len*sizeof(src1[0])),
                                  src2, (int)(len*sizeof(src2[0])),
                                  ippiSize(len, 1), &r),
           r = dotProd_(src1, src2, len));
    return r;
}

static double dotProd_32s(const int* src1, const int* src2, int len)
{
    double r = 0;
    IF_IPP(ippiDotProd_32s64f_C1R(src1, (int)(len*sizeof(src1[0])),
                                  src2, (int)(len*sizeof(src2[0])),
                                  ippiSize(len, 1), &r),
           r = dotProd_(src1, src2, len));
    return r;
}

static double dotProd_32f(const float* src1, const float* src2, int len)
{
    double r = 0;
    IF_IPP(ippsDotProd_32f64f(src1, src2, len, &r),
           r = dotProd_(src1, src2, len));
    return r;
}

static double dotProd_64f(const double* src1, const double* src2, int len)
{
    double r = 0;
    IF_IPP(ippsDotProd_64f(src1, src2, len, &r),
           r = dotProd_(src1, src2, len));
    return r;
}
typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);

static DotProdFunc dotProdTab[] =
{
    (DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s),
    (DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
    (DotProdFunc)dotProd_32s, (DotProdFunc)GET_OPTIMIZED(dotProd_32f),
    (DotProdFunc)dotProd_64f, 0
};
double Mat::dot(InputArray _mat) const
{
    Mat mat = _mat.getMat();
    int cn = channels();
    DotProdFunc func = dotProdTab[depth()];
    CV_Assert( mat.type() == type() && mat.size == size && func != 0 );

    if( isContinuous() && mat.isContinuous() )
    {
        size_t len = total()*cn;
        if( len == (size_t)(int)len )
            return func(data, mat.data, (int)len);
    }

    // non-continuous case: accumulate plane by plane
    const Mat* arrays[] = {this, &mat, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    int len = (int)(it.size*cn);
    double r = 0;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        r += func( ptrs[0], ptrs[1], len );

    return r;
}
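/* Usage sketch (illustrative only): dot() sums over every element and channel,
   so the operands only need equal type and total size:

       cv::Mat a = (cv::Mat_<float>(1,3) << 1, 2, 3);
       cv::Mat b = (cv::Mat_<float>(1,3) << 4, 5, 6);
       double d = a.dot(b);   // 1*4 + 2*5 + 3*6 = 32
*/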
/****************************************************************************************\
*                                          PCA                                           *
\****************************************************************************************/
PCA::PCA() {}

PCA::PCA(InputArray data, InputArray _mean, int flags, int maxComponents)
{
    operator()(data, _mean, flags, maxComponents);
}

PCA::PCA(InputArray data, InputArray _mean, int flags, double retainedVariance)
{
    operator()(data, _mean, flags, retainedVariance);
}
PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComponents)
{
    Mat data = _data.getMat(), _mean = __mean.getMat();
    int covar_flags = CV_COVAR_SCALE;
    int i, len, in_count;
    Size mean_sz;

    CV_Assert( data.channels() == 1 );
    if( flags & CV_PCA_DATA_AS_COL )
    {
        len = data.rows;
        in_count = data.cols;
        covar_flags |= CV_COVAR_COLS;
        mean_sz = Size(1, len);
    }
    else
    {
        len = data.cols;
        in_count = data.rows;
        covar_flags |= CV_COVAR_ROWS;
        mean_sz = Size(len, 1);
    }

    int count = std::min(len, in_count), out_count = count;
    if( maxComponents > 0 )
        out_count = std::min(count, maxComponents);

    // "scrambled" way to compute PCA (when cols(A)>rows(A)):
    // B = A'A; B*x=b*x; C = AA'; C*y=c*y -> AA'*y=c*y -> A'A*(A'*y)=c*(A'*y) -> c = b, x=A'*y
    if( len <= in_count )
        covar_flags |= CV_COVAR_NORMAL;

    int ctype = std::max(CV_32F, data.depth());
    mean.create( mean_sz, ctype );

    Mat covar( count, count, ctype );

    if( _mean.data )
    {
        CV_Assert( _mean.size() == mean_sz );
        _mean.convertTo(mean, ctype);
    }

    calcCovarMatrix( data, covar, mean, covar_flags, ctype );
    eigen( covar, eigenvalues, eigenvectors );

    if( !(covar_flags & CV_COVAR_NORMAL) )
    {
        // CV_PCA_DATA_AS_ROW: cols(A)>rows(A). x=A'*y -> x'=y'*A
        // CV_PCA_DATA_AS_COL: rows(A)>cols(A). x=A''*y -> x'=y'*A'
        Mat tmp_data, tmp_mean = repeat(mean, data.rows/mean.rows, data.cols/mean.cols);
        if( data.type() != ctype || tmp_mean.data == mean.data )
        {
            data.convertTo( tmp_data, ctype );
            subtract( tmp_data, tmp_mean, tmp_data );
        }
        else
        {
            // the operands can be centered in place
            subtract( data, tmp_mean, tmp_mean );
            tmp_data = tmp_mean;
        }

        Mat evects1(count, len, ctype);
        gemm( eigenvectors, tmp_data, 1, Mat(), 0, evects1,
            (flags & CV_PCA_DATA_AS_COL) ? CV_GEMM_B_T : 0);
        eigenvectors = evects1;

        // normalize eigenvectors
        for( i = 0; i < out_count; i++ )
        {
            Mat vec = eigenvectors.row(i);
            normalize(vec, vec);
        }
    }

    if( count > out_count )
    {
        // use clone() to physically copy the data and thus deallocate the original matrices
        eigenvalues = eigenvalues.rowRange(0,out_count).clone();
        eigenvectors = eigenvectors.rowRange(0,out_count).clone();
    }
    return *this;
}
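/* Usage sketch (illustrative only), assuming a hypothetical N x D CV_32F
   `samples` matrix with one observation per row:

       cv::PCA pca(samples, cv::Mat(), CV_PCA_DATA_AS_ROW, 3); // keep 3 components
       cv::Mat coeffs = pca.project(samples);                  // N x 3 coefficients
       cv::Mat approx = pca.backProject(coeffs);               // N x D reconstruction
*/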
PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, double retainedVariance)
{
    Mat data = _data.getMat(), _mean = __mean.getMat();
    int covar_flags = CV_COVAR_SCALE;
    int i, len, in_count;
    Size mean_sz;

    CV_Assert( data.channels() == 1 );
    if( flags & CV_PCA_DATA_AS_COL )
    {
        len = data.rows;
        in_count = data.cols;
        covar_flags |= CV_COVAR_COLS;
        mean_sz = Size(1, len);
    }
    else
    {
        len = data.cols;
        in_count = data.rows;
        covar_flags |= CV_COVAR_ROWS;
        mean_sz = Size(len, 1);
    }

    CV_Assert( retainedVariance > 0 && retainedVariance <= 1 );

    int count = std::min(len, in_count);

    // "scrambled" way to compute PCA (when cols(A)>rows(A)):
    // B = A'A; B*x=b*x; C = AA'; C*y=c*y -> AA'*y=c*y -> A'A*(A'*y)=c*(A'*y) -> c = b, x=A'*y
    if( len <= in_count )
        covar_flags |= CV_COVAR_NORMAL;

    int ctype = std::max(CV_32F, data.depth());
    mean.create( mean_sz, ctype );

    Mat covar( count, count, ctype );

    if( _mean.data )
    {
        CV_Assert( _mean.size() == mean_sz );
        _mean.convertTo(mean, ctype);
    }

    calcCovarMatrix( data, covar, mean, covar_flags, ctype );
    eigen( covar, eigenvalues, eigenvectors );

    if( !(covar_flags & CV_COVAR_NORMAL) )
    {
        // CV_PCA_DATA_AS_ROW: cols(A)>rows(A). x=A'*y -> x'=y'*A
        // CV_PCA_DATA_AS_COL: rows(A)>cols(A). x=A''*y -> x'=y'*A'
        Mat tmp_data, tmp_mean = repeat(mean, data.rows/mean.rows, data.cols/mean.cols);
        if( data.type() != ctype || tmp_mean.data == mean.data )
        {
            data.convertTo( tmp_data, ctype );
            subtract( tmp_data, tmp_mean, tmp_data );
        }
        else
        {
            subtract( data, tmp_mean, tmp_mean );
            tmp_data = tmp_mean;
        }

        Mat evects1(count, len, ctype);
        gemm( eigenvectors, tmp_data, 1, Mat(), 0, evects1,
            (flags & CV_PCA_DATA_AS_COL) ? CV_GEMM_B_T : 0);
        eigenvectors = evects1;

        // normalize all eigenvectors
        for( i = 0; i < eigenvectors.rows; i++ )
        {
            Mat vec = eigenvectors.row(i);
            normalize(vec, vec);
        }
    }

    // compute the cumulative energy content for each eigenvector
    // (accumulate in double so that CV_32F and CV_64F eigenvalues are both handled)
    std::vector<double> g(eigenvalues.rows);
    double acc = 0;
    for(int ig = 0; ig < eigenvalues.rows; ig++)
    {
        acc += eigenvalues.depth() == CV_64F ?
            eigenvalues.at<double>(ig, 0) : (double)eigenvalues.at<float>(ig, 0);
        g[ig] = acc;
    }

    int L;
    for(L = 0; L < eigenvalues.rows; L++)
    {
        double energy = g[L] / g[eigenvalues.rows - 1];
        if(energy > retainedVariance)
            break;
    }

    // always retain at least two components
    L = std::max(2, L);

    // use clone() to physically copy the data and thus deallocate the original matrices
    eigenvalues = eigenvalues.rowRange(0,L).clone();
    eigenvectors = eigenvectors.rowRange(0,L).clone();

    return *this;
}
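/* Usage sketch (illustrative only): the double overload selects however many
   leading components are needed to keep the requested fraction of the variance:

       cv::PCA pca(samples, cv::Mat(), CV_PCA_DATA_AS_ROW, 0.95);
       int kept = pca.eigenvectors.rows; // number of retained components (>= 2)
*/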
void PCA::project(InputArray _data, OutputArray result) const
{
    Mat data = _data.getMat();
    CV_Assert( mean.data && eigenvectors.data &&
        ((mean.rows == 1 && mean.cols == data.cols) || (mean.cols == 1 && mean.rows == data.rows)));
    Mat tmp_data, tmp_mean = repeat(mean, data.rows/mean.rows, data.cols/mean.cols);
    int ctype = mean.type();
    if( data.type() != ctype || tmp_mean.data == mean.data )
    {
        data.convertTo( tmp_data, ctype );
        subtract( tmp_data, tmp_mean, tmp_data );
    }
    else
    {
        subtract( data, tmp_mean, tmp_mean );
        tmp_data = tmp_mean;
    }
    if( mean.rows == 1 )
        gemm( tmp_data, eigenvectors, 1, Mat(), 0, result, GEMM_2_T );
    else
        gemm( eigenvectors, tmp_data, 1, Mat(), 0, result, 0 );
}
Mat PCA::project(InputArray data) const
{
    Mat result;
    project(data, result);
    return result;
}
void PCA::backProject(InputArray _data, OutputArray result) const
{
    Mat data = _data.getMat();
    CV_Assert( mean.data && eigenvectors.data &&
        ((mean.rows == 1 && eigenvectors.rows == data.cols) ||
         (mean.cols == 1 && eigenvectors.rows == data.rows)));

    Mat tmp_data, tmp_mean;
    data.convertTo(tmp_data, mean.type());
    if( mean.rows == 1 )
    {
        tmp_mean = repeat(mean, data.rows, 1);
        gemm( tmp_data, eigenvectors, 1, tmp_mean, 1, result, 0 );
    }
    else
    {
        tmp_mean = repeat(mean, 1, data.cols);
        gemm( eigenvectors, tmp_data, 1, tmp_mean, 1, result, GEMM_1_T );
    }
}

Mat PCA::backProject(InputArray data) const
{
    Mat result;
    backProject(data, result);
    return result;
}
void cv::PCACompute(InputArray data, InputOutputArray mean,
                    OutputArray eigenvectors, int maxComponents)
{
    PCA pca;
    pca(data, mean, 0, maxComponents);
    pca.mean.copyTo(mean);
    pca.eigenvectors.copyTo(eigenvectors);
}

void cv::PCACompute(InputArray data, InputOutputArray mean,
                    OutputArray eigenvectors, double retainedVariance)
{
    PCA pca;
    pca(data, mean, 0, retainedVariance);
    pca.mean.copyTo(mean);
    pca.eigenvectors.copyTo(eigenvectors);
}

void cv::PCAProject(InputArray data, InputArray mean,
                    InputArray eigenvectors, OutputArray result)
{
    PCA pca;
    pca.mean = mean.getMat();
    pca.eigenvectors = eigenvectors.getMat();
    pca.project(data, result);
}

void cv::PCABackProject(InputArray data, InputArray mean,
                        InputArray eigenvectors, OutputArray result)
{
    PCA pca;
    pca.mean = mean.getMat();
    pca.eigenvectors = eigenvectors.getMat();
    pca.backProject(data, result);
}
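/* Usage sketch (illustrative only) for the functional wrappers above, again
   assuming a hypothetical row-per-sample `samples` matrix:

       cv::Mat mean, evecs, coeffs, approx;
       cv::PCACompute(samples, mean, evecs, 3);
       cv::PCAProject(samples, mean, evecs, coeffs);
       cv::PCABackProject(coeffs, mean, evecs, approx);
*/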
/****************************************************************************************\
*                                      Earlier API                                       *
\****************************************************************************************/
CV_IMPL void cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha,
                     const CvArr* Carr, double beta, CvArr* Darr, int flags )
{
    cv::Mat A = cv::cvarrToMat(Aarr), B = cv::cvarrToMat(Barr);
    cv::Mat C, D = cv::cvarrToMat(Darr);

    if( Carr )
        C = cv::cvarrToMat(Carr);

    CV_Assert( (D.rows == ((flags & CV_GEMM_A_T) == 0 ? A.rows : A.cols)) &&
               (D.cols == ((flags & CV_GEMM_B_T) == 0 ? B.cols : B.rows)) &&
               D.type() == A.type() );

    gemm( A, B, alpha, C, beta, D, flags );
}
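/* Usage sketch (illustrative only) for the C interface; D = alpha*A*B + beta*C,
   with optional transposition flags:

       CvMat* A = cvCreateMat(3, 4, CV_32FC1);
       CvMat* B = cvCreateMat(4, 2, CV_32FC1);
       CvMat* D = cvCreateMat(3, 2, CV_32FC1);
       // ... fill A and B ...
       cvGEMM(A, B, 1.0, 0, 0.0, D, 0); // D = A*B
*/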
CV_IMPL void
cvTransform( const CvArr* srcarr, CvArr* dstarr,
             const CvMat* transmat, const CvMat* shiftvec )
{
    cv::Mat m = cv::cvarrToMat(transmat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    if( shiftvec )
    {
        // append the shift vector as an extra column of the transformation matrix
        cv::Mat v = cv::cvarrToMat(shiftvec).reshape(1,m.rows),
            _m(m.rows, m.cols + 1, m.type()), m1 = _m.colRange(0,m.cols), v1 = _m.col(m.cols);
        m.convertTo(m1, m1.type());
        v.convertTo(v1, v1.type());
        m = _m;
    }

    CV_Assert( dst.depth() == src.depth() && dst.channels() == m.rows );
    cv::transform( src, dst, m );
}
CV_IMPL void
cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat )
{
    cv::Mat m = cv::cvarrToMat(mat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert( dst.type() == src.type() && dst.channels() == m.rows-1 );
    cv::perspectiveTransform( src, dst, m );
}
CV_IMPL void cvScaleAdd( const CvArr* srcarr1, CvScalar scale,
                         const CvArr* srcarr2, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    cv::scaleAdd( src1, scale.val[0], cv::cvarrToMat(srcarr2), dst );
}
CV_IMPL void
cvCalcCovarMatrix( const CvArr** vecarr, int count,
                   CvArr* covarr, CvArr* avgarr, int flags )
{
    cv::Mat cov0 = cv::cvarrToMat(covarr), cov = cov0, mean0, mean;
    CV_Assert( vecarr != 0 && count >= 1 );

    if( avgarr )
        mean = mean0 = cv::cvarrToMat(avgarr);

    if( (flags & CV_COVAR_COLS) != 0 || (flags & CV_COVAR_ROWS) != 0 )
    {
        CV_Assert( count == 1 );
        cv::Mat data = cv::cvarrToMat(vecarr[0]);
        cv::calcCovarMatrix( data, cov, mean, flags, cov.type() );
    }
    else
    {
        std::vector<cv::Mat> data(count);
        for( int i = 0; i < count; i++ )
            data[i] = cv::cvarrToMat(vecarr[i]);
        cv::calcCovarMatrix( &data[0], count, cov, mean, flags, cov.type() );
    }

    if( mean.data != mean0.data && mean0.data )
        mean.convertTo(mean0, mean0.type());

    if( cov.data != cov0.data )
        cov.convertTo(cov0, cov0.type());
}
CV_IMPL double
cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, const CvArr* matarr )
{
    return cv::Mahalanobis(cv::cvarrToMat(srcAarr),
                           cv::cvarrToMat(srcBarr), cv::cvarrToMat(matarr));
}
CV_IMPL void
cvMulTransposed( const CvArr* srcarr, CvArr* dstarr,
                 int order, const CvArr* deltaarr, double scale )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0, delta;
    if( deltaarr )
        delta = cv::cvarrToMat(deltaarr);
    cv::mulTransposed( src, dst, order != 0, delta, scale, dst.type());
    if( dst.data != dst0.data )
        dst.convertTo(dst0, dst0.type());
}
CV_IMPL double cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr )
{
    return cv::cvarrToMat(srcAarr).dot(cv::cvarrToMat(srcBarr));
}
CV_IMPL void
cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigenvects, int flags )
{
    cv::Mat data = cv::cvarrToMat(data_arr), mean0 = cv::cvarrToMat(avg_arr);
    cv::Mat evals0 = cv::cvarrToMat(eigenvals), evects0 = cv::cvarrToMat(eigenvects);
    cv::Mat mean = mean0, evals = evals0, evects = evects0;

    cv::PCA pca;
    pca.mean = mean;
    pca.eigenvalues = evals;
    pca.eigenvectors = evects;

    pca(data, (flags & CV_PCA_USE_AVG) ? mean : cv::Mat(),
        flags, evals.data ? evals.rows + evals.cols - 1 : 0);

    if( pca.mean.size() == mean.size() )
        pca.mean.convertTo( mean, mean.type() );
    else
    {
        cv::Mat temp; pca.mean.convertTo( temp, mean.type() );
        transpose( temp, mean );
    }

    evals = pca.eigenvalues;
    evects = pca.eigenvectors;
    int ecount0 = evals0.cols + evals0.rows - 1;
    int ecount = evals.cols + evals.rows - 1;

    CV_Assert( (evals0.cols == 1 || evals0.rows == 1) &&
               ecount0 <= ecount &&
               evects0.cols == evects.cols &&
               evects0.rows == ecount0 );

    cv::Mat temp = evals0;
    if( evals.rows == 1 )
        evals.colRange(0, ecount0).convertTo(temp, evals0.type());
    else
        evals.rowRange(0, ecount0).convertTo(temp, evals0.type());
    if( temp.data != evals0.data )
        transpose(temp, evals0);
    evects.rowRange(0, ecount0).convertTo( evects0, evects0.type() );

    // otherwise some datatypes or sizes were incorrect, so the output arrays have been reallocated
    CV_Assert( mean0.data == mean.data );
}
CV_IMPL void
cvProjectPCA( const CvArr* data_arr, const CvArr* avg_arr,
              const CvArr* eigenvects, CvArr* result_arr )
{
    cv::Mat data = cv::cvarrToMat(data_arr), mean = cv::cvarrToMat(avg_arr);
    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;

    cv::PCA pca;
    pca.mean = mean;
    int n;
    if( mean.rows == 1 )
    {
        CV_Assert(dst.cols <= evects.rows && dst.rows == data.rows);
        n = dst.cols;
    }
    else
    {
        CV_Assert(dst.rows <= evects.rows && dst.cols == data.cols);
        n = dst.rows;
    }
    pca.eigenvectors = evects.rowRange(0, n);

    cv::Mat result = pca.project(data);
    if( result.cols != dst.cols )
        result = result.reshape(1, 1);
    result.convertTo(dst, dst.type());

    CV_Assert(dst0.data == dst.data);
}
CV_IMPL void
cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr,
                  const CvArr* eigenvects, CvArr* result_arr )
{
    cv::Mat data = cv::cvarrToMat(proj_arr), mean = cv::cvarrToMat(avg_arr);
    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;

    cv::PCA pca;
    pca.mean = mean;
    int n;
    if( mean.rows == 1 )
    {
        CV_Assert(data.cols <= evects.rows && dst.rows == data.rows);
        n = data.cols;
    }
    else
    {
        CV_Assert(data.rows <= evects.rows && dst.cols == data.cols);
        n = data.rows;
    }
    pca.eigenvectors = evects.rowRange(0, n);

    cv::Mat result = pca.backProject(data);
    result.convertTo(dst, dst.type());

    CV_Assert(dst0.data == dst.data);
}