From 008abd28fd27742d574372acdb5e6839ffb5316e Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Wed, 13 Jan 2016 17:23:57 +0300
Subject: [PATCH] Extracted HAL interfaces for DFT/DCT, added new test

---
 modules/core/include/opencv2/core/hal/hal.hpp     |   19 +
 modules/core/include/opencv2/core/hal/interface.h |   10 +
 modules/core/src/dxt.cpp                          | 1853 ++++++++++++++-------
 modules/core/src/hal_replacement.hpp              |   25 +
 modules/core/test/test_dxt.cpp                    |   76 +
 modules/imgproc/src/templmatch.cpp                |   36 +-
 6 files changed, 1370 insertions(+), 649 deletions(-)
diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 64af09a..52a5f99 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -187,6 +187,25 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
 CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
 CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
 
+struct DftContext
+{
+    void * impl;
+    bool useReplacement;
+    DftContext() : impl(0), useReplacement(false) {}
+};
+
+CV_EXPORTS void dftInit2D(DftContext & c, int _width, int _height, int _depth, int _src_channels, int _dst_channels, int flags, int _nonzero_rows = 0);
+CV_EXPORTS void dftRun2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
+CV_EXPORTS void dftFree2D(DftContext & c);
+
+CV_EXPORTS void dftInit(DftContext & c, int len, int count, int depth, int flags, bool * useBuffer = 0);
+CV_EXPORTS void dftRun(const DftContext & c, const void * src, void * dst);
+CV_EXPORTS void dftFree(DftContext & c);
+
+CV_EXPORTS void dctInit(DftContext & c, int width, int height, int depth, int flags);
+CV_EXPORTS void dctRun(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
+CV_EXPORTS void dctFree(DftContext & c);
+
 //! @} core_hal
 
 //=============================================================================
diff --git a/modules/core/include/opencv2/core/hal/interface.h b/modules/core/include/opencv2/core/hal/interface.h
index ba3f55c..0da68f1 100644
--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@@ -12,6 +12,16 @@
 //! @}
 
 
+#define CV_HAL_DFT_INVERSE        1
+#define CV_HAL_DFT_SCALE          2
+#define CV_HAL_DFT_ROWS           4
+#define CV_HAL_DFT_COMPLEX_OUTPUT 16
+#define CV_HAL_DFT_REAL_OUTPUT    32
+#define CV_HAL_DFT_TWO_STAGE      64
+#define CV_HAL_DFT_STAGE_COLS    128
+#define CV_HAL_DFT_IS_CONTINUOUS 512
+#define CV_HAL_DFT_IS_INPLACE 1024
+
 #ifdef __cplusplus
 #include <cstddef>
 #else
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index 691b297..1265091 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -173,7 +173,7 @@ DFTFactorize( int n, int* factors )
 }
 
 static void
-DFTInit( int n0, int nf, int* factors, int* itab, int elem_size, void* _wave, int inv_itab )
+DFTInit( int n0, int nf, const int* factors, int* itab, int elem_size, void* _wave, int inv_itab )
 {
     int digits[34], radix[34];
     int n = factors[0], m = 0;
@@ -519,19 +519,59 @@ static IppStatus ippsDFTInv_PackToR( const double* src, double* dst,
 }
 #endif
 
-enum { DFT_NO_PERMUTE=256, DFT_COMPLEX_INPUT_OR_OUTPUT=512 };
+struct OcvDftOptions;
+
+typedef void (*DFTFunc)(const OcvDftOptions & c, const void* src, void* dst);
+
+struct OcvDftOptions {
+    int nf;
+    int *factors;
+    double scale;
+
+    int* itab;
+    void* wave;
+    int tab_size;
+    int n;
+
+    bool isInverse;
+    bool noPermute;
+    bool isComplex;
+
+    bool haveSSE3;
+
+    DFTFunc dft_func;
+    bool useIpp;
 
-// mixed-radix complex discrete Fourier transform: double-precision version
-template<typename T> static void
-DFT( const Complex<T>* src, Complex<T>* dst, int n,
-     int nf, const int* factors, const int* itab,
-     const Complex<T>* wave, int tab_size,
-     const void*
 #ifdef USE_IPP_DFT
-     spec
+    uchar* ipp_spec;
+    uchar* ipp_work;
 #endif
-     , Complex<T>* buf,
-     int flags, double _scale )
+
+    OcvDftOptions()
+    {
+        nf = 0;
+        factors = 0;
+        scale = 0;
+        itab = 0;
+        wave = 0;
+        tab_size = 0;
+        n = 0;
+        isInverse = false;
+        noPermute = false;
+        isComplex = false;
+        useIpp = false;
+#ifdef USE_IPP_DFT
+        ipp_spec = 0;
+        ipp_work = 0;
+#endif
+        dft_func = 0;
+        haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
+    }
+};
+
+// mixed-radix complex discrete Fourier transform: double-precision version
+template<typename T> static void
+DFT(const OcvDftOptions & c, const Complex<T>* src, Complex<T>* dst)
 {
     static const T sin_120 = (T)0.86602540378443864676372317075294;
     static const T fft5_2 = (T)0.559016994374947424102293417182819;
@@ -539,20 +579,23 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
     static const T fft5_4 = (T)-1.538841768587626701285145288018455;
     static const T fft5_5 = (T)0.363271264002680442947733378740309;
 
-    int n0 = n, f_idx, nx;
-    int inv = flags & DFT_INVERSE;
-    int dw0 = tab_size, dw;
+    const Complex<T>* wave = (Complex<T>*)c.wave;
+    const int * itab = c.itab;
+
+    int n = c.n;
+    int f_idx, nx;
+    int inv = c.isInverse;
+    int dw0 = c.tab_size, dw;
     int i, j, k;
     Complex<T> t;
-    T scale = (T)_scale;
-    int tab_step;
+    T scale = (T)c.scale;
 
-#ifdef USE_IPP_DFT
-    if( spec )
+    if( c.useIpp )
     {
+#ifdef USE_IPP_DFT
         if( !inv )
         {
-            if (ippsDFTFwd_CToC( src, dst, spec, (uchar*)buf ) >= 0)
+            if (ippsDFTFwd_CToC( src, dst, c.ipp_spec, c.ipp_work ) >= 0)
             {
                 CV_IMPL_ADD(CV_IMPL_IPP);
                 return;
@@ -560,22 +603,22 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         }
         else
         {
-            if (ippsDFTInv_CToC( src, dst, spec, (uchar*)buf ) >= 0)
+            if (ippsDFTInv_CToC( src, dst, c.ipp_spec, c.ipp_work ) >= 0)
             {
                 CV_IMPL_ADD(CV_IMPL_IPP);
                 return;
             }
         }
         setIppErrorStatus();
-    }
 #endif
+    }
 
-    tab_step = tab_size == n ? 1 : tab_size == n*2 ? 2 : tab_size/n;
+    int tab_step = c.tab_size == n ? 1 : c.tab_size == n*2 ? 2 : c.tab_size/n;
 
     // 0. shuffle data
     if( dst != src )
     {
-        assert( (flags & DFT_NO_PERMUTE) == 0 );
+        assert( !c.noPermute );
         if( !inv )
         {
             for( i = 0; i <= n - 2; i += 2, itab += 2*tab_step )
@@ -609,10 +652,10 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
     }
     else
     {
-        if( (flags & DFT_NO_PERMUTE) == 0 )
+        if( !c.noPermute )
         {
-            CV_Assert( factors[0] == factors[nf-1] );
-            if( nf == 1 )
+            CV_Assert( c.factors[0] == c.factors[c.nf-1] );
+            if( c.nf == 1 )
             {
                 if( (n & 3) == 0 )
                 {
@@ -662,22 +705,22 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
 
     n = 1;
     // 1. power-2 transforms
-    if( (factors[0] & 1) == 0 )
+    if( (c.factors[0] & 1) == 0 )
     {
-        if( factors[0] >= 4 && checkHardwareSupport(CV_CPU_SSE3))
+        if( c.factors[0] >= 4 && c.haveSSE3)
         {
             DFT_VecR4<T> vr4;
-            n = vr4(dst, factors[0], n0, dw0, wave);
+            n = vr4(dst, c.factors[0], c.n, dw0, wave);
         }
 
         // radix-4 transform
-        for( ; n*4 <= factors[0]; )
+        for( ; n*4 <= c.factors[0]; )
         {
             nx = n;
             n *= 4;
             dw0 /= 4;
 
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 Complex<T> *v0, *v1;
                 T r0, i0, r1, i1, r2, i2, r3, i3, r4, i4;
@@ -729,14 +772,14 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
             }
         }
 
-        for( ; n < factors[0]; )
+        for( ; n < c.factors[0]; )
         {
             // do the remaining radix-2 transform
             nx = n;
             n *= 2;
             dw0 /= 2;
 
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 Complex<T>* v = dst + i;
                 T r0 = v[0].re + v[nx].re;
@@ -761,9 +804,9 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
     }
 
     // 2. all the other transforms
-    for( f_idx = (factors[0]&1) ? 0 : 1; f_idx < nf; f_idx++ )
+    for( f_idx = (c.factors[0]&1) ? 0 : 1; f_idx < c.nf; f_idx++ )
     {
-        int factor = factors[f_idx];
+        int factor = c.factors[f_idx];
         nx = n;
         n *= factor;
         dw0 /= factor;
@@ -771,7 +814,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         if( factor == 3 )
         {
             // radix-3
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 Complex<T>* v = dst + i;
 
@@ -807,7 +850,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         else if( factor == 5 )
         {
             // radix-5
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 for( j = 0, dw = 0; j < nx; j++, dw += dw0 )
                 {
@@ -863,11 +906,12 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         {
             // radix-"factor" - an odd number
             int p, q, factor2 = (factor - 1)/2;
-            int d, dd, dw_f = tab_size/factor;
+            int d, dd, dw_f = c.tab_size/factor;
+            AutoBuffer<Complex<T> > buf(factor2 * 2);
             Complex<T>* a = buf;
-            Complex<T>* b = buf + factor2;
+            Complex<T>* b = a + factor2;
 
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 for( j = 0, dw = 0; j < nx; j++, dw += dw0 )
                 {
@@ -931,7 +975,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
                             s1.im += r1 - i1; s0.im += r1 + i1;
 
                             d += dd;
-                            d -= -(d >= tab_size) & tab_size;
+                            d -= -(d >= c.tab_size) & c.tab_size;
                         }
 
                         v[k] = s0;
@@ -948,7 +992,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         if( inv )
             im_scale = -im_scale;
 
-        for( i = 0; i < n0; i++ )
+        for( i = 0; i < c.n; i++ )
         {
             T t0 = dst[i].re*re_scale;
             T t1 = dst[i].im*im_scale;
@@ -958,7 +1002,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
     }
     else if( inv )
     {
-        for( i = 0; i <= n0 - 2; i += 2 )
+        for( i = 0; i <= c.n - 2; i += 2 )
         {
             T t0 = -dst[i].im;
             T t1 = -dst[i+1].im;
@@ -966,8 +1010,8 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
             dst[i+1].im = t1;
         }
 
-        if( i < n0 )
-            dst[n0-1].im = -dst[n0-1].im;
+        if( i < c.n )
+            dst[c.n-1].im = -dst[c.n-1].im;
     }
 }
 
@@ -977,23 +1021,18 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
      re(0), re(1), im(1), ... , re(n/2-1), im((n+1)/2-1) [, re((n+1)/2)] OR ...
      re(0), 0, re(1), im(1), ..., re(n/2-1), im((n+1)/2-1) [, re((n+1)/2), 0] */
 template<typename T> static void
-RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
-         const Complex<T>* wave, int tab_size, const void*
-#ifdef USE_IPP_DFT
-         spec
-#endif
-         ,
-         Complex<T>* buf, int flags, double _scale )
+RealDFT(const OcvDftOptions & c, const T* src, T* dst)
 {
-    int complex_output = (flags & DFT_COMPLEX_INPUT_OR_OUTPUT) != 0;
-    T scale = (T)_scale;
-    int j, n2 = n >> 1;
+    int n = c.n;
+    int complex_output = c.isComplex;
+    T scale = (T)c.scale;
+    int j;
     dst += complex_output;
 
-#ifdef USE_IPP_DFT
-    if( spec )
+    if( c.useIpp )
     {
-        if (ippsDFTFwd_RToPack( src, dst, spec, (uchar*)buf ) >=0)
+#ifdef USE_IPP_DFT
+        if (ippsDFTFwd_RToPack( src, dst, c.ipp_spec, c.ipp_work ) >=0)
         {
             if( complex_output )
             {
@@ -1006,9 +1045,9 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
             return;
         }
         setIppErrorStatus();
-    }
 #endif
-    assert( tab_size == n );
+    }
+    assert( c.tab_size == n );
 
     if( n == 1 )
     {
@@ -1028,15 +1067,19 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         _dst[0].im = 0;
         for( j = 1; j < n; j += 2 )
         {
-            T t0 = src[itab[j]]*scale;
-            T t1 = src[itab[j+1]]*scale;
+            T t0 = src[c.itab[j]]*scale;
+            T t1 = src[c.itab[j+1]]*scale;
             _dst[j].re = t0;
             _dst[j].im = 0;
             _dst[j+1].re = t1;
             _dst[j+1].im = 0;
         }
-        DFT( _dst, _dst, n, nf, factors, itab, wave,
-             tab_size, 0, buf, DFT_NO_PERMUTE, 1 );
+        OcvDftOptions sub_c = c;
+        sub_c.isComplex = false;
+        sub_c.isInverse = false;
+        sub_c.noPermute = true;
+        sub_c.scale = 1.;
+        DFT(sub_c, _dst, _dst);
         if( !complex_output )
             dst[1] = dst[0];
     }
@@ -1045,12 +1088,22 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         T t0, t;
         T h1_re, h1_im, h2_re, h2_im;
         T scale2 = scale*(T)0.5;
-        factors[0] >>= 1;
+        int n2 = n >> 1;
+
+        c.factors[0] >>= 1;
+
+        OcvDftOptions sub_c = c;
+        sub_c.factors += (c.factors[0] == 1);
+        sub_c.nf -= (c.factors[0] == 1);
+        sub_c.isComplex = false;
+        sub_c.isInverse = false;
+        sub_c.noPermute = false;
+        sub_c.scale = 1.;
+        sub_c.n = n2;
 
-        DFT( (Complex<T>*)src, (Complex<T>*)dst, n2, nf - (factors[0] == 1),
-             factors + (factors[0] == 1),
-             itab, wave, tab_size, 0, buf, 0, 1 );
-        factors[0] <<= 1;
+        DFT(sub_c, (Complex<T>*)src, (Complex<T>*)dst);
+
+        c.factors[0] <<= 1;
 
         t = dst[0] - dst[1];
         dst[0] = (dst[0] + dst[1])*scale;
@@ -1060,6 +1113,8 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         t = dst[n-1];
         dst[n-1] = dst[1];
 
+        const Complex<T> *wave = (const Complex<T>*)c.wave;
+
         for( j = 2, wave++; j < n2; j += 2, wave++ )
         {
             /* calc odd */
@@ -1103,22 +1158,16 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
       re[0], re[1], im[1], ... , re[n/2-1], im[n/2-1], re[n/2] OR
       re(0), 0, re(1), im(1), ..., re(n/2-1), im((n+1)/2-1) [, re((n+1)/2), 0] */
 template<typename T> static void
-CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
-         const Complex<T>* wave, int tab_size,
-         const void*
-#ifdef USE_IPP_DFT
-         spec
-#endif
-         , Complex<T>* buf,
-         int flags, double _scale )
+CCSIDFT(const OcvDftOptions & c, const T* src, T* dst)
 {
-    int complex_input = (flags & DFT_COMPLEX_INPUT_OR_OUTPUT) != 0;
-    int j, k, n2 = (n+1) >> 1;
-    T scale = (T)_scale;
+    int n = c.n;
+    int complex_input = c.isComplex;
+    int j, k;
+    T scale = (T)c.scale;
     T save_s1 = 0.;
     T t0, t1, t2, t3, t;
 
-    assert( tab_size == n );
+    assert( c.tab_size == n );
 
     if( complex_input )
     {
@@ -1127,10 +1176,10 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         ((T*)src)[1] = src[0];
         src++;
     }
-#ifdef USE_IPP_DFT
-    if( spec )
+    if( c.useIpp )
     {
-        if (ippsDFTInv_PackToR( src, dst, spec, (uchar*)buf ) >=0)
+#ifdef USE_IPP_DFT
+        if (ippsDFTInv_PackToR( src, dst, c.ipp_spec, c.ipp_work ) >=0)
         {
             if( complex_input )
                 ((T*)src)[0] = (T)save_s1;
@@ -1139,8 +1188,8 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         }
 
         setIppErrorStatus();
-    }
 #endif
+    }
     if( n == 1 )
     {
         dst[0] = (T)(src[0]*scale);
@@ -1158,16 +1207,25 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
 
         _dst[0].re = src[0];
         _dst[0].im = 0;
+
+        int n2 = (n+1) >> 1;
+
         for( j = 1; j < n2; j++ )
         {
-            int k0 = itab[j], k1 = itab[n-j];
+            int k0 = c.itab[j], k1 = c.itab[n-j];
             t0 = _src[j].re; t1 = _src[j].im;
             _dst[k0].re = t0; _dst[k0].im = -t1;
             _dst[k1].re = t0; _dst[k1].im = t1;
         }
 
-        DFT( _dst, _dst, n, nf, factors, itab, wave,
-             tab_size, 0, buf, DFT_NO_PERMUTE, 1. );
+        OcvDftOptions sub_c = c;
+        sub_c.isComplex = false;
+        sub_c.isInverse = false;
+        sub_c.noPermute = true;
+        sub_c.scale = 1.;
+        sub_c.n = n;
+
+        DFT(sub_c, _dst, _dst);
         dst[0] *= scale;
         for( j = 1; j < n; j += 2 )
         {
@@ -1180,7 +1238,7 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
     else
     {
         int inplace = src == dst;
-        const Complex<T>* w = wave;
+        const Complex<T>* w = (const Complex<T>*)c.wave;
 
         t = src[1];
         t0 = (src[0] + src[n-1]);
@@ -1188,6 +1246,8 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         dst[0] = t0;
         dst[1] = t1;
 
+        int n2 = (n+1) >> 1;
+
         for( j = 2, w++; j < n2; j += 2, w++ )
         {
             T h1_re, h1_im, h2_re, h2_im;
@@ -1218,10 +1278,10 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
             else
             {
                 int j2 = j >> 1;
-                k = itab[j2];
+                k = c.itab[j2];
                 dst[k] = t0;
                 dst[k+1] = t1;
-                k = itab[n2-j2];
+                k = c.itab[n2-j2];
                 dst[k] = t2;
                 dst[k+1]= t3;
             }
@@ -1239,19 +1299,26 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
             }
             else
             {
-                k = itab[n2];
+                k = c.itab[n2];
                 dst[k*2] = t0;
                 dst[k*2+1] = t1;
             }
         }
 
-        factors[0] >>= 1;
-        DFT( (Complex<T>*)dst, (Complex<T>*)dst, n2,
-             nf - (factors[0] == 1),
-             factors + (factors[0] == 1), itab,
-             wave, tab_size, 0, buf,
-             inplace ? 0 : DFT_NO_PERMUTE, 1. );
-        factors[0] <<= 1;
+        c.factors[0] >>= 1;
+
+        OcvDftOptions sub_c = c;
+        sub_c.factors += (c.factors[0] == 1);
+        sub_c.nf -= (c.factors[0] == 1);
+        sub_c.isComplex = false;
+        sub_c.isInverse = false;
+        sub_c.noPermute = !inplace;
+        sub_c.scale = 1.;
+        sub_c.n = n2;
+
+        DFT(sub_c, (Complex<T>*)dst, (Complex<T>*)dst);
+
+        c.factors[0] <<= 1;
 
         for( j = 0; j < n; j += 2 )
         {
@@ -1436,57 +1503,35 @@ ExpandCCS( uchar* _ptr, int n, int elem_size )
     }
 }
 
-
-typedef void (*DFTFunc)(
-     const void* src, void* dst, int n, int nf, int* factors,
-     const int* itab, const void* wave, int tab_size,
-     const void* spec, void* buf, int inv, double scale );
-
-static void DFT_32f( const Complexf* src, Complexf* dst, int n,
-    int nf, const int* factors, const int* itab,
-    const Complexf* wave, int tab_size,
-    const void* spec, Complexf* buf,
-    int flags, double scale )
+static void DFT_32f(const OcvDftOptions & c, const Complexf* src, Complexf* dst)
 {
-    DFT(src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    DFT(c, src, dst);
 }
 
-static void DFT_64f( const Complexd* src, Complexd* dst, int n,
-    int nf, const int* factors, const int* itab,
-    const Complexd* wave, int tab_size,
-    const void* spec, Complexd* buf,
-    int flags, double scale )
+static void DFT_64f(const OcvDftOptions & c, const Complexd* src, Complexd* dst)
 {
-    DFT(src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    DFT(c, src, dst);
 }
 
 
-static void RealDFT_32f( const float* src, float* dst, int n, int nf, int* factors,
-        const int* itab,  const Complexf* wave, int tab_size, const void* spec,
-        Complexf* buf, int flags, double scale )
+static void RealDFT_32f(const OcvDftOptions & c, const float* src, float* dst)
 {
-    RealDFT( src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    RealDFT(c, src, dst);
 }
 
-static void RealDFT_64f( const double* src, double* dst, int n, int nf, int* factors,
-        const int* itab,  const Complexd* wave, int tab_size, const void* spec,
-        Complexd* buf, int flags, double scale )
+static void RealDFT_64f(const OcvDftOptions & c, const double* src, double* dst)
 {
-    RealDFT( src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    RealDFT(c, src, dst);
 }
 
-static void CCSIDFT_32f( const float* src, float* dst, int n, int nf, int* factors,
-                         const int* itab,  const Complexf* wave, int tab_size, const void* spec,
-                         Complexf* buf, int flags, double scale )
+static void CCSIDFT_32f(const OcvDftOptions & c, const float* src, float* dst)
 {
-    CCSIDFT( src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    CCSIDFT(c, src, dst);
 }
 
-static void CCSIDFT_64f( const double* src, double* dst, int n, int nf, int* factors,
-                         const int* itab,  const Complexd* wave, int tab_size, const void* spec,
-                         Complexd* buf, int flags, double scale )
+static void CCSIDFT_64f(const OcvDftOptions & c, const double* src, double* dst)
 {
-    CCSIDFT( src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    CCSIDFT(c, src, dst);
 }
 
 }
@@ -1508,8 +1553,11 @@ class Dft_C_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_C_IPPLoop_Invoker(const Mat& _src, Mat& _dst, const Dft& _ippidft, int _norm_flag, bool *_ok) :
-        ParallelLoopBody(), src(_src), dst(_dst), ippidft(_ippidft), norm_flag(_norm_flag), ok(_ok)
+    Dft_C_IPPLoop_Invoker(uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+                          const Dft& _ippidft, int _norm_flag, bool *_ok) :
+        ParallelLoopBody(),
+        src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
+        ippidft(_ippidft), norm_flag(_norm_flag), ok(_ok)
     {
         *ok = true;
     }
@@ -1523,7 +1571,7 @@ public:
         int sizeSpec=0;
         int sizeInit=0;
 
-        IppiSize srcRoiSize = {src.cols, 1};
+        IppiSize srcRoiSize = {width, 1};
 
         status = ippiDFTGetSize_C_32fc(srcRoiSize, norm_flag, ippAlgHintNone, &sizeSpec, &sizeInit, &sizeBuffer );
         if ( status < 0 )
@@ -1555,7 +1603,8 @@ public:
         }
 
         for( int i = range.start; i < range.end; ++i)
-            if(!ippidft(src.ptr<Ipp32fc>(i), (int)src.step,dst.ptr<Ipp32fc>(i), (int)dst.step, pDFTSpec, (Ipp8u*)pBuffer))
+            if(!ippidft((Ipp32fc*)(src + src_step * i), src_step, (Ipp32fc*)(dst + dst_step * i), dst_step,
+                        pDFTSpec, (Ipp8u*)pBuffer))
             {
                 *ok = false;
             }
@@ -1568,8 +1617,11 @@ public:
     }
 
 private:
-    const Mat& src;
-    Mat& dst;
+    uchar * src;
+    int src_step;
+    uchar * dst;
+    int dst_step;
+    int width;
     const Dft& ippidft;
     int norm_flag;
     bool *ok;
@@ -1582,8 +1634,11 @@ class Dft_R_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_R_IPPLoop_Invoker(const Mat& _src, Mat& _dst, const Dft& _ippidft, int _norm_flag, bool *_ok) :
-        ParallelLoopBody(), src(_src), dst(_dst), ippidft(_ippidft), norm_flag(_norm_flag), ok(_ok)
+    Dft_R_IPPLoop_Invoker(uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+                          const Dft& _ippidft, int _norm_flag, bool *_ok) :
+        ParallelLoopBody(),
+        src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
+        ippidft(_ippidft), norm_flag(_norm_flag), ok(_ok)
     {
         *ok = true;
     }
@@ -1597,7 +1652,7 @@ public:
         int sizeSpec=0;
         int sizeInit=0;
 
-        IppiSize srcRoiSize = {src.cols, 1};
+        IppiSize srcRoiSize = {width, 1};
 
         status = ippiDFTGetSize_R_32f(srcRoiSize, norm_flag, ippAlgHintNone, &sizeSpec, &sizeInit, &sizeBuffer );
         if ( status < 0 )
@@ -1629,7 +1684,8 @@ public:
         }
 
         for( int i = range.start; i < range.end; ++i)
-            if(!ippidft(src.ptr<float>(i), (int)src.step,dst.ptr<float>(i), (int)dst.step, pDFTSpec, (Ipp8u*)pBuffer))
+            if(!ippidft((float*)(src + src_step * i), src_step, (float*)(dst + dst_step * i), dst_step,
+                        pDFTSpec, (Ipp8u*)pBuffer))
             {
                 *ok = false;
             }
@@ -1642,8 +1698,11 @@ public:
     }
 
 private:
-    const Mat& src;
-    Mat& dst;
+    uchar * src;
+    int src_step;
+    uchar * dst;
+    int dst_step;
+    int width;
     const Dft& ippidft;
     int norm_flag;
     bool *ok;
@@ -1652,18 +1711,18 @@ private:
 };
 
 template <typename Dft>
-bool Dft_C_IPPLoop(const Mat& src, Mat& dst, const Dft& ippidft, int norm_flag)
+bool Dft_C_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
 {
     bool ok;
-    parallel_for_(Range(0, src.rows), Dft_C_IPPLoop_Invoker<Dft>(src, dst, ippidft, norm_flag, &ok), src.total()/(double)(1<<16) );
+    parallel_for_(Range(0, height), Dft_C_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), (width * height)/(double)(1<<16) );
     return ok;
 }
 
 template <typename Dft>
-bool Dft_R_IPPLoop(const Mat& src, Mat& dst, const Dft& ippidft, int norm_flag)
+bool Dft_R_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
 {
     bool ok;
-    parallel_for_(Range(0, src.rows), Dft_R_IPPLoop_Invoker<Dft>(src, dst, ippidft, norm_flag, &ok), src.total()/(double)(1<<16) );
+    parallel_for_(Range(0, height), Dft_R_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), (width * height)/(double)(1<<16) );
     return ok;
 }
 
@@ -1691,7 +1750,7 @@ private:
     ippiDFT_R_Func func;
 };
 
-static bool ippi_DFT_C_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
+static bool ippi_DFT_C_32F(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -1700,7 +1759,7 @@ static bool ippi_DFT_C_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     int sizeSpec=0;
     int sizeInit=0;
 
-    IppiSize srcRoiSize = {src.cols, src.rows};
+    IppiSize srcRoiSize = {width, height};
 
     status = ippiDFTGetSize_C_32fc(srcRoiSize, norm_flag, ippAlgHintNone, &sizeSpec, &sizeInit, &sizeBuffer );
     if ( status < 0 )
@@ -1728,9 +1787,9 @@ static bool ippi_DFT_C_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     }
 
     if (!inv)
-        status = ippiDFTFwd_CToC_32fc_C1R( src.ptr<Ipp32fc>(), (int)src.step, dst.ptr<Ipp32fc>(), (int)dst.step, pDFTSpec, pBuffer );
+        status = ippiDFTFwd_CToC_32fc_C1R( (Ipp32fc*)src, src_step, (Ipp32fc*)dst, dst_step, pDFTSpec, pBuffer );
     else
-        status = ippiDFTInv_CToC_32fc_C1R( src.ptr<Ipp32fc>(), (int)src.step, dst.ptr<Ipp32fc>(), (int)dst.step, pDFTSpec, pBuffer );
+        status = ippiDFTInv_CToC_32fc_C1R( (Ipp32fc*)src, src_step, (Ipp32fc*)dst, dst_step, pDFTSpec, pBuffer );
 
     if ( sizeBuffer > 0 )
         ippFree( pBuffer );
@@ -1745,7 +1804,7 @@ static bool ippi_DFT_C_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     return false;
 }
 
-static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
+static bool ippi_DFT_R_32F(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -1754,7 +1813,7 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     int sizeSpec=0;
     int sizeInit=0;
 
-    IppiSize srcRoiSize = {src.cols, src.rows};
+    IppiSize srcRoiSize = {width, height};
 
     status = ippiDFTGetSize_R_32f(srcRoiSize, norm_flag, ippAlgHintNone, &sizeSpec, &sizeInit, &sizeBuffer );
     if ( status < 0 )
@@ -1782,9 +1841,9 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     }
 
     if (!inv)
-        status = ippiDFTFwd_RToPack_32f_C1R( src.ptr<float>(), (int)(src.step), dst.ptr<float>(), (int)dst.step, pDFTSpec, pBuffer );
+        status = ippiDFTFwd_RToPack_32f_C1R( (float*)src, src_step, (float*)dst, dst_step, pDFTSpec, pBuffer );
     else
-        status = ippiDFTInv_PackToR_32f_C1R( src.ptr<float>(), (int)src.step, dst.ptr<float>(), (int)dst.step, pDFTSpec, pBuffer );
+        status = ippiDFTInv_PackToR_32f_C1R( (float*)src, src_step, (float*)dst, dst_step, pDFTSpec, pBuffer );
 
     if ( sizeBuffer > 0 )
         ippFree( pBuffer );
@@ -2426,111 +2485,324 @@ static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
 
 namespace cv
 {
-static void complementComplexOutput(Mat& dst, int len, int dft_dims)
+
+template <typename T>
+static void complementComplex(T * ptr, int step, int n, int len, int dft_dims)
 {
-    int i, n = dst.cols;
-    size_t elem_size = dst.elemSize1();
-    if( elem_size == sizeof(float) )
+    T* p0 = (T*)ptr;
+    size_t dstep = step/sizeof(p0[0]);
+    for(int i = 0; i < len; i++ )
     {
-        float* p0 = dst.ptr<float>();
-        size_t dstep = dst.step/sizeof(p0[0]);
-        for( i = 0; i < len; i++ )
-        {
-            float* p = p0 + dstep*i;
-            float* q = dft_dims == 1 || i == 0 || i*2 == len ? p : p0 + dstep*(len-i);
+        T* p = p0 + dstep*i;
+        T* q = dft_dims == 1 || i == 0 || i*2 == len ? p : p0 + dstep*(len-i);
 
-            for( int j = 1; j < (n+1)/2; j++ )
-            {
-                p[(n-j)*2] = q[j*2];
-                p[(n-j)*2+1] = -q[j*2+1];
-            }
+        for( int j = 1; j < (n+1)/2; j++ )
+        {
+            p[(n-j)*2] = q[j*2];
+            p[(n-j)*2+1] = -q[j*2+1];
         }
     }
+}
+
+static void complementComplexOutput(int depth, uchar * ptr, int step, int count, int len, int dft_dims)
+{
+    if( depth == CV_32F )
+        complementComplex((float*)ptr, step, count, len, dft_dims);
     else
+        complementComplex((double*)ptr, step, count, len, dft_dims);
+}
+
+enum DftMode {
+    InvalidDft = 0,
+    FwdRealToCCS,
+    FwdRealToComplex,
+    FwdComplexToComplex,
+    InvCCSToReal,
+    InvComplexToReal,
+    InvComplexToComplex,
+};
+
+enum DftDims {
+    InvalidDim = 0,
+    OneDim,
+    OneDimColWise,
+    TwoDims
+};
+
+inline const char * modeName(DftMode m)
+{
+    switch (m)
     {
-        double* p0 = dst.ptr<double>();
-        size_t dstep = dst.step/sizeof(p0[0]);
-        for( i = 0; i < len; i++ )
-        {
-            double* p = p0 + dstep*i;
-            double* q = dft_dims == 1 || i == 0 || i*2 == len ? p : p0 + dstep*(len-i);
+    case InvalidDft: return "InvalidDft";
+    case FwdRealToCCS: return "FwdRealToCCS";
+    case FwdRealToComplex: return "FwdRealToComplex";
+    case FwdComplexToComplex: return "FwdComplexToComplex";
+    case InvCCSToReal: return "InvCCSToReal";
+    case InvComplexToReal: return "InvComplexToReal";
+    case InvComplexToComplex: return "InvComplexToComplex";
+    }
+    return 0;
+}
 
-            for( int j = 1; j < (n+1)/2; j++ )
-            {
-                p[(n-j)*2] = q[j*2];
-                p[(n-j)*2+1] = -q[j*2+1];
-            }
-        }
+inline const char * dimsName(DftDims d)
+{
+    switch (d)
+    {
+    case InvalidDim: return "InvalidDim";
+    case OneDim: return "OneDim";
+    case OneDimColWise: return "OneDimColWise";
+    case TwoDims: return "TwoDims";
+    };
+    return 0;
+}
+
+template <typename T>
+inline bool isInv(T mode)
+{
+    switch ((DftMode)mode)
+    {
+        case InvCCSToReal:
+        case InvComplexToReal:
+        case InvComplexToComplex: return true;
+        default: return false;
     }
 }
+
+inline DftMode determineMode(bool inv, int cn1, int cn2)
+{
+    if (!inv)
+    {
+        if (cn1 == 1 && cn2 == 1)
+            return FwdRealToCCS;
+        else if (cn1 == 1 && cn2 == 2)
+            return FwdRealToComplex;
+        else if (cn1 == 2 && cn2 == 2)
+            return FwdComplexToComplex;
+    }
+    else
+    {
+        if (cn1 == 1 && cn2 == 1)
+            return InvCCSToReal;
+        else if (cn1 == 2 && cn2 == 1)
+            return InvComplexToReal;
+        else if (cn1 == 2 && cn2 == 2)
+            return InvComplexToComplex;
+    }
+    return InvalidDft;
 }
 
-void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
+
+inline DftDims determineDims(int rows, int cols, bool isRowWise, bool isContinuous)
 {
-#ifdef HAVE_CLAMDFFT
-    CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
-            _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
-               ocl_dft_amdfft(_src0, _dst, flags))
-#endif
+    // printf("%d x %d (%d, %d)\n", rows, cols, isRowWise, isContinuous);
+    if (isRowWise)
+        return OneDim;
+    if (cols == 1 && rows > 1) // one-column-shaped input
+    {
+        if (isContinuous)
+            return OneDim;
+        else
+            return OneDimColWise;
+    }
+    if (rows == 1)
+        return OneDim;
+    if (cols > 1 && rows > 1)
+        return TwoDims;
+    return InvalidDim;
+}
 
-#ifdef HAVE_OPENCL
-    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
-               ocl_dft(_src0, _dst, flags, nonzero_rows))
-#endif
+class OcvDftImpl
+{
+protected:
+    hal::DftContext contextA;
+    hal::DftContext contextB;
+    bool needBufferA;
+    bool needBufferB;
+    bool inv;
+    int width;
+    int height;
+    DftMode mode;
+    int elem_size;
+    int complex_elem_size;
+    int depth;
+    bool real_transform;
+    int nonzero_rows;
+    bool isRowTransform;
+    bool isScaled;
+    std::vector<int> stages;
+    bool useIpp;
+    int src_channels;
+    int dst_channels;
+
+    AutoBuffer<uchar> tmp_bufA;
+    AutoBuffer<uchar> tmp_bufB;
+    AutoBuffer<uchar> buf0;
+    AutoBuffer<uchar> buf1;
 
-    static DFTFunc dft_tbl[6] =
+public:
+    OcvDftImpl()
     {
-        (DFTFunc)DFT_32f,
-        (DFTFunc)RealDFT_32f,
-        (DFTFunc)CCSIDFT_32f,
-        (DFTFunc)DFT_64f,
-        (DFTFunc)RealDFT_64f,
-        (DFTFunc)CCSIDFT_64f
-    };
-    AutoBuffer<uchar> buf;
-    Mat src0 = _src0.getMat(), src = src0;
-    int prev_len = 0, stage = 0;
-    bool inv = (flags & DFT_INVERSE) != 0;
-    int nf = 0, real_transform = src.channels() == 1 || (inv && (flags & DFT_REAL_OUTPUT)!=0);
-    int type = src.type(), depth = src.depth();
-    int elem_size = (int)src.elemSize1(), complex_elem_size = elem_size*2;
-    int factors[34];
-    bool inplace_transform = false;
-#ifdef USE_IPP_DFT
-    AutoBuffer<uchar> ippbuf;
-    int ipp_norm_flag = !(flags & DFT_SCALE) ? 8 : inv ? 2 : 1;
+        needBufferA = false;
+        needBufferB = false;
+        inv = false;
+        width = 0;
+        height = 0;
+        mode = InvalidDft;
+        elem_size = 0;
+        complex_elem_size = 0;
+        depth = 0;
+        real_transform = false;
+        nonzero_rows = 0;
+        isRowTransform = false;
+        isScaled = false;
+        useIpp = false;
+        src_channels = 0;
+        dst_channels = 0;
+    }
+
+    void init(int _width, int _height, int _depth, int _src_channels, int _dst_channels, int flags, int _nonzero_rows)
+    {
+        bool isComplex = _src_channels != _dst_channels;
+        nonzero_rows = _nonzero_rows;
+        width = _width;
+        height = _height;
+        depth = _depth;
+        src_channels = _src_channels;
+        dst_channels = _dst_channels;
+        bool isInverse = (flags & CV_HAL_DFT_INVERSE) != 0;
+        bool isInplace = (flags & CV_HAL_DFT_IS_INPLACE) != 0;
+        bool isContinuous = (flags & CV_HAL_DFT_IS_CONTINUOUS) != 0;
+        mode = determineMode(isInverse, _src_channels, _dst_channels);
+        inv = isInverse;
+        isRowTransform = (flags & CV_HAL_DFT_ROWS) != 0;
+        isScaled = (flags & CV_HAL_DFT_SCALE) != 0;
+        needBufferA = false;
+        needBufferB = false;
+        real_transform = (mode != FwdComplexToComplex && mode != InvComplexToComplex);
+
+        elem_size = (depth == CV_32F) ? sizeof(float) : sizeof(double);
+        complex_elem_size = elem_size * 2;
+        if( !real_transform )
+            elem_size = complex_elem_size;
+
+#if defined USE_IPP_DFT
+        CV_IPP_CHECK()
+        {
+            if (nonzero_rows == 0 && depth == CV_32F && ((width * height)>(int)(1<<6)))
+            {
+                if (mode == FwdComplexToComplex || mode == InvComplexToComplex || mode == FwdRealToCCS || mode == InvCCSToReal)
+                {
+                    useIpp = true;
+                    return;
+                }
+            }
+        }
 #endif
 
-    CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );
+        DftDims dims = determineDims(height, width, isRowTransform, isContinuous);
+        if (dims == TwoDims)
+        {
+            stages.resize(2);
+            if (mode == InvCCSToReal || mode == InvComplexToReal)
+            {
+                stages[0] = 1;
+                stages[1] = 0;
+            }
+            else
+            {
+                stages[0] = 0;
+                stages[1] = 1;
+            }
+        }
+        else
+        {
+            stages.resize(1);
+            if (dims == OneDimColWise)
+                stages[0] = 1;
+            else
+                stages[0] = 0;
+        }
 
-    if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
-        _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
-    else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
-        _dst.create( src.size(), depth );
-    else
-        _dst.create( src.size(), type );
+        for(uint stageIndex = 0; stageIndex < stages.size(); ++stageIndex)
+        {
+            if (stageIndex == 1)
+            {
+                isInplace = true;
+                isComplex = false;
+            }
 
-    Mat dst = _dst.getMat();
+            int stage = stages[stageIndex];
+            bool isLastStage = (stageIndex + 1 == stages.size());
+
+            int len, count;
+
+            int f = 0;
+            if (inv)
+                f |= CV_HAL_DFT_INVERSE;
+            if (isScaled)
+                f |= CV_HAL_DFT_SCALE;
+            if (isRowTransform)
+                f |= CV_HAL_DFT_ROWS;
+            if (isComplex)
+                f |= CV_HAL_DFT_COMPLEX_OUTPUT;
+            if (real_transform)
+                f |= CV_HAL_DFT_REAL_OUTPUT;
+            if (!isLastStage)
+                f |= CV_HAL_DFT_TWO_STAGE;
+
+            if( stage == 0 ) // row-wise transform
+            {
+                if (width == 1 && !isRowTransform )
+                {
+                    len = height;
+                    count = width;
+                }
+                else
+                {
+                    len = width;
+                    count = height;
+                }
+                needBufferA = isInplace;
+                hal::dftInit(contextA, len, count, depth, f, &needBufferA);
+                if (needBufferA)
+                    tmp_bufA.allocate(len * complex_elem_size);
+            }
+            else
+            {
+                len = height;
+                count = width;
+                f |= CV_HAL_DFT_STAGE_COLS;
+                needBufferB = isInplace;
+                hal::dftInit(contextB, len, count, depth, f, &needBufferB);
+                if (needBufferB)
+                    tmp_bufB.allocate(len * complex_elem_size);
+
+                buf0.allocate(len * complex_elem_size);
+                buf1.allocate(len * complex_elem_size);
+            }
+        }
+    }
 
-#if defined USE_IPP_DFT
-    CV_IPP_CHECK()
+    void run(uchar * src, int src_step, uchar * dst, int dst_step)
     {
-        if ((src.depth() == CV_32F) && (src.total()>(int)(1<<6)) && nonzero_rows == 0)
+#if defined USE_IPP_DFT
+        if (useIpp)
         {
-            if ((flags & DFT_ROWS) == 0)
+            int ipp_norm_flag = !isScaled ? 8 : inv ? 2 : 1;
+            if (!isRowTransform)
             {
-                if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
+                if (mode == FwdComplexToComplex || mode == InvComplexToComplex)
                 {
-                    if (ippi_DFT_C_32F(src, dst, inv, ipp_norm_flag))
+                    if (ippi_DFT_C_32F(src, src_step, dst, dst_step, width, height, inv, ipp_norm_flag))
                     {
                         CV_IMPL_ADD(CV_IMPL_IPP);
                         return;
                     }
                     setIppErrorStatus();
                 }
-                if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
+                else if (mode == FwdRealToCCS || mode == InvCCSToReal)
                 {
-                    if (ippi_DFT_R_32F(src, dst, inv, ipp_norm_flag))
+                    if (ippi_DFT_R_32F(src, src_step, dst, dst_step, width, height, inv, ipp_norm_flag))
                     {
                         CV_IMPL_ADD(CV_IMPL_IPP);
                         return;
@@ -2540,20 +2812,20 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
             }
             else
             {
-                if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
+                if (mode == FwdComplexToComplex || mode == InvComplexToComplex)
                 {
                     ippiDFT_C_Func ippiFunc = inv ? (ippiDFT_C_Func)ippiDFTInv_CToC_32fc_C1R : (ippiDFT_C_Func)ippiDFTFwd_CToC_32fc_C1R;
-                    if (Dft_C_IPPLoop(src, dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
+                    if (Dft_C_IPPLoop(src, src_step, dst, dst_step, width, height, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
                     {
                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
                         return;
                     }
                     setIppErrorStatus();
                 }
-                if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
+                else if (mode == FwdRealToCCS || mode == InvCCSToReal)
                 {
                     ippiDFT_R_Func ippiFunc = inv ? (ippiDFT_R_Func)ippiDFTInv_PackToR_32f_C1R : (ippiDFT_R_Func)ippiDFTFwd_RToPack_32f_C1R;
-                    if (Dft_R_IPPLoop(src, dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
+                    if (Dft_R_IPPLoop(src, src_step, dst, dst_step, width, height, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
                     {
                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
                         return;
@@ -2561,57 +2833,269 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
                     setIppErrorStatus();
                 }
             }
+            return;
         }
-    }
 #endif
 
-    if( !real_transform )
-        elem_size = complex_elem_size;
+        for(uint stageIndex = 0; stageIndex < stages.size(); ++stageIndex)
+        {
+            int stage_src_channels = src_channels;
+            int stage_dst_channels = dst_channels;
 
-    if( src.cols == 1 && nonzero_rows > 0 )
-        CV_Error( CV_StsNotImplemented,
-        "This mode (using nonzero_rows with a single-column matrix) breaks the function's logic, so it is prohibited.\n"
-        "For fast convolution/correlation use 2-column matrix or single-row matrix instead" );
+            if (stageIndex == 1)
+            {
+                src = dst;
+                src_step = dst_step;
+                stage_src_channels = stage_dst_channels;
+            }
+
+            int stage = stages[stageIndex];
+            bool isLastStage = (stageIndex + 1 == stages.size());
+            bool isComplex = stage_src_channels != stage_dst_channels;
+
+            if( stage == 0 )
+                rowDft(src, src_step, dst, dst_step, isComplex, isLastStage);
+            else
+                colDft(src, src_step, dst, dst_step, stage_src_channels, stage_dst_channels, isLastStage);
+        }
+    }
+
+    void free()
+    {
+        if (useIpp)
+            return;
+        hal::dftFree(contextA);
+        hal::dftFree(contextB);
+    }
 
-    // determine, which transform to do first - row-wise
-    // (stage 0) or column-wise (stage 1) transform
-    if( !(flags & DFT_ROWS) && src.rows > 1 &&
-        ((src.cols == 1 && (!src.isContinuous() || !dst.isContinuous())) ||
-         (src.cols > 1 && inv && real_transform)) )
-        stage = 1;
+protected:
 
-    for(;;)
+    void rowDft(uchar* src_data, int src_step, uchar* dst_data, int dst_step, bool isComplex, bool isLastStage)
     {
-        double scale = 1;
-        uchar* wave = 0;
-        int* itab = 0;
-        uchar* ptr;
-        int i, len, count, sz = 0;
-        int use_buf = 0, odd_real = 0;
-        DFTFunc dft_func;
+        int len, count;
+        if (width == 1 && !isRowTransform )
+        {
+            len = height;
+            count = width;
+        }
+        else
+        {
+            len = width;
+            count = height;
+        }
+        int dptr_offset = 0;
+        int dst_full_len = len*elem_size;
+
+        if( needBufferA )
+        {
+            if (mode == FwdRealToCCS && (len & 1) && len > 1)
+                dptr_offset = elem_size;
+        }
+
+        if( !inv && isComplex )
+            dst_full_len += (len & 1) ? elem_size : complex_elem_size;
+
+        int nz = nonzero_rows;
+        if( nz <= 0 || nz > count )
+            nz = count;
 
-        if( stage == 0 ) // row-wise transform
+        int i;
+        for( i = 0; i < nz; i++ )
         {
-            len = !inv ? src.cols : dst.cols;
-            count = src.rows;
-            if( len == 1 && !(flags & DFT_ROWS) )
+            const uchar* sptr = src_data + src_step * i;
+            uchar* dptr0 = dst_data + dst_step * i;
+            uchar* dptr = dptr0;
+
+            if( needBufferA )
+                dptr = tmp_bufA;
+
+            hal::dftRun(contextA, sptr, dptr);
+
+            if( needBufferA )
+                memcpy( dptr0, dptr + dptr_offset, dst_full_len );
+        }
+
+        for( ; i < count; i++ )
+        {
+            uchar* dptr0 = dst_data + dst_step * i;
+            memset( dptr0, 0, dst_full_len );
+        }
+        if(isLastStage &&  mode == FwdRealToComplex)
+            complementComplexOutput(depth, dst_data, dst_step, len, nz, 1);
+    }
+
+    void colDft(uchar* src_data, int src_step, uchar* dst_data, int dst_step, int stage_src_channels, int stage_dst_channels, bool isLastStage)
+    {
+        int len = height;
+        int count = width;
+        int a = 0, b = count;
+        uchar *dbuf0, *dbuf1;
+        const uchar* sptr0 = src_data;
+        uchar* dptr0 = dst_data;
+
+        dbuf0 = buf0, dbuf1 = buf1;
+
+        if( needBufferB )
+        {
+            dbuf1 = tmp_bufB;
+            dbuf0 = buf1;
+        }
+
+        if( real_transform )
+        {
+            int even;
+            a = 1;
+            even = (count & 1) == 0;
+            b = (count+1)/2;
+            if( !inv )
+            {
+                memset( buf0, 0, len*complex_elem_size );
+                CopyColumn( sptr0, src_step, buf0, complex_elem_size, len, elem_size );
+                sptr0 += stage_dst_channels*elem_size;
+                if( even )
+                {
+                    memset( buf1, 0, len*complex_elem_size );
+                    CopyColumn( sptr0 + (count-2)*elem_size, src_step,
+                                buf1, complex_elem_size, len, elem_size );
+                }
+            }
+            else if( stage_src_channels == 1 )
+            {
+                CopyColumn( sptr0, src_step, buf0, elem_size, len, elem_size );
+                ExpandCCS( buf0, len, elem_size );
+                if( even )
+                {
+                    CopyColumn( sptr0 + (count-1)*elem_size, src_step,
+                                buf1, elem_size, len, elem_size );
+                    ExpandCCS( buf1, len, elem_size );
+                }
+                sptr0 += elem_size;
+            }
+            else
             {
-                len = !inv ? src.rows : dst.rows;
-                count = 1;
+                CopyColumn( sptr0, src_step, buf0, complex_elem_size, len, complex_elem_size );
+                if( even )
+                {
+                    CopyColumn( sptr0 + b*complex_elem_size, src_step,
+                                   buf1, complex_elem_size, len, complex_elem_size );
+                }
+                sptr0 += complex_elem_size;
+            }
+
+            if( even )
+                hal::dftRun(contextB, buf1, dbuf1);
+            hal::dftRun(contextB, buf0, dbuf0);
+
+            if( stage_dst_channels == 1 )
+            {
+                if( !inv )
+                {
+                    // copy the half of output vector to the first/last column.
+                    // before doing that, defgragment the vector
+                    memcpy( dbuf0 + elem_size, dbuf0, elem_size );
+                    CopyColumn( dbuf0 + elem_size, elem_size, dptr0,
+                                   dst_step, len, elem_size );
+                    if( even )
+                    {
+                        memcpy( dbuf1 + elem_size, dbuf1, elem_size );
+                        CopyColumn( dbuf1 + elem_size, elem_size,
+                                       dptr0 + (count-1)*elem_size,
+                                       dst_step, len, elem_size );
+                    }
+                    dptr0 += elem_size;
+                }
+                else
+                {
+                    // copy the real part of the complex vector to the first/last column
+                    CopyColumn( dbuf0, complex_elem_size, dptr0, dst_step, len, elem_size );
+                    if( even )
+                        CopyColumn( dbuf1, complex_elem_size, dptr0 + (count-1)*elem_size,
+                                       dst_step, len, elem_size );
+                    dptr0 += elem_size;
+                }
+            }
+            else
+            {
+                assert( !inv );
+                CopyColumn( dbuf0, complex_elem_size, dptr0,
+                               dst_step, len, complex_elem_size );
+                if( even )
+                    CopyColumn( dbuf1, complex_elem_size,
+                                   dptr0 + b*complex_elem_size,
+                                   dst_step, len, complex_elem_size );
+                dptr0 += complex_elem_size;
             }
-            odd_real = real_transform && (len & 1);
         }
-        else
+
+        for(int i = a; i < b; i += 2 )
         {
-            len = dst.rows;
-            count = !inv ? src0.cols : dst.cols;
-            sz = 2*len*complex_elem_size;
+            if( i+1 < b )
+            {
+                CopyFrom2Columns( sptr0, src_step, buf0, buf1, len, complex_elem_size );
+                hal::dftRun(contextB, buf1, dbuf1);
+            }
+            else
+                CopyColumn( sptr0, src_step, buf0, complex_elem_size, len, complex_elem_size );
+
+            hal::dftRun(contextB, buf0, dbuf0);
+
+            if( i+1 < b )
+                CopyTo2Columns( dbuf0, dbuf1, dptr0, dst_step, len, complex_elem_size );
+            else
+                CopyColumn( dbuf0, complex_elem_size, dptr0, dst_step, len, complex_elem_size );
+            sptr0 += 2*complex_elem_size;
+            dptr0 += 2*complex_elem_size;
         }
+        if(isLastStage && mode == FwdRealToComplex)
+            complementComplexOutput(depth, dst_data, dst_step, count, len, 2);
+    }
+};
 
-        void *spec = 0;
+class OcvDftBasicImpl
+{
+public:
+    OcvDftOptions opt;
+    int _factors[34];
+    AutoBuffer<uchar> wave_buf;
+    AutoBuffer<int> itab_buf;
 #ifdef USE_IPP_DFT
-        if( CV_IPP_CHECK_COND && (len*count >= 64) ) // use IPP DFT if available
+    AutoBuffer<uchar> ippbuf;
+    AutoBuffer<uchar> ippworkbuf;
+#endif
+
+public:
+    OcvDftBasicImpl()
+    {
+        opt.factors = _factors;
+    }
+    OcvDftBasicImpl & operator=(const OcvDftBasicImpl & other)
+    {
+        this->opt = other.opt;
+        return *this;
+    }
+    void init(int len, int count, int depth, int flags, bool *needBuffer)
+    {
+        int prev_len = opt.n;
+
+        int stage = (flags & CV_HAL_DFT_STAGE_COLS) != 0 ? 1 : 0;
+        int complex_elem_size = depth == CV_32F ? sizeof(Complex<float>) : sizeof(Complex<double>);
+        opt.isInverse = (flags & CV_HAL_DFT_INVERSE) != 0;
+        bool real_transform = (flags & CV_HAL_DFT_REAL_OUTPUT) != 0;
+        opt.isComplex = (stage == 0) && (flags & CV_HAL_DFT_COMPLEX_OUTPUT) != 0;
+        bool needAnotherStage = (flags & CV_HAL_DFT_TWO_STAGE) != 0;
+
+        opt.scale = 1;
+        opt.tab_size = len;
+        opt.n = len;
+
+        opt.useIpp = false;
+    #ifdef USE_IPP_DFT
+        opt.ipp_spec = 0;
+        opt.ipp_work = 0;
+
+        if( CV_IPP_CHECK_COND && (opt.n*count >= 64) ) // use IPP DFT if available
         {
+            int ipp_norm_flag = (flags & CV_HAL_DFT_SCALE) == 0 ? 8 : opt.isInverse ? 2 : 1;
             int specsize=0, initsize=0, worksize=0;
             IppDFTGetSizeFunc getSizeFunc = 0;
             IppDFTInitFunc initFunc = 0;
@@ -2642,260 +3126,266 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
                     initFunc = (IppDFTInitFunc)ippsDFTInit_C_64fc;
                 }
             }
-            if( getSizeFunc(len, ipp_norm_flag, ippAlgHintNone, &specsize, &initsize, &worksize) >= 0 )
+            if( getSizeFunc(opt.n, ipp_norm_flag, ippAlgHintNone, &specsize, &initsize, &worksize) >= 0 )
             {
                 ippbuf.allocate(specsize + initsize + 64);
-                spec = alignPtr(&ippbuf[0], 32);
-                uchar* initbuf = alignPtr((uchar*)spec + specsize, 32);
-                if( initFunc(len, ipp_norm_flag, ippAlgHintNone, spec, initbuf) < 0 )
-                    spec = 0;
-                sz += worksize;
+                opt.ipp_spec = alignPtr(&ippbuf[0], 32);
+                ippworkbuf.allocate(worksize + 32);
+                opt.ipp_work = alignPtr(&ippworkbuf[0], 32);
+                uchar* initbuf = alignPtr((uchar*)opt.ipp_spec + specsize, 32);
+                if( initFunc(opt.n, ipp_norm_flag, ippAlgHintNone, opt.ipp_spec, initbuf) >= 0 )
+                    opt.useIpp = true;
             }
             else
                 setIppErrorStatus();
         }
+    #endif
+
+        if (!opt.useIpp)
+        {
+            if (len != prev_len)
+            {
+                opt.nf = DFTFactorize( opt.n, opt.factors );
+            }
+            bool inplace_transform = opt.factors[0] == opt.factors[opt.nf-1];
+            if (len != prev_len || (!inplace_transform && opt.isInverse && real_transform))
+            {
+                wave_buf.allocate(opt.n*complex_elem_size);
+                opt.wave = wave_buf;
+                itab_buf.allocate(opt.n);
+                opt.itab = itab_buf;
+                DFTInit( opt.n, opt.nf, opt.factors, opt.itab, complex_elem_size,
+                         opt.wave, stage == 0 && opt.isInverse && real_transform );
+            }
+            // otherwise reuse the tables calculated on the previous stage
+            if (needBuffer)
+            {
+                if( (stage == 0 && ((*needBuffer && !inplace_transform) || (real_transform && (len & 1)))) ||
+                    (stage == 1 && !inplace_transform) )
+                {
+                    *needBuffer = true;
+                }
+            }
+        }
         else
-#endif
         {
-            if( len != prev_len )
-                nf = DFTFactorize( len, factors );
-
-            inplace_transform = factors[0] == factors[nf-1];
-            sz += len*(complex_elem_size + sizeof(int));
-            i = nf > 1 && (factors[0] & 1) == 0;
-            if( (factors[i] & 1) != 0 && factors[i] > 5 )
-                sz += (factors[i]+1)*complex_elem_size;
-
-            if( (stage == 0 && ((src.data == dst.data && !inplace_transform) || odd_real)) ||
-                (stage == 1 && !inplace_transform) )
+            if (needBuffer)
             {
-                use_buf = 1;
-                sz += len*complex_elem_size;
+                *needBuffer = false;
             }
         }
 
-        ptr = (uchar*)buf;
-        buf.allocate( sz + 32 );
-        if( ptr != (uchar*)buf )
-            prev_len = 0; // because we release the buffer,
-                          // force recalculation of
-                          // twiddle factors and permutation table
-        ptr = (uchar*)buf;
-        if( !spec )
         {
-            wave = ptr;
-            ptr += len*complex_elem_size;
-            itab = (int*)ptr;
-            ptr = (uchar*)cvAlignPtr( ptr + len*sizeof(int), 16 );
+            static DFTFunc dft_tbl[6] =
+            {
+                (DFTFunc)DFT_32f,
+                (DFTFunc)RealDFT_32f,
+                (DFTFunc)CCSIDFT_32f,
+                (DFTFunc)DFT_64f,
+                (DFTFunc)RealDFT_64f,
+                (DFTFunc)CCSIDFT_64f
+            };
+            int idx = 0;
+            if (stage == 0)
+            {
+                if (real_transform)
+                {
+                    if (!opt.isInverse)
+                        idx = 1;
+                    else
+                        idx = 2;
+                }
+            }
+            if (depth == CV_64F)
+                idx += 3;
 
-            if( len != prev_len || (!inplace_transform && inv && real_transform))
-                DFTInit( len, nf, factors, itab, complex_elem_size,
-                            wave, stage == 0 && inv && real_transform );
-            // otherwise reuse the tables calculated on the previous stage
+            opt.dft_func = dft_tbl[idx];
         }
 
-        if( stage == 0 )
+        if(!needAnotherStage && (flags & CV_HAL_DFT_SCALE) != 0)
         {
-            uchar* tmp_buf = 0;
-            int dptr_offset = 0;
-            int dst_full_len = len*elem_size;
-            int _flags = (int)inv + (src.channels() != dst.channels() ?
-                         DFT_COMPLEX_INPUT_OR_OUTPUT : 0);
-            if( use_buf )
-            {
-                tmp_buf = ptr;
-                ptr += len*complex_elem_size;
-                if( odd_real && !inv && len > 1 &&
-                    !(_flags & DFT_COMPLEX_INPUT_OR_OUTPUT))
-                    dptr_offset = elem_size;
-            }
+            int rowCount = count;
+            if (stage == 0 && (flags & CV_HAL_DFT_ROWS) != 0)
+                rowCount = 1;
+            opt.scale = 1./(len * rowCount);
+        }
+    }
 
-            if( !inv && (_flags & DFT_COMPLEX_INPUT_OR_OUTPUT) )
-                dst_full_len += (len & 1) ? elem_size : complex_elem_size;
+    void run(const void * src, void * dst)
+    {
+        opt.dft_func(opt, src, dst);
+    }
 
-            dft_func = dft_tbl[(!real_transform ? 0 : !inv ? 1 : 2) + (depth == CV_64F)*3];
+    void free() {}
+};
 
-            if( count > 1 && !(flags & DFT_ROWS) && (!inv || !real_transform) )
-                stage = 1;
-            else if( flags & CV_DXT_SCALE )
-                scale = 1./(len * (flags & DFT_ROWS ? 1 : count));
+namespace hal {
 
-            if( nonzero_rows <= 0 || nonzero_rows > count )
-                nonzero_rows = count;
+//================== 1D ======================
 
-            for( i = 0; i < nonzero_rows; i++ )
-            {
-                const uchar* sptr = src.ptr(i);
-                uchar* dptr0 = dst.ptr(i);
-                uchar* dptr = dptr0;
+void dftInit(DftContext & context, int len, int count, int depth, int flags, bool *needBuffer)
+{
+    int res = cv_hal_dftInit(&context.impl, len, count, depth, flags, needBuffer);
+    if (res == CV_HAL_ERROR_OK)
+    {
+        context.useReplacement = true;
+        return;
+    }
 
-                if( tmp_buf )
-                    dptr = tmp_buf;
+    context.useReplacement = false;
+    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
+    if (!c)
+    {
+        c = new OcvDftBasicImpl();
+        context.impl = (void*)c;
+    }
+    c->init(len, count, depth, flags, needBuffer);
+}
 
-                dft_func( sptr, dptr, len, nf, factors, itab, wave, len, spec, ptr, _flags, scale );
-                if( dptr != dptr0 )
-                    memcpy( dptr0, dptr + dptr_offset, dst_full_len );
-            }
+void dftRun(const DftContext & context, const void * src, void * dst)
+{
+    if (context.useReplacement)
+    {
+        int res = cv_hal_dftRun(context.impl, src, dst);
+        if (res != CV_HAL_ERROR_OK)
+        {
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun");
+        }
+        return;
+    }
+    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
+    c->run(src, dst);
+}
 
-            for( ; i < count; i++ )
-            {
-                uchar* dptr0 = dst.ptr(i);
-                memset( dptr0, 0, dst_full_len );
-            }
+void dftFree(DftContext & context)
+{
+    if (context.useReplacement)
+    {
+        int res = cv_hal_dftFree(context.impl);
+        if (res != CV_HAL_ERROR_OK)
+        {
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftFree");
+        }
+        return;
+    }
 
-            if( stage != 1 )
-            {
-                if( !inv && real_transform && dst.channels() == 2 )
-                    complementComplexOutput(dst, nonzero_rows, 1);
-                break;
-            }
-            src = dst;
+    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
+    if (c)
+    {
+        c->free();
+        delete c;
+        context.impl = 0;
+    }
+}
+
+
+//================== 2D ======================
+
+void dftInit2D(DftContext & c,
+               int _width, int _height, int _depth, int _src_channels, int _dst_channels,
+               int flags,
+               int _nonzero_rows)
+{
+    int res = cv_hal_dftInit2D(&c.impl, _width, _height, _depth, _src_channels, _dst_channels, flags, _nonzero_rows);
+    if (res == CV_HAL_ERROR_OK)
+    {
+        c.useReplacement = true;
+        return;
+    }
+    c.useReplacement = false;
+
+    if( _width == 1 && _nonzero_rows > 0 )
+        CV_Error( CV_StsNotImplemented,
+        "This mode (using nonzero_rows with a single-column matrix) breaks the function's logic, so it is prohibited.\n"
+        "For fast convolution/correlation use 2-column matrix or single-row matrix instead" );
+
+    OcvDftImpl * d = new OcvDftImpl();
+    d->init(_width, _height, _depth, _src_channels, _dst_channels, flags, _nonzero_rows);
+    c.impl = (void*)d;
+}
+
+void dftRun2D(const DftContext & c,
+              const void * src, int src_step, void * dst, int dst_step)
+{
+    if (c.useReplacement)
+    {
+        int res = cv_hal_dftRun2D(c.impl, (uchar*)src, src_step, (uchar*)dst, dst_step);
+        if (res != CV_HAL_ERROR_OK)
+        {
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun2D");
         }
-        else
+        return;
+    }
+    OcvDftImpl * d = (OcvDftImpl*)c.impl;
+    d->run((uchar*)src, src_step, (uchar*)dst, dst_step);
+}
+
+void dftFree2D(DftContext & c)
+{
+    if (c.useReplacement)
+    {
+        int res = cv_hal_dftFree2D(c.impl);
+        if (res != CV_HAL_ERROR_OK)
         {
-            int a = 0, b = count;
-            uchar *buf0, *buf1, *dbuf0, *dbuf1;
-            const uchar* sptr0 = src.ptr();
-            uchar* dptr0 = dst.ptr();
-            buf0 = ptr;
-            ptr += len*complex_elem_size;
-            buf1 = ptr;
-            ptr += len*complex_elem_size;
-            dbuf0 = buf0, dbuf1 = buf1;
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftFree2D");
+        }
+        return;
+    }
+    OcvDftImpl * d = (OcvDftImpl*)c.impl;
+    d->free();
+    delete d;
+    c.impl = 0;
+}
 
-            if( use_buf )
-            {
-                dbuf1 = ptr;
-                dbuf0 = buf1;
-                ptr += len*complex_elem_size;
-            }
+} // cv::hal::
 
-            dft_func = dft_tbl[(depth == CV_64F)*3];
+} // cv::
 
-            if( real_transform && inv && src.cols > 1 )
-                stage = 0;
-            else if( flags & CV_DXT_SCALE )
-                scale = 1./(len * count);
 
-            if( real_transform )
-            {
-                int even;
-                a = 1;
-                even = (count & 1) == 0;
-                b = (count+1)/2;
-                if( !inv )
-                {
-                    memset( buf0, 0, len*complex_elem_size );
-                    CopyColumn( sptr0, src.step, buf0, complex_elem_size, len, elem_size );
-                    sptr0 += dst.channels()*elem_size;
-                    if( even )
-                    {
-                        memset( buf1, 0, len*complex_elem_size );
-                        CopyColumn( sptr0 + (count-2)*elem_size, src.step,
-                                    buf1, complex_elem_size, len, elem_size );
-                    }
-                }
-                else if( src.channels() == 1 )
-                {
-                    CopyColumn( sptr0, src.step, buf0, elem_size, len, elem_size );
-                    ExpandCCS( buf0, len, elem_size );
-                    if( even )
-                    {
-                        CopyColumn( sptr0 + (count-1)*elem_size, src.step,
-                                    buf1, elem_size, len, elem_size );
-                        ExpandCCS( buf1, len, elem_size );
-                    }
-                    sptr0 += elem_size;
-                }
-                else
-                {
-                    CopyColumn( sptr0, src.step, buf0, complex_elem_size, len, complex_elem_size );
-                    if( even )
-                    {
-                        CopyColumn( sptr0 + b*complex_elem_size, src.step,
-                                       buf1, complex_elem_size, len, complex_elem_size );
-                    }
-                    sptr0 += complex_elem_size;
-                }
+void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
+{
+#ifdef HAVE_CLAMDFFT
+    CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
+            _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
+               ocl_dft_amdfft(_src0, _dst, flags))
+#endif
 
-                if( even )
-                    dft_func( buf1, dbuf1, len, nf, factors, itab,
-                              wave, len, spec, ptr, inv, scale );
-                dft_func( buf0, dbuf0, len, nf, factors, itab,
-                          wave, len, spec, ptr, inv, scale );
+#ifdef HAVE_OPENCL
+    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
+               ocl_dft(_src0, _dst, flags, nonzero_rows))
+#endif
 
-                if( dst.channels() == 1 )
-                {
-                    if( !inv )
-                    {
-                        // copy the half of output vector to the first/last column.
-                        // before doing that, defgragment the vector
-                        memcpy( dbuf0 + elem_size, dbuf0, elem_size );
-                        CopyColumn( dbuf0 + elem_size, elem_size, dptr0,
-                                       dst.step, len, elem_size );
-                        if( even )
-                        {
-                            memcpy( dbuf1 + elem_size, dbuf1, elem_size );
-                            CopyColumn( dbuf1 + elem_size, elem_size,
-                                           dptr0 + (count-1)*elem_size,
-                                           dst.step, len, elem_size );
-                        }
-                        dptr0 += elem_size;
-                    }
-                    else
-                    {
-                        // copy the real part of the complex vector to the first/last column
-                        CopyColumn( dbuf0, complex_elem_size, dptr0, dst.step, len, elem_size );
-                        if( even )
-                            CopyColumn( dbuf1, complex_elem_size, dptr0 + (count-1)*elem_size,
-                                           dst.step, len, elem_size );
-                        dptr0 += elem_size;
-                    }
-                }
-                else
-                {
-                    assert( !inv );
-                    CopyColumn( dbuf0, complex_elem_size, dptr0,
-                                   dst.step, len, complex_elem_size );
-                    if( even )
-                        CopyColumn( dbuf1, complex_elem_size,
-                                       dptr0 + b*complex_elem_size,
-                                       dst.step, len, complex_elem_size );
-                    dptr0 += complex_elem_size;
-                }
-            }
+    Mat src0 = _src0.getMat(), src = src0;
+    bool inv = (flags & DFT_INVERSE) != 0;
+    int type = src.type();
+    int depth = src.depth();
 
-            for( i = a; i < b; i += 2 )
-            {
-                if( i+1 < b )
-                {
-                    CopyFrom2Columns( sptr0, src.step, buf0, buf1, len, complex_elem_size );
-                    dft_func( buf1, dbuf1, len, nf, factors, itab,
-                              wave, len, spec, ptr, inv, scale );
-                }
-                else
-                    CopyColumn( sptr0, src.step, buf0, complex_elem_size, len, complex_elem_size );
+    CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );
 
-                dft_func( buf0, dbuf0, len, nf, factors, itab,
-                          wave, len, spec, ptr, inv, scale );
+    if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
+        _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
+    else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
+        _dst.create( src.size(), depth );
+    else
+        _dst.create( src.size(), type );
 
-                if( i+1 < b )
-                    CopyTo2Columns( dbuf0, dbuf1, dptr0, dst.step, len, complex_elem_size );
-                else
-                    CopyColumn( dbuf0, complex_elem_size, dptr0, dst.step, len, complex_elem_size );
-                sptr0 += 2*complex_elem_size;
-                dptr0 += 2*complex_elem_size;
-            }
+    Mat dst = _dst.getMat();
 
-            if( stage != 0 )
-            {
-                if( !inv && real_transform && dst.channels() == 2 && len > 1 )
-                    complementComplexOutput(dst, len, 2);
-                break;
-            }
-            src = dst;
-        }
-    }
+    int f = 0;
+    if (src.isContinuous() && dst.isContinuous())
+        f |= CV_HAL_DFT_IS_CONTINUOUS;
+    if (inv)
+        f |= CV_HAL_DFT_INVERSE;
+    if (flags & DFT_ROWS)
+        f |= CV_HAL_DFT_ROWS;
+    if (flags & DFT_SCALE)
+        f |= CV_HAL_DFT_SCALE;
+    if (src.data == dst.data)
+        f |= CV_HAL_DFT_IS_INPLACE;
+    hal::DftContext c;
+    hal::dftInit2D(c, src.cols, src.rows, depth, src.channels(), dst.channels(), f, nonzero_rows);
+    hal::dftRun2D(c, src.data, (int)src.step, dst.data, (int)dst.step);
+    hal::dftFree2D(c);
 }
 
 
@@ -3117,11 +3607,12 @@ namespace cv
    http://www.ece.utexas.edu/~bevans/courses/ee381k/lectures/09_DCT/lecture9/:
 */
 template<typename T> static void
-DCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
-     int n, int nf, int* factors, const int* itab, const Complex<T>* dft_wave,
-     const Complex<T>* dct_wave, const void* spec, Complex<T>* buf )
+DCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
+     const Complex<T>* dct_wave )
 {
     static const T sin_45 = (T)0.70710678118654752440084436210485;
+
+    int n = c.n;
     int j, n2 = n >> 1;
 
     src_step /= sizeof(src[0]);
@@ -3140,8 +3631,7 @@ DCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
         dft_src[n-j-1] = src[src_step];
     }
 
-    RealDFT( dft_src, dft_dst, n, nf, factors,
-             itab, dft_wave, n, spec, buf, 0, 1.0 );
+    RealDFT(c, dft_src, dft_dst);
     src = dft_dst;
 
     dst[0] = (T)(src[0]*dct_wave->re*sin_45);
@@ -3160,11 +3650,11 @@ DCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
 
 
 template<typename T> static void
-IDCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
-      int n, int nf, int* factors, const int* itab, const Complex<T>* dft_wave,
-      const Complex<T>* dct_wave, const void* spec, Complex<T>* buf )
+IDCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
+      const Complex<T>* dct_wave)
 {
     static const T sin_45 = (T)0.70710678118654752440084436210485;
+    int n = c.n;
     int j, n2 = n >> 1;
 
     src_step /= sizeof(src[0]);
@@ -3189,8 +3679,7 @@ IDCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
     }
 
     dft_src[n-1] = (T)(src[0]*2*dct_wave->re);
-    CCSIDFT( dft_src, dft_dst, n, nf, factors, itab,
-             dft_wave, n, spec, buf, 0, 1.0 );
+    CCSIDFT(c, dft_src, dft_dst);
 
     for( j = 0; j < n2; j++, dst += dst_step*2 )
     {
@@ -3279,41 +3768,31 @@ DCTInit( int n, int elem_size, void* _wave, int inv )
 }
 
 
-typedef void (*DCTFunc)(const void* src, int src_step, void* dft_src,
-                        void* dft_dst, void* dst, int dst_step, int n,
-                        int nf, int* factors, const int* itab, const void* dft_wave,
-                        const void* dct_wave, const void* spec, void* buf );
+typedef void (*DCTFunc)(const OcvDftOptions & c, const void* src, int src_step, void* dft_src,
+                        void* dft_dst, void* dst, int dst_step, const void* dct_wave);
 
-static void DCT_32f(const float* src, int src_step, float* dft_src, float* dft_dst,
-                    float* dst, int dst_step, int n, int nf, int* factors, const int* itab,
-                    const Complexf* dft_wave, const Complexf* dct_wave, const void* spec, Complexf* buf )
+static void DCT_32f(const OcvDftOptions & c, const float* src, int src_step, float* dft_src, float* dft_dst,
+                    float* dst, int dst_step, const Complexf* dct_wave)
 {
-    DCT(src, src_step, dft_src, dft_dst, dst, dst_step,
-        n, nf, factors, itab, dft_wave, dct_wave, spec, buf);
+    DCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
-static void IDCT_32f(const float* src, int src_step, float* dft_src, float* dft_dst,
-                    float* dst, int dst_step, int n, int nf, int* factors, const int* itab,
-                    const Complexf* dft_wave, const Complexf* dct_wave, const void* spec, Complexf* buf )
+static void IDCT_32f(const OcvDftOptions & c, const float* src, int src_step, float* dft_src, float* dft_dst,
+                    float* dst, int dst_step, const Complexf* dct_wave)
 {
-    IDCT(src, src_step, dft_src, dft_dst, dst, dst_step,
-         n, nf, factors, itab, dft_wave, dct_wave, spec, buf);
+    IDCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
-static void DCT_64f(const double* src, int src_step, double* dft_src, double* dft_dst,
-                    double* dst, int dst_step, int n, int nf, int* factors, const int* itab,
-                    const Complexd* dft_wave, const Complexd* dct_wave, const void* spec, Complexd* buf )
+static void DCT_64f(const OcvDftOptions & c, const double* src, int src_step, double* dft_src, double* dft_dst,
+                    double* dst, int dst_step, const Complexd* dct_wave)
 {
-    DCT(src, src_step, dft_src, dft_dst, dst, dst_step,
-        n, nf, factors, itab, dft_wave, dct_wave, spec, buf);
+    DCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
-static void IDCT_64f(const double* src, int src_step, double* dft_src, double* dft_dst,
-                     double* dst, int dst_step, int n, int nf, int* factors, const int* itab,
-                     const Complexd* dft_wave, const Complexd* dct_wave, const void* spec, Complexd* buf )
+static void IDCT_64f(const OcvDftOptions & c, const double* src, int src_step, double* dft_src, double* dft_dst,
+                     double* dst, int dst_step, const Complexd* dct_wave)
 {
-    IDCT(src, src_step, dft_src, dft_dst, dst, dst_step,
-         n, nf, factors, itab, dft_wave, dct_wave, spec, buf);
+    IDCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
 }
@@ -3336,8 +3815,8 @@ typedef IppStatus (CV_STDCALL * ippiDCTGetBufSize)(const void*, int*);
 class DctIPPLoop_Invoker : public ParallelLoopBody
 {
 public:
-    DctIPPLoop_Invoker(const Mat& _src, Mat& _dst, bool _inv, bool *_ok) :
-        ParallelLoopBody(), src(&_src), dst(&_dst), inv(_inv), ok(_ok)
+    DctIPPLoop_Invoker(const uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width, bool _inv, bool *_ok) :
+        ParallelLoopBody(), src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width), inv(_inv), ok(_ok)
     {
         *ok = true;
     }
@@ -3348,7 +3827,7 @@ public:
             return;
 
 #if IPP_VERSION_X100 >= 900
-        IppiSize srcRoiSize = {src->cols, 1};
+        IppiSize srcRoiSize = {width, 1};
 
         int specSize    = 0;
         int initSize    = 0;
@@ -3405,7 +3884,7 @@ public:
 
         for(int i = range.start; i < range.end; ++i)
         {
-            if(ippDctFun(src->ptr<float>(i), (int)src->step,dst->ptr<float>(i), (int)dst->step, pDCTSpec, pBuffer) < 0)
+            if(ippDctFun((float*)(src + src_step * i), src_step, (float*)(dst + dst_step * i), dst_step, pDCTSpec, pBuffer) < 0)
             {
                 *ok = false;
                 IPP_RETURN
@@ -3419,7 +3898,7 @@ public:
         uchar* pBuffer = 0;
         int bufSize=0;
 
-        IppiSize srcRoiSize = {src->cols, 1};
+        IppiSize srcRoiSize = {width, 1};
 
         CV_SUPPRESS_DEPRECATED_START
 
@@ -3435,7 +3914,7 @@ public:
 
             for( int i = range.start; i < range.end; ++i)
             {
-                if(ippDctFun(src->ptr<float>(i), (int)src->step,dst->ptr<float>(i), (int)dst->step, pDCTSpec, (Ipp8u*)pBuffer) < 0)
+                if(ippDctFun((float*)(src + src_step * i), src_step, (float*)(dst + dst_step * i), dst_step, pDCTSpec, (Ipp8u*)pBuffer) < 0)
                 {
                     *ok = false;
                     break;
@@ -3456,27 +3935,30 @@ public:
     }
 
 private:
-    const Mat* src;
-    Mat* dst;
+    const uchar * src;
+    int src_step;
+    uchar * dst;
+    int dst_step;
+    int width;
     bool inv;
     bool *ok;
 };
 
-static bool DctIPPLoop(const Mat& src, Mat& dst, bool inv)
+static bool DctIPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv)
 {
     bool ok;
-    parallel_for_(Range(0, src.rows), DctIPPLoop_Invoker(src, dst, inv, &ok), src.rows/(double)(1<<4) );
+    parallel_for_(Range(0, height), DctIPPLoop_Invoker(src, src_step, dst, dst_step, width, inv, &ok), height/(double)(1<<4) );
     return ok;
 }
 
-static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
+static bool ippi_DCT_32f(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, bool row)
 {
     if(row)
-        return DctIPPLoop(src, dst, inv);
+        return DctIPPLoop(src, src_step, dst, dst_step, width, height, inv);
     else
     {
 #if IPP_VERSION_X100 >= 900
-        IppiSize srcRoiSize = {src.cols, src.rows};
+        IppiSize srcRoiSize = {width, height};
 
         int specSize    = 0;
         int initSize    = 0;
@@ -3524,7 +4006,7 @@ static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
             return false;
         }
 
-        if(ippDctFun(src.ptr<float>(), (int)src.step,dst.ptr<float>(), (int)dst.step, pDCTSpec, pBuffer) < 0)
+        if(ippDctFun((float*)src, src_step, (float*)dst, dst_step, pDCTSpec, pBuffer) < 0)
         {
             IPP_RELEASE
             return false;
@@ -3540,7 +4022,7 @@ static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
         uchar* pBuffer = 0;
         int bufSize=0;
 
-        IppiSize srcRoiSize = {src.cols, src.rows};
+        IppiSize srcRoiSize = {width, height};
 
         CV_SUPPRESS_DEPRECATED_START
 
@@ -3556,7 +4038,7 @@ static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
             buf.allocate( bufSize );
             pBuffer = (uchar*)buf;
 
-            status = ippDctFun(src.ptr<float>(), (int)src.step, dst.ptr<float>(), (int)dst.step, pDCTSpec, (Ipp8u*)pBuffer);
+            status = ippDctFun((float*)src, src_step, (float*)dst, dst_step, pDCTSpec, (Ipp8u*)pBuffer);
         }
 
         if (pDCTSpec)
@@ -3574,140 +4056,219 @@ static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
 }
 #endif
 
-void cv::dct( InputArray _src0, OutputArray _dst, int flags )
-{
-    static DCTFunc dct_tbl[4] =
-    {
-        (DCTFunc)DCT_32f,
-        (DCTFunc)IDCT_32f,
-        (DCTFunc)DCT_64f,
-        (DCTFunc)IDCT_64f
-    };
-
-    bool inv = (flags & DCT_INVERSE) != 0;
-    Mat src0 = _src0.getMat(), src = src0;
-    int type = src.type(), depth = src.depth();
-    void *spec = 0;
-
-    double scale = 1.;
-    int prev_len = 0, nf = 0, stage, end_stage;
-    uchar *src_dft_buf = 0, *dst_dft_buf = 0;
-    uchar *dft_wave = 0, *dct_wave = 0;
-    int* itab = 0;
-    uchar* ptr = 0;
-    int elem_size = (int)src.elemSize(), complex_elem_size = elem_size*2;
-    int factors[34], inplace_transform;
-    int i, len, count;
-    AutoBuffer<uchar> buf;
-
-    CV_Assert( type == CV_32FC1 || type == CV_64FC1 );
-    _dst.create( src.rows, src.cols, type );
-    Mat dst = _dst.getMat();
-
-    CV_IPP_RUN(IPP_VERSION_X100 >= 700 && src.type() == CV_32F, ippi_DCT_32f(src, dst, inv, ((flags & DCT_ROWS) != 0)))
-
-    DCTFunc dct_func = dct_tbl[(int)inv + (depth == CV_64F)*2];
-
-    if( (flags & DCT_ROWS) || src.rows == 1 ||
-        (src.cols == 1 && (src.isContinuous() && dst.isContinuous())))
-    {
-        stage = end_stage = 0;
-    }
-    else
-    {
-        stage = src.cols == 1;
-        end_stage = 1;
-    }
+namespace cv {
 
-    for( ; stage <= end_stage; stage++ )
+class OcvDctImpl
+{
+public:
+    OcvDftOptions opt;
+
+    int _factors[34];
+    AutoBuffer<uint> wave_buf;
+    AutoBuffer<int> itab_buf;
+
+    DCTFunc dct_func;
+    bool isRowTransform;
+    bool isInverse;
+    bool isContinuous;
+    int start_stage;
+    int end_stage;
+    int width;
+    int height;
+    int depth;
+
+    void init(int _width, int _height, int _depth, int flags)
     {
-        const uchar* sptr = src.ptr();
-        uchar* dptr = dst.ptr();
-        size_t sstep0, sstep1, dstep0, dstep1;
-
-        if( stage == 0 )
+        width = _width;
+        height = _height;
+        depth = _depth;
+        isInverse = (flags & CV_HAL_DFT_INVERSE) != 0;
+        isRowTransform = (flags & CV_HAL_DFT_ROWS) != 0;
+        isContinuous = (flags & CV_HAL_DFT_IS_CONTINUOUS) != 0;
+        static DCTFunc dct_tbl[4] =
         {
-            len = src.cols;
-            count = src.rows;
-            if( len == 1 && !(flags & DCT_ROWS) )
-            {
-                len = src.rows;
-                count = 1;
-            }
-            sstep0 = src.step;
-            dstep0 = dst.step;
-            sstep1 = dstep1 = elem_size;
+            (DCTFunc)DCT_32f,
+            (DCTFunc)IDCT_32f,
+            (DCTFunc)DCT_64f,
+            (DCTFunc)IDCT_64f
+        };
+        dct_func = dct_tbl[(int)isInverse + (depth == CV_64F)*2];
+        opt.nf = 0;
+        opt.isComplex = false;
+        opt.isInverse = false;
+        opt.noPermute = false;
+        opt.scale = 1.;
+        opt.factors = _factors;
+
+        if (isRowTransform || height == 1 || (width == 1 && isContinuous))
+        {
+            start_stage = end_stage = 0;
         }
         else
         {
-            len = dst.rows;
-            count = dst.cols;
-            sstep1 = src.step;
-            dstep1 = dst.step;
-            sstep0 = dstep0 = elem_size;
+            start_stage = (width == 1);
+            end_stage = 1;
         }
+    }
+    void run(uchar * src, int src_step, uchar * dst, int dst_step)
+    {
+        CV_IPP_RUN(IPP_VERSION_X100 >= 700 && depth == CV_32F, ippi_DCT_32f(src, src_step, dst, dst_step, width, height, isInverse, isRowTransform))
 
-        if( len != prev_len )
-        {
-            int sz;
-
-            if( len > 1 && (len & 1) )
-                CV_Error( CV_StsNotImplemented, "Odd-size DCT\'s are not implemented" );
+        AutoBuffer<uchar> dct_wave;
+        AutoBuffer<uchar> src_buf, dst_buf;
+        uchar *src_dft_buf = 0, *dst_dft_buf = 0;
+        int prev_len = 0;
+        int elem_size = (depth == CV_32F) ? sizeof(float) : sizeof(double);
+        int complex_elem_size = elem_size*2;
 
-            sz = len*elem_size;
-            sz += (len/2 + 1)*complex_elem_size;
+        for(int stage = start_stage ; stage <= end_stage; stage++ )
+        {
+            const uchar* sptr = src;
+            uchar* dptr = dst;
+            size_t sstep0, sstep1, dstep0, dstep1;
+            int len, count;
 
-            spec = 0;
-            inplace_transform = 1;
+            if( stage == 0 )
             {
-                sz += len*(complex_elem_size + sizeof(int)) + complex_elem_size;
-
-                nf = DFTFactorize( len, factors );
-                inplace_transform = factors[0] == factors[nf-1];
-
-                i = nf > 1 && (factors[0] & 1) == 0;
-                if( (factors[i] & 1) != 0 && factors[i] > 5 )
-                    sz += (factors[i]+1)*complex_elem_size;
-
-                if( !inplace_transform )
-                    sz += len*elem_size;
+                len = width;
+                count = height;
+                if( len == 1 && !isRowTransform )
+                {
+                    len = height;
+                    count = 1;
+                }
+                sstep0 = src_step;
+                dstep0 = dst_step;
+                sstep1 = dstep1 = elem_size;
+            }
+            else
+            {
+                len = height;
+                count = width;
+                sstep1 = src_step;
+                dstep1 = dst_step;
+                sstep0 = dstep0 = elem_size;
             }
 
-            buf.allocate( sz + 32 );
-            ptr = (uchar*)buf;
+            opt.n = len;
+            opt.tab_size = len;
 
-            if( !spec )
+            if( len != prev_len )
             {
-                dft_wave = ptr;
-                ptr += len*complex_elem_size;
-                itab = (int*)ptr;
-                ptr = (uchar*)cvAlignPtr( ptr + len*sizeof(int), 16 );
-                DFTInit( len, nf, factors, itab, complex_elem_size, dft_wave, inv );
+                if( len > 1 && (len & 1) )
+                    CV_Error( CV_StsNotImplemented, "Odd-size DCT\'s are not implemented" );
+
+                opt.nf = DFTFactorize( len, opt.factors );
+                bool inplace_transform = opt.factors[0] == opt.factors[opt.nf-1];
+
+                wave_buf.allocate(len*complex_elem_size);
+                opt.wave = wave_buf;
+                itab_buf.allocate(len);
+                opt.itab = itab_buf;
+                DFTInit( len, opt.nf, opt.factors, opt.itab, complex_elem_size, opt.wave, isInverse );
+
+                dct_wave.allocate((len/2 + 1)*complex_elem_size);
+                src_buf.allocate(len*elem_size);
+                src_dft_buf = src_buf;
+                if(!inplace_transform)
+                {
+                    dst_buf.allocate(len*elem_size);
+                    dst_dft_buf = dst_buf;
+                }
+                else
+                {
+                    dst_dft_buf = src_buf;
+                }
+                DCTInit( len, complex_elem_size, dct_wave, isInverse);
+                prev_len = len;
             }
-
-            dct_wave = ptr;
-            ptr += (len/2 + 1)*complex_elem_size;
-            src_dft_buf = dst_dft_buf = ptr;
-            ptr += len*elem_size;
-            if( !inplace_transform )
+            // otherwise reuse the tables calculated on the previous stage
+            for(int i = 0; i < count; i++ )
             {
-                dst_dft_buf = ptr;
-                ptr += len*elem_size;
+                dct_func( opt, sptr + i*sstep0, (int)sstep1, src_dft_buf, dst_dft_buf,
+                          dptr + i*dstep0, (int)dstep1, dct_wave);
             }
-            DCTInit( len, complex_elem_size, dct_wave, inv );
-            if( !inv )
-                scale += scale;
-            prev_len = len;
+            src = dst;
+            src_step = dst_step;
         }
-        // otherwise reuse the tables calculated on the previous stage
-        for( i = 0; i < count; i++ )
+
+    }
+    void free() {}
+};
+
+namespace hal {
+
+void dctInit(DftContext & c, int width, int height, int depth, int flags)
+{
+    int res = cv_hal_dctInit(&c.impl, width, height, depth, flags);
+    if (res == CV_HAL_ERROR_OK)
+    {
+        c.useReplacement = true;
+        return;
+    }
+    c.useReplacement = false;
+    OcvDctImpl * impl = new OcvDctImpl();
+    impl->init(width, height, depth, flags);
+    c.impl = impl;
+}
+
+void dctRun(const DftContext & c, const void * src, int src_step, void * dst, int dst_step)
+{
+    if (c.useReplacement)
+    {
+        int res = cv_hal_dctRun(c.impl, src, src_step, dst, dst_step);
+        if (res != CV_HAL_ERROR_OK)
         {
-            dct_func( sptr + i*sstep0, (int)sstep1, src_dft_buf, dst_dft_buf,
-                      dptr + i*dstep0, (int)dstep1, len, nf, factors,
-                      itab, dft_wave, dct_wave, spec, ptr );
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctRun");
         }
-        src = dst;
+        return;
     }
+    OcvDctImpl * impl = (OcvDctImpl*)c.impl;
+    impl->run((uchar*)src, src_step, (uchar*)dst, dst_step);
+}
+
+void dctFree(DftContext & c)
+{
+    if (c.useReplacement)
+    {
+        int res = cv_hal_dctFree(c.impl);
+        if (res != CV_HAL_ERROR_OK)
+        {
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctFree");
+        }
+        return;
+    }
+    OcvDctImpl * impl = (OcvDctImpl*)c.impl;
+    impl->free();
+    delete impl;
+    c.impl = 0;
+}
+
+} // cv::hal::
+
+} // cv::
+
+void cv::dct( InputArray _src0, OutputArray _dst, int flags )
+{
+    Mat src0 = _src0.getMat(), src = src0;
+    int type = src.type(), depth = src.depth();
+
+    CV_Assert( type == CV_32FC1 || type == CV_64FC1 );
+    _dst.create( src.rows, src.cols, type );
+    Mat dst = _dst.getMat();
+
+    int f = 0;
+    if ((flags & DFT_ROWS) != 0)
+        f |= CV_HAL_DFT_ROWS;
+    if ((flags & DCT_INVERSE) != 0)
+        f |= CV_HAL_DFT_INVERSE;
+    if (src.isContinuous() && dst.isContinuous())
+        f |= CV_HAL_DFT_IS_CONTINUOUS;
+
+    hal::DftContext c;
+    hal::dctInit(c, src.cols, src.rows, depth, f);
+    hal::dctRun(c, (void*)src.data, (int)src.step, (void*)dst.data, (int)dst.step);
+    hal::dctFree(c);
 }
 
 
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index 69345ca..d4d4333 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -384,6 +384,31 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
 #  pragma warning( pop )
 #endif
 
+inline int hal_ni_dftInit(void**, int, int, int, int, bool*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftRun(const void*, const void*, void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftFree(void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+#define cv_hal_dftInit hal_ni_dftInit
+#define cv_hal_dftRun hal_ni_dftRun
+#define cv_hal_dftFree hal_ni_dftFree
+
+inline int hal_ni_dftInit2D(void **, int, int, int, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftRun2D(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftFree2D(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+#define cv_hal_dftInit2D hal_ni_dftInit2D
+#define cv_hal_dftRun2D hal_ni_dftRun2D
+#define cv_hal_dftFree2D hal_ni_dftFree2D
+
+
+inline int hal_ni_dctInit(void **, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dctRun(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dctFree(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+#define cv_hal_dctInit hal_ni_dctInit
+#define cv_hal_dctRun hal_ni_dctRun
+#define cv_hal_dctFree hal_ni_dctFree
+
 #include "custom_hal.hpp"
 
 #endif
diff --git a/modules/core/test/test_dxt.cpp b/modules/core/test/test_dxt.cpp
index ad75e52..45994e0 100644
--- a/modules/core/test/test_dxt.cpp
+++ b/modules/core/test/test_dxt.cpp
@@ -887,3 +887,79 @@ TEST(Core_DFT, complex_output2)
         }
     }
 }
+
+class Core_DXTReverseTest : public cvtest::BaseTest
+{
+public:
+    enum Mode
+    {
+        ModeDFT,
+        ModeDCT
+    };
+    Core_DXTReverseTest(Mode m) : mode(m) {}
+private:
+    Mode mode;
+protected:
+    void run(int)
+    {
+        for (int i = 0; i < 3; ++i)
+        {
+            if (mode == ModeDCT && i != 0)
+                continue;
+            int flags = 0;
+            int flags_inv = DFT_INVERSE | DFT_SCALE;
+            int cn_in = 0;
+            int cn_out = 0;
+            switch (i)
+            {
+                case 0: cn_in = 1; cn_out = 1; break;
+                case 1: cn_in = 1; cn_out = 2; flags |= DFT_COMPLEX_OUTPUT; flags_inv |= DFT_REAL_OUTPUT; break;
+                case 2: cn_in = 2; cn_out = 2; break;
+            };
+            for (int j = 0; j < 100; ++j)
+            {
+                RNG& rng = ts->get_rng();
+                int type = rng.uniform(0, 2) ? CV_64F : CV_32F;
+                int m = rng.uniform(1, 10);
+                int n = rng.uniform(1, 10);
+                if (mode == ModeDCT)
+                {
+                    m *= 2;
+                    n *= 2;
+                }
+                Mat one(m, n, CV_MAKETYPE(type, cn_in));
+                cvtest::randUni(rng, one, Scalar::all(-1.), Scalar::all(1.));
+                Mat out;
+                Mat two;
+                if (mode == ModeDFT)
+                {
+                    cv::dft(one, out, flags);
+                    cv::dft(out, two, flags_inv);
+                }
+                else if (mode == ModeDCT)
+                {
+                    cv::dct(one, out, flags);
+                    cv::dct(out, two, flags_inv);
+                }
+                if (out.channels() != cn_out || two.channels() != cn_in || cvtest::norm(one, two, NORM_INF) > 1e-5)
+                {
+                    cout << "Test #" << j + 1 << " - "
+                        << "elements: " << m << " x " << n << ", "
+                        << "channels: "
+                        << one.channels() << " (" << cn_in << ")" << " -> "
+                        << out.channels() << " (" << cn_out << ")" << " -> "
+                        << two.channels() << " (" << cn_in << ")"
+                        << endl;
+                    cout << "signal:\n" << one << endl << endl;
+                    cout << "spectrum:\n" << out << endl << endl;
+                    cout << "inverse:\n" << two << endl << endl;
+                    ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+                    break;
+                }
+            }
+        }
+    }
+};
+
+TEST(Core_DFT, reverse) { Core_DXTReverseTest test(Core_DXTReverseTest::ModeDFT); test.safe_run(); }
+TEST(Core_DCT, reverse) { Core_DXTReverseTest test(Core_DXTReverseTest::ModeDCT); test.safe_run(); }
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 59b0703..64742ea 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -632,6 +632,8 @@ static bool ipp_sqrDistance(const Mat& src, const Mat& tpl, Mat& dst)
 
 #endif
 
+#include "opencv2/core/hal/hal.hpp"
+
 void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                 Size corrsize, int ctype,
                 Point anchor, double delta, int borderType )
@@ -698,6 +700,9 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
 
     buf.resize(bufSize);
 
+    hal::DftContext c;
+    hal::dftInit2D(c, dftsize.width, dftsize.height, dftTempl.depth(), 1, 1, CV_HAL_DFT_IS_INPLACE, templ.rows);
+
     // compute DFT of each template plane
     for( k = 0; k < tcn; k++ )
     {
@@ -721,9 +726,11 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             Mat part(dst, Range(0, templ.rows), Range(templ.cols, dst.cols));
             part = Scalar::all(0);
         }
-        dft(dst, dst, 0, templ.rows);
+        hal::dftRun2D(c, dst.data, (int)dst.step, dst.data, (int)dst.step);
     }
 
+    hal::dftFree2D(c);
+
     int tileCountX = (corr.cols + blocksize.width - 1)/blocksize.width;
     int tileCountY = (corr.rows + blocksize.height - 1)/blocksize.height;
     int tileCount = tileCountX * tileCountY;
@@ -740,6 +747,16 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
     }
     borderType |= BORDER_ISOLATED;
 
+    bool useHalDft = tileCount > 1;
+    hal::DftContext cF, cR;
+    if (useHalDft)
+    {
+        int f = CV_HAL_DFT_IS_INPLACE;
+        int f_inv = f | CV_HAL_DFT_INVERSE | CV_HAL_DFT_SCALE;
+        hal::dftInit2D(cF, dftsize.width, dftsize.height, maxDepth, 1, 1, f, blocksize.height + templ.rows - 1);
+        hal::dftInit2D(cR, dftsize.width, dftsize.height, maxDepth, 1, 1, f_inv, blocksize.height);
+    }
+
     // calculate correlation by blocks
     for( i = 0; i < tileCount; i++ )
     {
@@ -777,11 +794,19 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                 copyMakeBorder(dst1, dst, y1-y0, dst.rows-dst1.rows-(y1-y0),
                                x1-x0, dst.cols-dst1.cols-(x1-x0), borderType);
 
-            dft( dftImg, dftImg, 0, dsz.height );
+            if (useHalDft && bsz.height == blocksize.height)
+                hal::dftRun2D(cF, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+            else
+                dft( dftImg, dftImg, 0, dsz.height );
+
             Mat dftTempl1(dftTempl, Rect(0, tcn > 1 ? k*dftsize.height : 0,
                                          dftsize.width, dftsize.height));
             mulSpectrums(dftImg, dftTempl1, dftImg, 0, true);
-            dft( dftImg, dftImg, DFT_INVERSE + DFT_SCALE, bsz.height );
+
+            if (useHalDft && bsz.height == blocksize.height)
+                hal::dftRun2D(cR, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+            else
+                dft( dftImg, dftImg, DFT_INVERSE + DFT_SCALE, bsz.height );
 
             src = dftImg(Rect(0, 0, bsz.width, bsz.height));
 
@@ -813,6 +838,11 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             }
         }
     }
+    if (useHalDft)
+    {
+        hal::dftFree2D(cF);
+        hal::dftFree2D(cR);
+    }
 }
 
 static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _result, int method, InputArray _mask )
-- 
2.7.4