Fixed

[profile/ivi/opencv.git] / modules / imgproc / src / imgwarp.cpp
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp

index 2c87efe..1953a47 100644 (file)
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -47,8 +47,7 @@
  // */
  
  #include "precomp.hpp"
-#include <iostream>
-#include <vector>
+#include "opencl_kernels.hpp"
  
  #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
  static IppStatus sts = ippInit();
@@ -56,12 +55,14 @@ static IppStatus sts = ippInit();
  
  namespace cv
  {
+#if defined (HAVE_IPP) && ((IPP_VERSION_MAJOR == 7 && IPP_VERSION_MINOR >= 1) || IPP_VERSION_MAJOR > 7)
+    typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
+#endif
  
  #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
      typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
      typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
      typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);
-    typedef IppStatus (CV_STDCALL* ippiResizeSqrPixelFunc)(const void*, IppiSize, int, IppiRect, void*, int, IppiRect, double, double, double, double, int, Ipp8u *);
  
      template <int channels, typename Type>
      bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
@@ -1303,27 +1304,221 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
  template <typename T, typename WT>
  struct ResizeAreaFastNoVec
  {
-    ResizeAreaFastNoVec(int /*_scale_x*/, int /*_scale_y*/,
-        int /*_cn*/, int /*_step*//*, const int**/ /*_ofs*/) { }
-    int operator() (const T* /*S*/, T* /*D*/, int /*w*/) const { return 0; }
+    ResizeAreaFastNoVec(int, int) { } // two-int form, matching how ResizeAreaFastVec builds its vecOp(_cn, _step); arguments ignored
+    ResizeAreaFastNoVec(int, int, int, int) { } // four-int form; all arguments ignored
+    int operator() (const T*, T*, int) const // no-op stand-in for a vectorized row pass
+    { return 0; } // always reports zero elements processed
  };
  
-template<typename T>
+#if CV_SSE2
+class ResizeAreaFastVec_SIMD_8u // SSE2 row helper: rounded mean of 2x2 blocks of 8-bit samples, for 1, 3 or 4 channels
+{
+public:
+    ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
+        cn(_cn), step(_step)
+    {
+        use_simd = checkHardwareSupport(CV_CPU_SSE2); // runtime CPU check; operator() returns 0 when this is false
+    }
+
+    int operator() (const uchar* S, uchar* D, int w) const // returns the number of leading elements of D that were produced (dx)
+    {
+        if (!use_simd)
+            return 0;
+
+        int dx = 0;
+        const uchar* S0 = S; // first source row
+        const uchar* S1 = S0 + step; // second source row, step bytes further on
+        __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi16(2); // rounding term added before the >> 2
+
+        if (cn == 1)
+        {
+            __m128i masklow = _mm_set1_epi16(0x00ff); // keeps the low byte of every 16-bit lane
+            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) // 16 source bytes per row -> 8 output bytes
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); // high byte + low byte of each lane = horizontal pair sum
+                __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
+                s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); // (sum + 2) >> 2, packed back to bytes
+
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        }
+        else if (cn == 3)
+            for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) // two 8-byte stores at D and D+3 touch D[0..10], hence the w - 11 bound
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); // bytes 0..7 widened to 16 bit
+                __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); // bytes 6..13 widened to 16 bit
+                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
+                __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); // add the lanes three positions (one 3-channel pixel) apart
+                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+
+                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
+                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)(D+3), s0); // second store starts 3 bytes into the first one
+            }
+        else
+        {
+            CV_Assert(cn == 4);
+            int v[] = { 0, 0, -1, -1 };
+            __m128i mask = _mm_loadu_si128((const __m128i*)v); // low 64 bits clear, high 64 bits set
+
+            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) // 16 source bytes per row -> 8 output bytes
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
+                __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
+                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
+                __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); // add the lanes four positions (one 4-channel pixel) apart
+                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                __m128i res0 = _mm_srli_epi16(s0, 2);
+
+                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
+                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                __m128i res1 = _mm_srli_epi16(s0, 2);
+                s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), // low half of res0 ...
+                                                   _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); // ... joined with low half of res1 moved into the high half
+                _mm_storel_epi64((__m128i*)(D), s0);
+            }
+        }
+
+        return dx;
+    }
+
+private:
+    int cn;
+    bool use_simd;
+    int step;
+};
+
+class ResizeAreaFastVec_SIMD_16u // SSE2 row helper: rounded mean of 2x2 blocks of 16-bit unsigned samples, for 1, 3 or 4 channels
+{
+public:
+    ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
+        cn(_cn), step(_step)
+    {
+        use_simd = checkHardwareSupport(CV_CPU_SSE2); // runtime CPU check; operator() returns 0 when this is false
+    }
+
+    int operator() (const ushort* S, ushort* D, int w) const // returns the number of leading elements of D that were produced (dx)
+    {
+        if (!use_simd)
+            return 0;
+
+        int dx = 0;
+        const ushort* S0 = (const ushort*)S; // first source row
+        const ushort* S1 = (const ushort*)((const uchar*)(S) + step); // second source row; step is applied as a byte offset
+        __m128i masklow = _mm_set1_epi32(0x0000ffff); // keeps the low 16 bits of every 32-bit lane
+        __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi32(2); // rounding term added before the >> 2
+        // Local macro below: sign-extends the low 16 bits of each lane, then packs with signed saturation, so the low 16 bits survive unchanged.
+#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
+
+        if (cn == 1)
+        {
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) // 8 source samples per row -> 4 outputs
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); // high half + low half of each lane = horizontal pair sum
+                __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
+                s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
+                s0 = _mm_srli_epi32(s0, 2);
+                s0 = _mm_packus_epi32(s0, zero);
+
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        }
+        else if (cn == 3)
+            for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) // the 8-byte store writes 4 samples but only 3 are kept, hence the w - 4 bound
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); // samples 0..3 widened to 32 bit
+                __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); // samples 3..6 widened to 32 bit
+                __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
+                __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
+
+                __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
+                __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
+                s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
+                s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        else
+        {
+            CV_Assert(cn == 4);
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) // 8 source samples per row -> 4 outputs
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); // samples 0..3 widened to 32 bit
+                __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); // samples 4..7 widened to 32 bit
+                __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
+                __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
+
+                __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
+                __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
+                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
+                s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        }
+
+#undef _mm_packus_epi32
+
+        return dx;
+    }
+
+private:
+    int cn;
+    int step;
+    bool use_simd;
+};
+
+#else // CV_SSE2 not enabled: both SIMD helper names resolve to the no-op functor, which always returns 0
+typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
+typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
+#endif
+
+template<typename T, typename SIMDVecOp>
  struct ResizeAreaFastVec
  {
-    ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step/*, const int* _ofs*/) :
-        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)/*, ofs(_ofs)*/
+    ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
+        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
      {
          fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
      }
  
      int operator() (const T* S, T* D, int w) const
      {
-        if( !fast_mode )
+        if (!fast_mode)
              return 0;
  
          const T* nextS = (const T*)((const uchar*)S + step);
-        int dx = 0;
+        int dx = vecOp(S, D, w);
  
          if (cn == 1)
              for( ; dx < w; ++dx )
@@ -1341,7 +1536,7 @@ struct ResizeAreaFastVec
              }
          else
              {
-                assert(cn == 4);
+                CV_Assert(cn == 4);
                  for( ; dx < w; dx += 4 )
                  {
                      int index = dx*2;
@@ -1360,6 +1555,7 @@ private:
      int cn;
      bool fast_mode;
      int step;
+    SIMDVecOp vecOp;
  };
  
  template <typename T, typename WT, typename VecOp>
@@ -1634,7 +1830,7 @@ static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, Dec
      {
          double fsx1 = dx * scale;
          double fsx2 = fsx1 + scale;
-        double cellWidth = min(scale, ssize - fsx1);
+        double cellWidth = std::min(scale, ssize - fsx1);
  
          int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
  
@@ -1662,36 +1858,210 @@ static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, Dec
              assert( k < ssize*2 );
              tab[k].di = dx * cn;
              tab[k].si = sx2 * cn;
-            tab[k++].alpha = (float)(min(min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
+            tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
          }
      }
      return k;
  }
  
-#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
+#if defined (HAVE_IPP) && ((IPP_VERSION_MAJOR == 7 && IPP_VERSION_MINOR >= 1) || IPP_VERSION_MAJOR > 7)
  class IPPresizeInvoker :
      public ParallelLoopBody
  {
  public:
-    IPPresizeInvoker(Mat &_src, Mat &_dst, double &_inv_scale_x, double &_inv_scale_y, int _mode, ippiResizeSqrPixelFunc _func, bool *_ok) :
-      ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), inv_scale_y(_inv_scale_y), mode(_mode), func(_func), ok(_ok)
+    IPPresizeInvoker(Mat &_src, Mat &_dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) : // selects the IPP resize routine and builds its spec; *_ok reports whether setup succeeded
+      ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), inv_scale_y(_inv_scale_y), pSpec(0), mode(_mode), func(0), ok(_ok)
        {
+          IppStatus status = ippStsNotSupportedModeErr;
            *ok = true;
+          IppiSize srcSize, dstSize;
+          int type = src.type();
+          int specSize = 0, initSize = 0;
+          srcSize.width  = src.cols;
+          srcSize.height = src.rows;
+          dstSize.width  = dst.cols;
+          dstSize.height = dst.rows;
+
+          if (mode == (int)ippLinear)
+          {
+              func =
+                  type == CV_8UC1  ? (ippiResizeFunc)ippiResizeLinear_8u_C1R :
+                  type == CV_8UC3  ? (ippiResizeFunc)ippiResizeLinear_8u_C3R :
+                  type == CV_8UC4  ? (ippiResizeFunc)ippiResizeLinear_8u_C4R :
+                  type == CV_16UC1 ? (ippiResizeFunc)ippiResizeLinear_16u_C1R :
+                  type == CV_16UC3 ? (ippiResizeFunc)ippiResizeLinear_16u_C3R :
+                  type == CV_16UC4 ? (ippiResizeFunc)ippiResizeLinear_16u_C4R :
+                  type == CV_16SC1 ? (ippiResizeFunc)ippiResizeLinear_16s_C1R :
+                  type == CV_16SC3 ? (ippiResizeFunc)ippiResizeLinear_16s_C3R :
+                  type == CV_16SC4 ? (ippiResizeFunc)ippiResizeLinear_16s_C4R :
+                  type == CV_32FC1 ? (ippiResizeFunc)ippiResizeLinear_32f_C1R :
+                  type == CV_32FC3 ? (ippiResizeFunc)ippiResizeLinear_32f_C3R :
+                  type == CV_32FC4 ? (ippiResizeFunc)ippiResizeLinear_32f_C4R :
+                  type == CV_64FC1 ? (ippiResizeFunc)ippiResizeLinear_64f_C1R :
+                  type == CV_64FC3 ? (ippiResizeFunc)ippiResizeLinear_64f_C3R :
+                  type == CV_64FC4 ? (ippiResizeFunc)ippiResizeLinear_64f_C4R :
+                  0;
+          }
+          else if (mode == (int)ippCubic)
+          {
+              func =
+                  type == CV_8UC1  ? (ippiResizeFunc)ippiResizeCubic_8u_C1R :
+                  type == CV_8UC3  ? (ippiResizeFunc)ippiResizeCubic_8u_C3R :
+                  type == CV_8UC4  ? (ippiResizeFunc)ippiResizeCubic_8u_C4R :
+                  type == CV_16UC1 ? (ippiResizeFunc)ippiResizeCubic_16u_C1R :
+                  type == CV_16UC3 ? (ippiResizeFunc)ippiResizeCubic_16u_C3R :
+                  type == CV_16UC4 ? (ippiResizeFunc)ippiResizeCubic_16u_C4R :
+                  type == CV_16SC1 ? (ippiResizeFunc)ippiResizeCubic_16s_C1R :
+                  type == CV_16SC3 ? (ippiResizeFunc)ippiResizeCubic_16s_C3R :
+                  type == CV_16SC4 ? (ippiResizeFunc)ippiResizeCubic_16s_C4R :
+                  type == CV_32FC1 ? (ippiResizeFunc)ippiResizeCubic_32f_C1R :
+                  type == CV_32FC3 ? (ippiResizeFunc)ippiResizeCubic_32f_C3R :
+                  type == CV_32FC4 ? (ippiResizeFunc)ippiResizeCubic_32f_C4R :
+                  0;
+          }
+          // func stays 0 for any other mode or for a type missing from the tables above; that is rejected below.
+          switch (src.depth())
+          {
+          case CV_8U:
+              status = ippiResizeGetSize_8u(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize);
+              break;
+          case CV_16U:
+              status = ippiResizeGetSize_16u(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize);
+              break;
+          case CV_16S:
+              status = ippiResizeGetSize_16s(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize);
+              break;
+          case CV_32F:
+              status = ippiResizeGetSize_32f(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize);
+              break;
+          case CV_64F:
+              status = ippiResizeGetSize_64f(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize);
+              break;
+          }
+          if (status != ippStsNoErr || func == 0) // also bail out when no routine was selected, so operator() never calls through a null pointer
+          {
+              *ok = false;
+              return;
+          }
+
+          specBuf.allocate(specSize);
+          pSpec = (uchar*)specBuf; // spec storage is owned by the specBuf member
+
+          status = ippStsNotSupportedModeErr;
+          if (mode == (int)ippLinear)
+          {
+              switch (src.depth())
+              {
+              case CV_8U:
+                  status = ippiResizeLinearInit_8u(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec);
+                  break;
+              case CV_16U:
+                  status = ippiResizeLinearInit_16u(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec);
+                  break;
+              case CV_16S:
+                  status = ippiResizeLinearInit_16s(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec);
+                  break;
+              case CV_32F:
+                  status = ippiResizeLinearInit_32f(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec);
+                  break;
+              case CV_64F:
+                  status = ippiResizeLinearInit_64f(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec);
+                  break;
+              }
+              if (status != ippStsNoErr)
+              {
+                  *ok = false;
+                  return;
+              }
+          }
+          else if (mode == (int)ippCubic)
+          {
+              AutoBuffer<uchar> buf(initSize); // scratch buffer used only during cubic initialization
+              uchar* pInit = (uchar*)buf;
+
+              switch (src.depth())
+              {
+              case CV_8U:
+                  status = ippiResizeCubicInit_8u(srcSize, dstSize,  0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit);
+                  break;
+              case CV_16U:
+                  status = ippiResizeCubicInit_16u(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit);
+                  break;
+              case CV_16S:
+                  status = ippiResizeCubicInit_16s(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit);
+                  break;
+              case CV_32F:
+                  status = ippiResizeCubicInit_32f(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit);
+                  break;
+              }
+              if (status != ippStsNoErr) *ok = false; // CV_64F has no cubic case above, so it ends up here with the "not supported" status
+          }
+      }
+
+      ~IPPresizeInvoker() // empty body: no explicit cleanup is performed here
+      {
        }
  
        virtual void operator() (const Range& range) const
        {
+          if (*ok == false) return;
+
            int cn = src.channels();
-          IppiRect srcroi = { 0, range.start, src.cols, range.end - range.start };
            int dsty = CV_IMIN(cvRound(range.start * inv_scale_y), dst.rows);
-          int dstwidth = CV_IMIN(cvRound(src.cols * inv_scale_x), dst.cols);
+          int dstwidth  = CV_IMIN(cvRound(src.cols * inv_scale_x), dst.cols);
            int dstheight = CV_IMIN(cvRound(range.end * inv_scale_y), dst.rows);
-          IppiRect dstroi = { 0, dsty, dstwidth, dstheight - dsty };
-          int bufsize;
-          ippiResizeGetBufSize( srcroi, dstroi, cn, mode, &bufsize );
+
+          IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
+          IppiSize  dstSize   = { dstwidth, dstheight - dsty };
+          int bufsize = 0, itemSize = 0;
+
+          IppStatus status = ippStsNotSupportedModeErr;
+
+          switch (src.depth())
+          {
+          case CV_8U:
+              itemSize = 1;
+              status = ippiResizeGetBufferSize_8u((IppiResizeSpec_32f*)pSpec, dstSize, cn, &bufsize);
+              if (status == ippStsNoErr)
+                  status = ippiResizeGetSrcOffset_8u((IppiResizeSpec_32f*)pSpec, dstOffset, &srcOffset);
+              break;
+          case CV_16U:
+              itemSize = 2;
+              status = ippiResizeGetBufferSize_16u((IppiResizeSpec_32f*)pSpec, dstSize, cn, &bufsize);
+              if (status == ippStsNoErr)
+                  status = ippiResizeGetSrcOffset_16u((IppiResizeSpec_32f*)pSpec, dstOffset, &srcOffset);
+              break;
+          case CV_16S:
+              itemSize = 2;
+              status = ippiResizeGetBufferSize_16s((IppiResizeSpec_32f*)pSpec, dstSize, cn, &bufsize);
+              if (status == ippStsNoErr)
+                  status = ippiResizeGetSrcOffset_16s((IppiResizeSpec_32f*)pSpec, dstOffset, &srcOffset);
+              break;
+          case CV_32F:
+              itemSize = 4;
+              status = ippiResizeGetBufferSize_32f((IppiResizeSpec_32f*)pSpec, dstSize, cn, &bufsize);
+              if (status == ippStsNoErr)
+                  status = ippiResizeGetSrcOffset_32f((IppiResizeSpec_32f*)pSpec, dstOffset, &srcOffset);
+              break;
+          case CV_64F:
+              itemSize = 4;
+              status = ippiResizeGetBufferSize_64f((IppiResizeSpec_64f*)pSpec, dstSize, cn, &bufsize);
+              if (status == ippStsNoErr)
+                  status = ippiResizeGetSrcOffset_64f((IppiResizeSpec_64f*)pSpec, dstOffset, &srcOffset);
+              break;
+          }
+          if (status != ippStsNoErr)
+          {
+              *ok = false;
+              return;
+          }
+
+          Ipp8u* pSrc = (Ipp8u*)src.data + (int)src.step[0] * srcOffset.y + srcOffset.x * cn * itemSize;
+          Ipp8u* pDst = (Ipp8u*)dst.data + (int)dst.step[0] * dstOffset.y + dstOffset.x * cn * itemSize;
+
            AutoBuffer<uchar> buf(bufsize + 64);
            uchar* bufptr = alignPtr((uchar*)buf, 32);
-          if( func( src.data, ippiSize(src.cols, src.rows), (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, inv_scale_x, inv_scale_y, 0, 0, mode, bufptr ) < 0 )
+          if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) <= 0 )
                *ok = false;
        }
  private:
@@ -1699,15 +2069,192 @@ private:
      Mat &dst;
      double inv_scale_x;
      double inv_scale_y;
+    void *pSpec;
+    AutoBuffer<uchar>   specBuf;
      int mode;
-    ippiResizeSqrPixelFunc func;
+    ippiResizeFunc func;
      bool *ok;
      const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
  };
  #endif
  
+#ifdef HAVE_OPENCL
+
+static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab, // fills, for one axis, the source-index, weight and offset tables
+                                          float * const alpha_tab, int * const ofs_tab)
+{
+    int k = 0, dx = 0; // k: next free slot shared by map_tab and alpha_tab
+    for ( ; dx < dsize; dx++)
+    {
+        ofs_tab[dx] = k; // where the entries for destination index dx begin
+
+        double fsx1 = dx * scale; // source interval [fsx1, fsx2) assigned to destination index dx
+        double fsx2 = fsx1 + scale;
+        double cellWidth = std::min(scale, ssize - fsx1); // interval width, clipped at the end of the source
+
+        int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
+
+        sx2 = std::min(sx2, ssize - 1);
+        sx1 = std::min(sx1, sx2);
+
+        if (sx1 - fsx1 > 1e-3) // partial source cell on the leading edge
+        {
+            map_tab[k] = sx1 - 1;
+            alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
+        }
+
+        for (int sx = sx1; sx < sx2; sx++) // source cells lying fully inside the interval
+        {
+            map_tab[k] = sx;
+            alpha_tab[k++] = float(1.0 / cellWidth);
+        }
+
+        if (fsx2 - sx2 > 1e-3) // partial source cell on the trailing edge
+        {
+            map_tab[k] = sx2;
+            alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
+        }
+    }
+    ofs_tab[dx] = k; // final entry (index dsize) holds the total entry count, so ofs_tab needs dsize + 1 slots
+}
+
+static void ocl_computeResizeAreaFastTabs(int * dmap_tab, int * smap_tab, int scale, int dcols, int scol) // index tables for one axis of the integer-scale area path
+{
+    for (int i = 0; i < dcols; ++i)
+        dmap_tab[i] = scale * i; // dmap_tab: dcols entries, destination index i -> scale * i
+
+    for (int i = 0, size = dcols * scale; i < size; ++i)
+        smap_tab[i] = std::min(scol - 1, i); // smap_tab: dcols * scale entries, index clamped to the last valid source position
  }
  
+static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, // OpenCL resize path; returns false when the request is not handled or the kernel cannot be built or run
+                        double fx, double fy, int interpolation)
+{
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    double inv_fx = 1. / fx, inv_fy = 1. / fy;
+    float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
+
+    if( !(cn <= 4 && // handled here: up to 4 channels with NEAREST, LINEAR, or AREA when both inverse factors are >= 1
+           (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
+            (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
+        return false;
+
+    UMat src = _src.getUMat();
+    _dst.create(dsize, type);
+    UMat dst = _dst.getUMat();
+
+    ocl::Kernel k;
+    size_t globalsize[] = { dst.cols, dst.rows }; // one work-item per destination pixel
+
+    if (interpolation == INTER_LINEAR)
+    {
+        int wdepth = std::max(depth, CV_32S);
+        int wtype = CV_MAKETYPE(wdepth, cn);
+        char buf[2][32];
+        k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+                 format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D PIXTYPE1=%s "
+                        "-D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d",
+                        depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+                        ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
+                        ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+                        cn));
+    }
+    else if (interpolation == INTER_NEAREST)
+    {
+        k.create("resizeNN", ocl::imgproc::resize_oclsrc,
+                 format("-D INTER_NEAREST -D PIXTYPE=%s -D PIXTYPE1=%s -D cn=%d",
+                        ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), cn));
+    }
+    else if (interpolation == INTER_AREA)
+    {
+        int iscale_x = saturate_cast<int>(inv_fx);
+        int iscale_y = saturate_cast<int>(inv_fy);
+        bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON && // fast path only when both inverse factors are integers
+                        std::abs(inv_fy - iscale_y) < DBL_EPSILON;
+        int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
+        int wtype = CV_MAKE_TYPE(wdepth, cn);
+
+        char cvt[2][40];
+        String buildOption = format("-D INTER_AREA -D PIXTYPE=%s -D PIXTYPE1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
+                                    ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+                                    ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
+
+        UMat alphaOcl, tabofsOcl, mapOcl;
+        UMat dmap, smap;
+
+        if (is_area_fast)
+        {
+            int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
+            buildOption = buildOption + format(" -D convertToPIXTYPE=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
+                                               " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
+                                               ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
+                                               ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
+                                  iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
+
+            k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
+            if (k.empty())
+                return false;
+
+            int smap_tab_size = dst.cols * iscale_x + dst.rows * iscale_y; // x table (dst.cols * iscale_x) followed by y table (dst.rows * iscale_y)
+            AutoBuffer<int> dmap_tab(dst.cols + dst.rows), smap_tab(smap_tab_size);
+            int * dxmap_tab = dmap_tab, * dymap_tab = dxmap_tab + dst.cols;
+            int * sxmap_tab = smap_tab, * symap_tab = smap_tab + dst.cols * iscale_x; // y table starts right after the x table; an iscale_y offset overruns or overlaps when the scales differ. NOTE(review): confirm the kernel indexes smap the same way
+
+            ocl_computeResizeAreaFastTabs(dxmap_tab, sxmap_tab, iscale_x, dst.cols, src.cols);
+            ocl_computeResizeAreaFastTabs(dymap_tab, symap_tab, iscale_y, dst.rows, src.rows);
+
+            Mat(1, dst.cols + dst.rows, CV_32SC1, (void *)dmap_tab).copyTo(dmap);
+            Mat(1, smap_tab_size, CV_32SC1, (void *)smap_tab).copyTo(smap);
+        }
+        else
+        {
+            buildOption = buildOption + format(" -D convertToPIXTYPE=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
+            k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
+            if (k.empty())
+                return false;
+
+            Size ssize = src.size();
+            int xytab_size = (ssize.width + ssize.height) << 1; // x part holds 2 * width entries, y part 2 * height
+            int tabofs_size = dsize.height + dsize.width + 2; // each axis needs dsize + 1 offsets
+
+            AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
+            AutoBuffer<float> _xyalpha_tab(xytab_size);
+            int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
+            float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
+            int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
+
+            ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
+            ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
+
+            // loading precomputed arrays to GPU
+            Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
+            Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
+            Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
+        }
+
+        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
+
+        if (is_area_fast)
+            k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(dmap), ocl::KernelArg::PtrReadOnly(smap));
+        else
+            k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
+                   ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
+
+        return k.run(2, globalsize, NULL, false);
+    }
+    // Only the LINEAR and NEAREST kernels reach this point; the AREA branch returned above.
+    if( k.empty() )
+        return false;
+    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+           (float)inv_fx, (float)inv_fy);
+
+    return k.run(2, globalsize, 0, false);
+}
+
+#endif
+
+}
  
  //////////////////////////////////////////////////////////////////////////////////////////
  
@@ -1801,10 +2348,10 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
  
      static ResizeAreaFastFunc areafast_tab[] =
      {
-        resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar> >,
+        resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
          0,
-        resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort> >,
-        resizeAreaFast_<short, float, ResizeAreaFastVec<short> >,
+        resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
+        resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastNoVec<short, float> > >,
          0,
          resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
          resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
@@ -1818,26 +2365,29 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
          resizeArea_<double, double>, 0
      };
  
-    Mat src = _src.getMat();
-    Size ssize = src.size();
+    Size ssize = _src.size();
  
      CV_Assert( ssize.area() > 0 );
-    CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
-    if( !dsize.area() )
+    CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
+    if( dsize.area() == 0 )
      {
-        dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
-            saturate_cast<int>(src.rows*inv_scale_y));
-        CV_Assert( dsize.area() );
+        dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
+                     saturate_cast<int>(ssize.height*inv_scale_y));
+        CV_Assert( dsize.area() > 0 );
      }
      else
      {
-        inv_scale_x = (double)dsize.width/src.cols;
-        inv_scale_y = (double)dsize.height/src.rows;
+        inv_scale_x = (double)dsize.width/ssize.width;
+        inv_scale_y = (double)dsize.height/ssize.height;
      }
+
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
+
+    Mat src = _src.getMat();
      _dst.create(dsize, src.type());
      Mat dst = _dst.getMat();
  
-
  #ifdef HAVE_TEGRA_OPTIMIZATION
      if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
          return;
@@ -1846,35 +2396,40 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
      int depth = src.depth(), cn = src.channels();
      double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
      int k, sx, sy, dx, dy;
-/*
-#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
-    int mode = interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : 0;
-    int type = src.type();
-    ippiResizeSqrPixelFunc ippFunc =
-        type == CV_8UC1 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_8u_C1R :
-        type == CV_8UC3 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_8u_C3R :
-        type == CV_8UC4 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_8u_C4R :
-        type == CV_16UC1 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16u_C1R :
-        type == CV_16UC3 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16u_C3R :
-        type == CV_16UC4 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16u_C4R :
-        type == CV_16SC1 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16s_C1R :
-        type == CV_16SC3 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16s_C3R :
-        type == CV_16SC4 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16s_C4R :
-        type == CV_32FC1 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_32f_C1R :
-        type == CV_32FC3 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_32f_C3R :
-        type == CV_32FC4 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_32f_C4R :
-        0;
-    if( ippFunc && mode != 0 )
+
+#if defined (HAVE_IPP) && ((IPP_VERSION_MAJOR == 7 && IPP_VERSION_MINOR >= 1) || IPP_VERSION_MAJOR > 7)
+#define IPP_RESIZE_EPS    1.e-10
+
+    double ex = fabs((double)dsize.width/src.cols  - inv_scale_x)/inv_scale_x;
+    double ey = fabs((double)dsize.height/src.rows - inv_scale_y)/inv_scale_y;
+
+    if ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) ||
+        (ex == 0 && ey == 0 && depth == CV_64F))
      {
-        bool ok;
-        Range range(0, src.rows);
-        IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, ippFunc, &ok);
-        parallel_for_(range, invoker, dst.total()/(double)(1<<16));
-        if( ok )
-            return;
+        int mode = 0;
+        if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
+        {
+            mode = ippLinear;
+        }
+        else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)
+        {
+            mode = ippCubic;
+        }
+        if( mode != 0 && (cn == 1 || cn ==3 || cn == 4) &&
+            (depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F ||
+            (depth == CV_64F && mode == ippLinear)))
+        {
+            bool ok = true;
+            Range range(0, src.rows);
+            IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+            if( ok )
+                return;
+        }
      }
+#undef IPP_RESIZE_EPS
  #endif
-*/
+
      if( interpolation == INTER_NEAREST )
      {
          resizeNN( src, dst, inv_scale_x, inv_scale_y );
@@ -1891,9 +2446,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
          // when scale_x and scale_y are both equal to 2,
          // INTER_AREA (fast) is equivalent to INTER_LINEAR
          if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
-        {
              interpolation = INTER_AREA;
-        }
  
          // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
          // In other cases it is emulated using some variant of bilinear interpolation
@@ -1996,14 +2549,14 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
          if( sx < ksize2-1 )
          {
              xmin = dx+1;
-            if( sx < 0 )
+            if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
                  fx = 0, sx = 0;
          }
  
          if( sx + ksize2 >= ssize.width )
          {
              xmax = std::min( xmax, dx );
-            if( sx >= ssize.width-1 )
+            if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
                  fx = 0, sx = ssize.width-1;
          }
  
@@ -2201,15 +2754,15 @@ struct RemapVec_8u
      int operator()( const Mat& _src, void* _dst, const short* XY,
                      const ushort* FXY, const void* _wtab, int width ) const
      {
-        int cn = _src.channels();
+        int cn = _src.channels(), x = 0, sstep = (int)_src.step;
  
-        if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) )
+        if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2)||
+            sstep > 0x8000 )
              return 0;
  
          const uchar *S0 = _src.data, *S1 = _src.data + _src.step;
          const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
          uchar* D = (uchar*)_dst;
-        int x = 0, sstep = (int)_src.step;
          __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
          __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
          __m128i z = _mm_setzero_si128();
@@ -2829,10 +3382,10 @@ class RemapInvoker :
  {
  public:
      RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
-                 const Mat *_m2, int _interpolation, int _borderType, const Scalar &_borderValue,
+                 const Mat *_m2, int _borderType, const Scalar &_borderValue,
                   int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
          ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
-        interpolation(_interpolation), borderType(_borderType), borderValue(_borderValue),
+        borderType(_borderType), borderValue(_borderValue),
          planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
      {
      }
@@ -3018,7 +3571,7 @@ private:
      const Mat* src;
      Mat* dst;
      const Mat *m1, *m2;
-    int interpolation, borderType;
+    int borderType;
      Scalar borderValue;
      int planar_input;
      RemapNNFunc nnfunc;
@@ -3026,6 +3579,82 @@ private:
      const void *ctab;
  };
  
+#ifdef HAVE_OPENCL
+
+/* OpenCL path for remap().
+ *
+ * _src / _dst        source image and output array (created here with the size of the map
+ *                    and the type of the source).
+ * _map1 / _map2      coordinate maps; accepted combinations are CV_16SC2 (+ optional CV_16UC1),
+ *                    a single CV_32FC2, or two CV_32FC1 planes.
+ * interpolation      only INTER_NEAREST and INTER_LINEAR are handled here.
+ * borderType         any border mode of the table below except BORDER_TRANSPARENT.
+ * borderValue        passed to the kernel as a one-element constant of the source type.
+ *
+ * Returns false when the configuration is not handled here or the kernel could not be
+ * built or run; otherwise returns the result of Kernel::run.
+ * NOTE(review): the caller invokes this through CV_OCL_RUN and then has its Mat-based code,
+ * so a false result is expected to fall back to that code - confirm against the macro.
+ */
+static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
+                      int interpolation, int borderType, const Scalar& borderValue)
+{
+    // Tables translating the interpolation / border enum values into kernel build defines.
+    static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
+    static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
+                           "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
+    const int borderCount = (int)(sizeof(borderMap)/sizeof(borderMap[0]));
+
+    int cn = _src.channels(), type = _src.type(), depth = _src.depth();
+
+    // borderType indexes borderMap below, so a value outside the table (for instance one
+    // carrying extra flag bits) must be rejected here instead of reading past the array.
+    if (borderType < 0 || borderType >= borderCount)
+        return false;
+
+    if (borderType == BORDER_TRANSPARENT || cn == 3 || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
+            || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
+        return false;
+
+    UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
+
+    if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
+        (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
+    {
+        // Normalize the fixed-point case so that map1 is always the CV_16SC2 map.
+        if (map1.type() != CV_16SC2)
+            std::swap(map1, map2);
+    }
+    else
+        CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
+
+    _dst.create(map1.size(), type);
+    UMat dst = _dst.getUMat();
+
+    // The kernel name encodes the map layout.
+    String kernelName = "remap";
+    if (map1.type() == CV_32FC2 && map2.empty())
+        kernelName += "_32FC2";
+    else if (map1.type() == CV_16SC2)
+    {
+        kernelName += "_16SC2";
+        if (!map2.empty())
+            kernelName += "_16UC1";
+    }
+    else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
+        kernelName += "_2_32FC1";
+    else
+        CV_Error(Error::StsBadArg, "Unsupported map types");
+
+    String buildOptions = format("-D %s -D %s -D T=%s", interMap[interpolation], borderMap[borderType], ocl::typeToStr(type));
+
+    if (interpolation != INTER_NEAREST)
+    {
+        // Linear interpolation works in a wider type (at least CV_32F) and needs the
+        // conversion helpers to and from it.
+        char cvt[3][40];
+        int wdepth = std::max(CV_32F, dst.depth());
+        buildOptions = buildOptions
+                      + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
+                               " -D convertToWT2=%s -D WT2=%s",
+                               ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
+                               ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
+                               ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
+                               ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
+                               ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
+    }
+
+    ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
+    // Same guard as ocl_warpTransform: do not set arguments on a kernel that failed to build.
+    if (k.empty())
+        return false;
+
+    Mat scalar(1, 1, type, borderValue);
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
+            map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
+            scalararg = ocl::KernelArg::Constant((void*)scalar.data, scalar.elemSize());
+
+    if (map2.empty())
+        k.args(srcarg, dstarg, map1arg, scalararg);
+    else
+        k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
+
+    // Explicit casts: int -> size_t inside a braced initializer is a narrowing conversion.
+    size_t globalThreads[2] = { (size_t)dst.cols, (size_t)dst.rows };
+    return k.run(2, globalThreads, NULL, false);
+}
+
+#endif
+
  }
  
  void cv::remap( InputArray _src, OutputArray _dst,
@@ -3065,11 +3694,13 @@ void cv::remap( InputArray _src, OutputArray _dst,
          remapLanczos4<Cast<double, double>, float, 1>, 0
      };
  
-    Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
+    CV_Assert( _map1.size().area() > 0 );
+    CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
  
-    CV_Assert( map1.size().area() > 0 );
-    CV_Assert( !map2.data || (map2.size() == map1.size()));
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
  
+    Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
      _dst.create( map1.size(), src.type() );
      Mat dst = _dst.getMat();
      if( dst.data == src.data )
@@ -3119,7 +3750,7 @@ void cv::remap( InputArray _src, OutputArray _dst,
          planar_input = map1.channels() == 1;
      }
  
-    RemapInvoker invoker(src, dst, m1, m2, interpolation,
+    RemapInvoker invoker(src, dst, m1, m2,
                           borderType, borderValue, planar_input, nnfunc, ifunc,
                           ctab);
      parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
@@ -3432,6 +4063,105 @@ private:
  };
  #endif
  
+#ifdef HAVE_OPENCL
+
+// Values for the op_type argument of ocl_warpTransform: affine or perspective warp.
+enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
+
+/* Shared OpenCL path for warpAffine() and warpPerspective().
+ *
+ * _src / _dst   source image and output array; the output is created with dsize, or with the
+ *               source size when dsize has zero area.
+ * _M0           transformation matrix, CV_32F or CV_64F, 2x3 for OCL_OP_AFFINE and 3x3 for
+ *               OCL_OP_PERSPECTIVE (asserted).
+ * flags         interpolation in the INTER_MAX bits (INTER_AREA is treated as INTER_LINEAR);
+ *               unless WARP_INVERSE_MAP is set the matrix is inverted before it is passed
+ *               to the kernel.
+ * borderType    only BORDER_CONSTANT is handled; borderValue supplies the constant.
+ * op_type       OCL_OP_AFFINE or OCL_OP_PERSPECTIVE (asserted).
+ *
+ * Returns false for unsupported configurations (other border modes, interpolation other than
+ * nearest / linear / cubic, CV_64F data on a device without double support, more than 4
+ * channels) or when the kernel cannot be built; otherwise returns the result of Kernel::run.
+ */
+static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
+                              Size dsize, int flags, int borderType, const Scalar& borderValue,
+                              int op_type)
+{
+    CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
+
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    // This is a yes/no capability flag, so keep it as bool rather than double.
+    const bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    int interpolation = flags & INTER_MAX;
+    if( interpolation == INTER_AREA )
+        interpolation = INTER_LINEAR;
+
+    if ( !(borderType == cv::BORDER_CONSTANT &&
+           (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
+         (!doubleSupport && depth == CV_64F) || cn > 4)
+        return false;
+
+    // Indexed by interpolation, which the check above restricts to nearest / linear / cubic.
+    const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
+    ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
+                ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
+    const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
+
+    // The border scalar is passed with 4 channels for 3-channel images.
+    int scalarcn = cn == 3 ? 4 : cn;
+    int wdepth = interpolation == INTER_NEAREST ? depth : std::max(CV_32S, depth);
+    int sctype = CV_MAKETYPE(wdepth, scalarcn);
+
+    ocl::Kernel k;
+    String opts;
+    if (interpolation == INTER_NEAREST)
+    {
+        opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d", ocl::typeToStr(type),
+                      doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                      ocl::typeToStr(depth),
+                      ocl::typeToStr(sctype),
+                      cn);
+    }
+    else
+    {
+        // Interpolating kernels additionally need the working type and the conversions
+        // between it and the pixel type.
+        char cvt[2][50];
+        opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s -D cn=%d",
+                      interpolationMap[interpolation], ocl::typeToStr(type),
+                      ocl::typeToStr(depth),
+                      ocl::typeToStr(sctype),
+                      ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
+                      ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
+                      ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
+                      doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn);
+    }
+
+    k.create(kernelName, program, opts);
+    if (k.empty())
+        return false;
+
+    double borderBuf[] = {0, 0, 0, 0};
+    scalarToRawData(borderValue, borderBuf, sctype);
+
+    UMat src = _src.getUMat(), M0;
+    _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
+    UMat dst = _dst.getUMat();
+
+    // matM is a header over the local array M, so the element-wise edits of M below
+    // are visible through matM as well.
+    double M[9];
+    int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
+    Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
+    CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
+               M1.rows == matRows && M1.cols == 3 );
+    M1.convertTo(matM, matM.type());
+
+    if( !(flags & WARP_INVERSE_MAP) )
+    {
+        if (op_type == OCL_OP_PERSPECTIVE)
+            invert(matM, matM);
+        else
+        {
+            // Closed-form inverse of the 2x3 affine transform; a zero determinant
+            // yields an all-zero linear part instead of a division by zero.
+            double D = M[0]*M[4] - M[1]*M[3];
+            D = D != 0 ? 1./D : 0;
+            double A11 = M[4]*D, A22=M[0]*D;
+            M[0] = A11; M[1] *= -D;
+            M[3] *= -D; M[4] = A22;
+            double b1 = -M[0]*M[2] - M[1]*M[5];
+            double b2 = -M[3]*M[2] - M[4]*M[5];
+            M[2] = b1; M[5] = b2;
+        }
+    }
+    // The matrix is handed to the kernel as double only when the device supports it.
+    matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
+
+    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
+           ocl::KernelArg(0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
+
+    // Explicit casts: int -> size_t inside a braced initializer is a narrowing conversion.
+    size_t globalThreads[2] = { (size_t)dst.cols, (size_t)dst.rows };
+    return k.run(2, globalThreads, NULL, false);
+}
+
+#endif
+
  }
  
  
@@ -3439,6 +4169,10 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
                       InputArray _M0, Size dsize,
                       int flags, int borderType, const Scalar& borderValue )
  {
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
+                                 borderValue, OCL_OP_AFFINE))
+
      Mat src = _src.getMat(), M0 = _M0.getMat();
      _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
      Mat dst = _dst.getMat();
@@ -3678,11 +4412,16 @@ private:
  void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
                            Size dsize, int flags, int borderType, const Scalar& borderValue )
  {
+    CV_Assert( _src.total() > 0 );
+
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
+                              OCL_OP_PERSPECTIVE))
+
      Mat src = _src.getMat(), M0 = _M0.getMat();
      _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
      Mat dst = _dst.getMat();
  
-    CV_Assert( src.cols > 0 && src.rows > 0 );
      if( dst.data == src.data )
          src = src.clone();
  
@@ -4049,8 +4788,8 @@ cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
      ssize = cvGetMatSize(src);
      dsize = cvGetMatSize(dst);
  
-    mapx = cvCreateMat( dsize.height, dsize.width, CV_32F );
-    mapy = cvCreateMat( dsize.height, dsize.width, CV_32F );
+    mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
+    mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
  
      if( !(flags & CV_WARP_INVERSE_MAP) )
      {
@@ -4125,7 +4864,7 @@ cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
                  double xx = bufx.data.fl[x];
                  double yy = bufy.data.fl[x];
  
-                double p = log(sqrt(xx*xx + yy*yy) + 1.)*M;
+                double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
                  double a = atan2(yy,xx);
                  if( a < 0 )
                      a = 2*CV_PI + a;
@@ -4141,6 +4880,14 @@ cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
      cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
  }
  
+// C++ entry point for the log-polar transform: creates _dst with the size and type of
+// _src and forwards center, M and flags unchanged to cvLogPolar.
+void cv::logPolar( InputArray _src, OutputArray _dst,
+                   Point2f center, double M, int flags )
+{
+    Mat source = _src.getMat();
+    _dst.create( source.size(), source.type() );
+    Mat target = _dst.getMat();
+
+    // cvLogPolar takes CvMat pointers, so convert both matrices first.
+    CvMat sourceHeader = source;
+    CvMat targetHeader = target;
+    cvLogPolar( &sourceHeader, &targetHeader, center, M, flags );
+}
  
  /****************************************************************************************
                                     Linear-Polar Transform
@@ -4167,8 +4914,8 @@ void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
      dsize.width = dst->cols;
      dsize.height = dst->rows;
  
-    mapx = cvCreateMat( dsize.height, dsize.width, CV_32F );
-    mapy = cvCreateMat( dsize.height, dsize.width, CV_32F );
+    mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
+    mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
  
      if( !(flags & CV_WARP_INVERSE_MAP) )
      {
@@ -4236,5 +4983,13 @@ void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
      cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
  }
  
+// C++ entry point for the linear-polar transform: creates _dst with the size and type of
+// _src and forwards center, maxRadius and flags unchanged to cvLinearPolar.
+void cv::linearPolar( InputArray _src, OutputArray _dst,
+                      Point2f center, double maxRadius, int flags )
+{
+    Mat source = _src.getMat();
+    _dst.create( source.size(), source.type() );
+    Mat target = _dst.getMat();
+
+    // cvLinearPolar takes CvMat pointers, so convert both matrices first.
+    CvMat sourceHeader = source;
+    CvMat targetHeader = target;
+    cvLinearPolar( &sourceHeader, &targetHeader, center, maxRadius, flags );
+}
  
  /* End of file. */