fixed many bugs related to Mat::getUMat(), asynchronous kernel execution etc. Also...

author Vadim Pisarevsky <vadim.pisarevsky@gmail.com>

Sat, 30 Nov 2013 23:12:19 +0000 (03:12 +0400)

committer Vadim Pisarevsky <vadim.pisarevsky@gmail.com>

Sat, 30 Nov 2013 23:12:19 +0000 (03:12 +0400)
author Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Sat, 30 Nov 2013 23:12:19 +0000 (03:12 +0400)
committer Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Sat, 30 Nov 2013 23:12:19 +0000 (03:12 +0400)
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp

index 2f38f8b..2cf1e35 100644 (file)
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -279,21 +279,22 @@ public:
      //virtual void allocate(int dims, const int* sizes, int type, int*& refcount,
      //                      uchar*& datastart, uchar*& data, size_t* step) = 0;
      //virtual void deallocate(int* refcount, uchar* datastart, uchar* data) = 0;
-    virtual UMatData* allocate(int dims, const int* sizes,
-                               int type, size_t* step) const = 0;
+    virtual UMatData* allocate(int dims, const int* sizes, int type,
+                               void* data, size_t* step, int flags) const = 0;
      virtual bool allocate(UMatData* data, int accessflags) const = 0;
      virtual void deallocate(UMatData* data) const = 0;
-    virtual void map(UMatData* data, int accessflags) const = 0;
-    virtual void unmap(UMatData* data) const = 0;
+    virtual void sync(UMatData* u) const;
+    virtual void map(UMatData* data, int accessflags) const;
+    virtual void unmap(UMatData* data) const;
      virtual void download(UMatData* data, void* dst, int dims, const size_t sz[],
                            const size_t srcofs[], const size_t srcstep[],
-                          const size_t dststep[]) const = 0;
+                          const size_t dststep[]) const;
      virtual void upload(UMatData* data, const void* src, int dims, const size_t sz[],
                          const size_t dstofs[], const size_t dststep[],
-                        const size_t srcstep[]) const = 0;
+                        const size_t srcstep[]) const;
      virtual void copy(UMatData* srcdata, UMatData* dstdata, int dims, const size_t sz[],
                        const size_t srcofs[], const size_t srcstep[],
-                      const size_t dstofs[], const size_t dststep[], bool sync) const = 0;
+                      const size_t dstofs[], const size_t dststep[], bool sync) const;
  };
  
  
@@ -335,8 +336,10 @@ protected:
  struct CV_EXPORTS UMatData
  {
      enum { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2,
-        DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24 };
+        DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24,
+        USER_ALLOCATED=32 };
      UMatData(const MatAllocator* allocator);
+    ~UMatData();
  
      // provide atomic access to the structure
      void lock();
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp

index 84f1cc4..8a0a6a6 100644 (file)
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -3132,14 +3132,6 @@ cols(1), allocator(0), u(0), offset(0), size(&rows)
  
  
  inline
-UMat::~UMat()
-{
-    release();
-    if( step.p != step.buf )
-        fastFree(step.p);
-}
-
-inline
  UMat& UMat::operator = (const UMat& m)
  {
      if( this != &m )
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp

index cb5d7e4..995c10b 100644 (file)
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -48,146 +48,165 @@
  
  namespace cv {
  
-class StdMatAllocator : public MatAllocator
+void MatAllocator::sync(UMatData*) const
  {
-public:
-    UMatData* allocate(int dims, const int* sizes, int type, size_t* step) const
-    {
-        size_t total = CV_ELEM_SIZE(type);
-        for( int i = dims-1; i >= 0; i-- )
-        {
-            if( step )
-                step[i] = total;
-            total *= sizes[i];
-        }
-        uchar* data = (uchar*)fastMalloc(total);
-        UMatData* u = new UMatData(this);
-        u->data = u->origdata = data;
-        u->size = total;
-        u->refcount = 1;
+}
  
-        return u;
-    }
+void MatAllocator::map(UMatData*, int) const
+{
+}
  
-    bool allocate(UMatData* u, int accessFlags) const
-    {
-        if(!u) return false;
-        if(u->handle != 0)
-            return true;
-        return UMat::getStdAllocator()->allocate(u, accessFlags);
-    }
+void MatAllocator::unmap(UMatData* u) const
+{
+    if(u->urefcount == 0 && u->refcount == 0)
+        deallocate(u);
+}
  
-    void deallocate(UMatData* u) const
+void MatAllocator::download(UMatData* u, void* dstptr,
+         int dims, const size_t sz[],
+         const size_t srcofs[], const size_t srcstep[],
+         const size_t dststep[]) const
+{
+    if(!u)
+        return;
+    int isz[CV_MAX_DIM];
+    uchar* srcptr = u->data;
+    for( int i = 0; i < dims; i++ )
      {
-        if(u)
-        {
-            fastFree(u->origdata);
-            delete u;
-        }
+        CV_Assert( sz[i] <= (size_t)INT_MAX );
+        if( sz[i] == 0 )
+        return;
+        if( srcofs )
+        srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
+        isz[i] = (int)sz[i];
      }
  
-    void map(UMatData*, int) const
-    {
-    }
+    Mat src(dims, isz, CV_8U, srcptr, srcstep);
+    Mat dst(dims, isz, CV_8U, dstptr, dststep);
+
+    const Mat* arrays[] = { &src, &dst };
+    uchar* ptrs[2];
+    NAryMatIterator it(arrays, ptrs, 2);
+    size_t j, planesz = it.size;
  
-    void unmap(UMatData* u) const
+    for( j = 0; j < it.nplanes; j++, ++it )
+        memcpy(ptrs[1], ptrs[0], planesz);
+}
+
+
+void MatAllocator::upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
+                    const size_t dstofs[], const size_t dststep[],
+                    const size_t srcstep[]) const
+{
+    if(!u)
+        return;
+    int isz[CV_MAX_DIM];
+    uchar* dstptr = u->data;
+    for( int i = 0; i < dims; i++ )
      {
-        if(u->urefcount == 0)
-            deallocate(u);
+        CV_Assert( sz[i] <= (size_t)INT_MAX );
+        if( sz[i] == 0 )
+        return;
+        if( dstofs )
+        dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
+        isz[i] = (int)sz[i];
      }
  
-    void download(UMatData* u, void* dstptr,
-                  int dims, const size_t sz[],
+    Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
+    Mat dst(dims, isz, CV_8U, dstptr, dststep);
+
+    const Mat* arrays[] = { &src, &dst };
+    uchar* ptrs[2];
+    NAryMatIterator it(arrays, ptrs, 2);
+    size_t j, planesz = it.size;
+
+    for( j = 0; j < it.nplanes; j++, ++it )
+        memcpy(ptrs[1], ptrs[0], planesz);
+}
+
+void MatAllocator::copy(UMatData* usrc, UMatData* udst, int dims, const size_t sz[],
                    const size_t srcofs[], const size_t srcstep[],
-                  const size_t dststep[]) const
+                  const size_t dstofs[], const size_t dststep[], bool sync) const
+{
+    if(!usrc || !udst)
+        return;
+    int isz[CV_MAX_DIM];
+    uchar* srcptr = usrc->data;
+    uchar* dstptr = udst->data;
+    for( int i = 0; i < dims; i++ )
      {
-        if(!u)
-            return;
-        int isz[CV_MAX_DIM];
-        uchar* srcptr = u->data;
-        for( int i = 0; i < dims; i++ )
-        {
-            CV_Assert( sz[i] <= (size_t)INT_MAX );
-            if( sz[i] == 0 )
-                return;
-            if( srcofs )
-                srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
-            isz[i] = (int)sz[i];
-        }
+        CV_Assert( sz[i] <= (size_t)INT_MAX );
+        if( sz[i] == 0 )
+        return;
+        if( srcofs )
+        srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
+        if( dstofs )
+        dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
+        isz[i] = (int)sz[i];
+    }
  
-        Mat src(dims, isz, CV_8U, srcptr, srcstep);
-        Mat dst(dims, isz, CV_8U, dstptr, dststep);
+    Mat src(dims, isz, CV_8U, srcptr, srcstep);
+    Mat dst(dims, isz, CV_8U, dstptr, dststep);
  
-        const Mat* arrays[] = { &src, &dst };
-        uchar* ptrs[2];
-        NAryMatIterator it(arrays, ptrs, 2);
-        size_t j, planesz = it.size;
+    const Mat* arrays[] = { &src, &dst };
+    uchar* ptrs[2];
+    NAryMatIterator it(arrays, ptrs, 2);
+    size_t j, planesz = it.size;
  
-        for( j = 0; j < it.nplanes; j++, ++it )
-            memcpy(ptrs[1], ptrs[0], planesz);
-    }
+    for( j = 0; j < it.nplanes; j++, ++it )
+        memcpy(ptrs[1], ptrs[0], planesz);
+}
  
-    void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
-                const size_t dstofs[], const size_t dststep[],
-                const size_t srcstep[]) const
+class StdMatAllocator : public MatAllocator
+{
+public:
+    UMatData* allocate(int dims, const int* sizes, int type,
+                       void* data0, size_t* step, int /*flags*/) const
      {
-        if(!u)
-            return;
-        int isz[CV_MAX_DIM];
-        uchar* dstptr = u->data;
-        for( int i = 0; i < dims; i++ )
+        size_t total = CV_ELEM_SIZE(type);
+        for( int i = dims-1; i >= 0; i-- )
          {
-            CV_Assert( sz[i] <= (size_t)INT_MAX );
-            if( sz[i] == 0 )
-                return;
-            if( dstofs )
-                dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
-            isz[i] = (int)sz[i];
+            if( step )
+            {
+                if( data0 && step[i] != CV_AUTOSTEP )
+                {
+                    CV_Assert(total <= step[i]);
+                    total = step[i];
+                }
+                else
+                    step[i] = total;
+            }
+            total *= sizes[i];
          }
+        uchar* data = data0 ? (uchar*)data0 : (uchar*)fastMalloc(total);
+        UMatData* u = new UMatData(this);
+        u->data = u->origdata = data;
+        u->size = total;
+        u->refcount = data0 == 0;
+        if(data0)
+            u->flags |= UMatData::USER_ALLOCATED;
  
-        Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
-        Mat dst(dims, isz, CV_8U, dstptr, dststep);
-
-        const Mat* arrays[] = { &src, &dst };
-        uchar* ptrs[2];
-        NAryMatIterator it(arrays, ptrs, 2);
-        size_t j, planesz = it.size;
+        return u;
+    }
  
-        for( j = 0; j < it.nplanes; j++, ++it )
-            memcpy(ptrs[1], ptrs[0], planesz);
+    bool allocate(UMatData* u, int /*accessFlags*/) const
+    {
+        if(!u) return false;
+        CV_XADD(&u->urefcount, 1);
+        return true;
      }
  
-    void copy(UMatData* usrc, UMatData* udst, int dims, const size_t sz[],
-              const size_t srcofs[], const size_t srcstep[],
-              const size_t dstofs[], const size_t dststep[], bool) const
+    void deallocate(UMatData* u) const
      {
-        if(!usrc || !udst)
-            return;
-        int isz[CV_MAX_DIM];
-        uchar* srcptr = usrc->data;
-        uchar* dstptr = udst->data;
-        for( int i = 0; i < dims; i++ )
+        if(u && u->refcount == 0)
          {
-            CV_Assert( sz[i] <= (size_t)INT_MAX );
-            if( sz[i] == 0 )
-                return;
-            if( srcofs )
-                srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
-            if( dstofs )
-                dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
-            isz[i] = (int)sz[i];
+            if( !(u->flags & UMatData::USER_ALLOCATED) )
+            {
+                fastFree(u->origdata);
+                u->origdata = 0;
+            }
+            delete u;
          }
-
-        Mat src(dims, isz, CV_8U, srcptr, srcstep);
-        Mat dst(dims, isz, CV_8U, dstptr, dststep);
-
-        const Mat* arrays[] = { &src, &dst };
-        uchar* ptrs[2];
-        NAryMatIterator it(arrays, ptrs, 2);
-        size_t j, planesz = it.size;
-
-        for( j = 0; j < it.nplanes; j++, ++it )
-            memcpy(ptrs[1], ptrs[0], planesz);
      }
  };
  
@@ -364,13 +383,13 @@ void Mat::create(int d, const int* _sizes, int _type)
              a = a0;
          try
          {
-            u = a->allocate(dims, size, _type, step.p);
+            u = a->allocate(dims, size, _type, 0, step.p, 0);
              CV_Assert(u != 0);
          }
          catch(...)
          {
              if(a != a0)
-                u = a0->allocate(dims, size, _type, step.p);
+                u = a0->allocate(dims, size, _type, 0, step.p, 0);
              CV_Assert(u != 0);
          }
          CV_Assert( step[dims-1] == (size_t)CV_ELEM_SIZE(flags) );
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp

index 1d4c419..22d8022 100644 (file)
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -612,7 +612,7 @@ static void* initOpenCLAndLoad(const char* funcname)
              return 0;
      }
  
-    return funcname ? dlsym(handle, funcname) : 0;
+    return funcname && handle ? dlsym(handle, funcname) : 0;
  }
  
  #elif defined WIN32 || defined _WIN32
@@ -2002,7 +2002,7 @@ void* Queue::ptr() const
  Queue& Queue::getDefault()
  {
      Queue& q = TLSData::get()->oclQueue;
-    if( !q.p )
+    if( !q.p && haveOpenCL() )
          q.create(Context2::getDefault());
      return q;
  }
@@ -2251,22 +2251,32 @@ int Kernel::set(int i, const KernelArg& arg)
  }
  
  
-bool Kernel::run(int dims, size_t globalsize[], size_t localsize[],
+bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
                   bool sync, const Queue& q)
  {
      if(!p || !p->handle || p->e != 0)
          return false;
  
-    AutoBuffer<size_t> _globalSize(dims);
-    size_t * globalSizePtr = (size_t *)_globalSize;
-    for (int i = 0; i < dims; ++i)
-        globalSizePtr[i] = localsize == NULL ? globalsize[i] :
-                                               ((globalsize[i] + localsize[i] - 1) / localsize[i]) * localsize[i];
-
      cl_command_queue qq = getQueue(q);
-    size_t offset[CV_MAX_DIM] = {0};
+    size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1}, localsize[CV_MAX_DIM] = {1,1,1};
+    size_t total = 1;
+    for (int i = 0; i < dims; i++)
+    {
+        size_t val0 = _localsize ? _localsize[i] :
+            dims == 1 ? 64 : dims == 2 ? 16>>i : dims == 3 ? 8>>(i>0) : 1;
+        size_t val = 1;
+        while( val*2 < val0 )
+            val *= 2;
+        if( _localsize )
+            localsize[i] = val;
+        CV_Assert(_globalsize && _globalsize[i] >= 0);
+        total *= _globalsize[i];
+        globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
+    }
+    if( total == 0 )
+        return true;
      cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
-                                           offset, globalSizePtr, localsize, 0, 0,
+                                           offset, globalsize, _localsize ? localsize : 0, 0, 0,
                                             sync ? 0 : &p->e);
      if( sync || retval < 0 )
      {
@@ -2361,14 +2371,23 @@ struct Program::Impl
              retval = clBuildProgram(handle, n,
                                      (const cl_device_id*)deviceList,
                                      buildflags.c_str(), 0, 0);
-            if( retval == CL_BUILD_PROGRAM_FAILURE )
+            if( retval < 0 )
              {
-                char buf[1<<16];
                  size_t retsz = 0;
-                clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG,
-                                      sizeof(buf)-16, buf, &retsz);
-                errmsg = String(buf);
-                CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str()));
+                retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
+                                               CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
+                if( retval >= 0 && retsz > 0 )
+                {
+                    AutoBuffer<char> bufbuf(retsz + 16);
+                    char* buf = bufbuf;
+                    retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
+                                                   CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
+                    if( retval >= 0 )
+                    {
+                        errmsg = String(buf);
+                        CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str()));
+                    }
+                }
              }
              CV_Assert(retval >= 0);
          }
@@ -2608,17 +2627,17 @@ ProgramSource2::hash_t ProgramSource2::hash() const
  class OpenCLAllocator : public MatAllocator
  {
  public:
-    OpenCLAllocator() {}
+    OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); }
  
-    UMatData* defaultAllocate(int dims, const int* sizes, int type, size_t* step) const
+    UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step, int flags) const
      {
-        UMatData* u = Mat::getStdAllocator()->allocate(dims, sizes, type, step);
+        UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags);
          u->urefcount = 1;
          u->refcount = 0;
          return u;
      }
  
-    void getBestFlags(const Context2& ctx, int& createFlags, int& flags0) const
+    void getBestFlags(const Context2& ctx, int /*flags*/, int& createFlags, int& flags0) const
      {
          const Device& dev = ctx.device(0);
          createFlags = CL_MEM_READ_WRITE;
@@ -2629,10 +2648,12 @@ public:
              flags0 = UMatData::COPY_ON_MAP;
      }
  
-    UMatData* allocate(int dims, const int* sizes, int type, size_t* step) const
+    UMatData* allocate(int dims, const int* sizes, int type,
+                       void* data, size_t* step, int flags) const
      {
          if(!useOpenCL())
-            return defaultAllocate(dims, sizes, type, step);
+            return defaultAllocate(dims, sizes, type, data, step, flags);
+        CV_Assert(data == 0);
          size_t total = CV_ELEM_SIZE(type);
          for( int i = dims-1; i >= 0; i-- )
          {
@@ -2643,13 +2664,13 @@ public:
  
          Context2& ctx = Context2::getDefault();
          int createFlags = 0, flags0 = 0;
-        getBestFlags(ctx, createFlags, flags0);
+        getBestFlags(ctx, flags, createFlags, flags0);
  
          cl_int retval = 0;
          void* handle = clCreateBuffer((cl_context)ctx.ptr(),
                                        createFlags, total, 0, &retval);
          if( !handle || retval < 0 )
-            return defaultAllocate(dims, sizes, type, step);
+            return defaultAllocate(dims, sizes, type, data, step, flags);
          UMatData* u = new UMatData(this);
          u->data = 0;
          u->size = total;
@@ -2672,7 +2693,7 @@ public:
              CV_Assert(u->origdata != 0);
              Context2& ctx = Context2::getDefault();
              int createFlags = 0, flags0 = 0;
-            getBestFlags(ctx, createFlags, flags0);
+            getBestFlags(ctx, accessFlags, createFlags, flags0);
  
              cl_context ctx_handle = (cl_context)ctx.ptr();
              cl_int retval = 0;
@@ -2697,19 +2718,41 @@ public:
          return true;
      }
  
+    void sync(UMatData* u) const
+    {
+        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
+        clFinish(q);
+
+        if( u->hostCopyObsolete() && u->handle &&
+            u->tempCopiedUMat() && u->refcount > 0 && u->origdata)
+        {
+            UMatDataAutoLock lock(u);
+            clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
+                                u->size, u->origdata, 0, 0, 0);
+            u->markHostCopyObsolete(false);
+        }
+        else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
+        {
+            UMatDataAutoLock lock(u);
+            clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
+                                 u->size, u->data, 0, 0, 0);
+        }
+    }
+
      void deallocate(UMatData* u) const
      {
          if(!u)
              return;
+        UMatDataAutoLock lock(u);
  
          // TODO: !!! when we add Shared Virtual Memory Support,
-        // this function (as well as the others should be corrected)
+        // this function (as well as the others) should be corrected
          CV_Assert(u->handle != 0 && u->urefcount == 0);
          if(u->tempUMat())
          {
              if( u->hostCopyObsolete() && u->refcount > 0 && u->tempCopiedUMat() )
              {
-                clEnqueueWriteBuffer((cl_command_queue)Queue::getDefault().ptr(),
+                clEnqueueReadBuffer((cl_command_queue)Queue::getDefault().ptr(),
                                       (cl_mem)u->handle, CL_TRUE, 0,
                                       u->size, u->origdata, 0, 0, 0);
              }
@@ -2717,7 +2760,7 @@ public:
              clReleaseMemObject((cl_mem)u->handle);
              u->handle = 0;
              u->currAllocator = u->prevAllocator;
-            if(u->data && u->copyOnMap())
+            if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
                  fastFree(u->data);
              u->data = u->origdata;
              if(u->refcount == 0)
@@ -2725,8 +2768,11 @@ public:
          }
          else
          {
-            if(u->data && u->copyOnMap())
+            if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
+            {
                  fastFree(u->data);
+                u->data = 0;
+            }
              clReleaseMemObject((cl_mem)u->handle);
              u->handle = 0;
              delete u;
@@ -2793,15 +2839,18 @@ public:
          UMatDataAutoLock autolock(u);
  
          cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
+        cl_int retval = 0;
          if( !u->copyOnMap() && u->data )
          {
-            CV_Assert( clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0) >= 0 );
+            CV_Assert( (retval = clEnqueueUnmapMemObject(q,
+                                (cl_mem)u->handle, u->data, 0, 0, 0)) >= 0 );
+            clFinish(q);
              u->data = 0;
          }
          else if( u->copyOnMap() && u->deviceCopyObsolete() )
          {
-            CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
-                                            u->size, u->data, 0, 0, 0) >= 0 );
+            CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
+                                u->size, u->data, 0, 0, 0)) >= 0 );
          }
          u->markDeviceCopyObsolete(false);
          u->markHostCopyObsolete(false);
@@ -3033,6 +3082,8 @@ public:
          if( sync )
              clFinish(q);
      }
+
+    MatAllocator* matStdAllocator;
  };
  
  MatAllocator* getOpenCLAllocator()
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp

index 2b659fb..33c193d 100644 (file)
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -62,6 +62,17 @@ UMatData::UMatData(const MatAllocator* allocator)
      userdata = 0;
  }
  
+UMatData::~UMatData()
+{
+    prevAllocator = currAllocator = 0;
+    urefcount = refcount = 0;
+    data = origdata = 0;
+    size = 0;
+    flags = 0;
+    handle = 0;
+    userdata = 0;
+}
+
  void UMatData::lock()
  {
      umatLocks[(size_t)(void*)this % UMAT_NLOCKS].lock();
@@ -75,7 +86,9 @@ void UMatData::unlock()
  
  MatAllocator* UMat::getStdAllocator()
  {
-    return ocl::getOpenCLAllocator();
+    if( ocl::haveOpenCL() )
+        return ocl::getOpenCLAllocator();
+    return Mat::getStdAllocator();
  }
  
  void swap( UMat& a, UMat& b )
@@ -195,13 +208,21 @@ static void finalizeHdr(UMat& m)
  UMat Mat::getUMat(int accessFlags) const
  {
      UMat hdr;
-    if(!u)
+    if(!data)
          return hdr;
-    UMat::getStdAllocator()->allocate(u, accessFlags);
+    UMatData* temp_u = u;
+    if(!temp_u)
+    {
+        MatAllocator *a = allocator, *a0 = getStdAllocator();
+        if(!a)
+            a = a0;
+        temp_u = a->allocate(dims, size.p, type(), data, step.p, accessFlags);
+    }
+    UMat::getStdAllocator()->allocate(temp_u, accessFlags);
      hdr.flags = flags;
      setSize(hdr, dims, size.p, step.p);
      finalizeHdr(hdr);
-    hdr.u = u;
+    hdr.u = temp_u;
      hdr.offset = data - datastart;
      return hdr;
  }
@@ -237,13 +258,13 @@ void UMat::create(int d, const int* _sizes, int _type)
              a = a0;
          try
          {
-            u = a->allocate(dims, size, _type, step.p);
+            u = a->allocate(dims, size, _type, 0, step.p, 0);
              CV_Assert(u != 0);
          }
          catch(...)
          {
              if(a != a0)
-                u = a0->allocate(dims, size, _type, step.p);
+                u = a0->allocate(dims, size, _type, 0, step.p, 0);
              CV_Assert(u != 0);
          }
          CV_Assert( step[dims-1] == (size_t)CV_ELEM_SIZE(flags) );
@@ -262,6 +283,16 @@ void UMat::copySize(const UMat& m)
      }
  }
  
+
+UMat::~UMat()
+{
+    if( u && u->refcount > 0 )
+        u->currAllocator->sync(u);
+    release();
+    if( step.p != step.buf )
+        fastFree(step.p);
+}
+
  void UMat::deallocate()
  {
      u->currAllocator->deallocate(u);
@@ -546,7 +577,7 @@ Mat UMat::getMat(int accessFlags) const
  {
      if(!u)
          return Mat();
-    u->currAllocator->map(u, accessFlags);
+    u->currAllocator->map(u, accessFlags | ACCESS_READ);
      CV_Assert(u->data != 0);
      Mat hdr(dims, size.p, type(), u->data + offset, step.p);
      hdr.u = u;
diff --git a/modules/core/test/test_umat.cpp b/modules/core/test/test_umat.cpp

index d7efaa0..54df893 100644 (file)
--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
@@ -200,3 +200,40 @@ void CV_UMatTest::run( int /* start_from */)
  }
  
  TEST(Core_UMat, base) { CV_UMatTest test; test.safe_run(); }
+
+TEST(Core_UMat, simple)
+{
+    {
+    int a[3] = { 1, 2, 3 };
+    Mat m = Mat(1, 1, CV_32SC3, a);
+    UMat u = m.getUMat(ACCESS_READ);
+    EXPECT_NE((void*)NULL, u.u);
+    }
+
+    {
+    Mat m(10, 10, CV_8UC1), ref;
+    for (int y = 0; y < m.rows; ++y)
+    {
+        uchar * const ptr = m.ptr<uchar>(y);
+        for (int x = 0; x < m.cols; ++x)
+            ptr[x] = x + y * 2;
+    }
+
+    ref = m.clone();
+    Rect r(1, 1, 8, 8);
+    ref(r).setTo(17);
+
+    {
+        UMat um = m(r).getUMat(ACCESS_WRITE);
+        um.setTo(17);
+    }
+
+    double err = norm(m, ref, NORM_INF);
+    if(err > 0)
+    {
+        std::cout << "m: " << m << std::endl;
+        std::cout << "ref: " << ref << std::endl;
+    }
+    EXPECT_EQ(err, 0.);
+    }
+}
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp

index fb2627a..4bc3ffb 100644 (file)
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -2695,6 +2695,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
      UMat src = _src.getUMat(), dst;
      Size sz = src.size(), dstSz = sz;
      int scn = src.channels(), depth = src.depth(), bidx;
+    int dims = 2, stripeSize = 32;
      size_t globalsize[] = { src.cols, src.rows };
      ocl::Kernel k;
  
@@ -2765,7 +2766,9 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
          bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
          dcn = 1;
          k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d", depth, scn, bidx));
+                 format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
+                        depth, scn, bidx, stripeSize));
+        globalsize[0] = (src.cols + stripeSize-1)/stripeSize;
          break;
      }
      case COLOR_GRAY2BGR:
@@ -3027,7 +3030,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
          _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
          dst = _dst.getUMat();
          k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
-        ok = k.run(2, globalsize, 0, false);
+        ok = k.run(dims, globalsize, 0, false);
      }
      return ok;
  }
diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl

index ca69629..85c0ca6 100644 (file)
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
@@ -75,6 +75,10 @@
      #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
  #endif
  
+#ifndef STRIPE_SIZE
+#define STRIPE_SIZE 1
+#endif
+
  #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
  
  enum
@@ -105,6 +109,7 @@ __kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset,
                         __global uchar* dstptr, int dststep, int dstoffset,
                         int rows, int cols)
  {
+#if 0
      const int x = get_global_id(0);
      const int y = get_global_id(1);
  
@@ -118,6 +123,25 @@ __kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset,
          dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift);
  #endif
      }
+#else
+    const int x0 = get_global_id(0)*STRIPE_SIZE;
+    const int x1 = min(x0 + STRIPE_SIZE, cols);
+    const int y = get_global_id(1);
+
+    if( y < rows )
+    {
+        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset)) + x0*scn;
+        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset));
+        int x;
+        for( x = x0; x < x1; x++, src += scn )
+#ifdef DEPTH_5
+        dst[x] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f;
+#else
+        dst[x] = (DATA_TYPE)(mad24(src[bidx], B2Y, mad24(src[1], G2Y,
+                        mad24(src[(bidx^2)], R2Y, 1 << (yuv_shift-1)))) >> yuv_shift);
+#endif
+    }
+#endif
  }
  
  __kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
diff --git a/modules/ocl/test/test_api.cpp b/modules/ocl/test/test_api.cpp

index 36eb8bd..a95e60e 100644 (file)
--- a/modules/ocl/test/test_api.cpp
+++ b/modules/ocl/test/test_api.cpp
@@ -41,6 +41,7 @@
  
  #include "test_precomp.hpp"
  #include "opencv2/ocl/cl_runtime/cl_runtime.hpp" // for OpenCL types: cl_mem
+#include "opencv2/core/ocl.hpp"
  
  TEST(TestAPI, openCLExecuteKernelInterop)
  {
@@ -78,3 +79,61 @@ TEST(TestAPI, openCLExecuteKernelInterop)
  
      EXPECT_LE(checkNorm(cpuMat, dst), 1e-3);
  }
+
+TEST(OCL_TestTAPI, performance)
+{
+    cv::RNG rng;
+#if 1
+    cv::Mat src(1280,768,CV_8UC4), dst;
+    rng.fill(src, RNG::UNIFORM, 0, 255);
+#else
+    cv::Mat src = cv::imread("/Users/vp/work/opencv/samples/c/lena.jpg", 1), dst;
+#endif
+
+    cv::UMat usrc, udst;
+    src.copyTo(usrc);
+
+    cv::ocl::oclMat osrc(src);
+    cv::ocl::oclMat odst;
+
+    int cvtcode = cv::COLOR_BGR2GRAY;
+    int i, niters = 10;
+    double t;
+
+    cv::ocl::cvtColor(osrc, odst, cvtcode);
+    cv::ocl::finish();
+    t = (double)cv::getTickCount();
+    for(i = 0; i < niters; i++)
+    {
+        cv::ocl::cvtColor(osrc, odst, cvtcode);
+    }
+    cv::ocl::finish();
+    t = (double)cv::getTickCount() - t;
+    printf("ocl exec time = %gms per iter\n", t*1000./niters/cv::getTickFrequency());
+
+    cv::cvtColor(usrc, udst, cvtcode);
+    cv::ocl::finish2();
+    t = (double)cv::getTickCount();
+    for(i = 0; i < niters; i++)
+    {
+        cv::cvtColor(usrc, udst, cvtcode);
+    }
+    cv::ocl::finish2();
+    t = (double)cv::getTickCount() - t;
+    printf("t-api exec time = %gms per iter\n", t*1000./niters/cv::getTickFrequency());
+
+    cv::cvtColor(src, dst, cvtcode);
+    t = (double)cv::getTickCount();
+    for(i = 0; i < niters; i++)
+    {
+        cv::cvtColor(src, dst, cvtcode);
+    }
+    t = (double)cv::getTickCount() - t;
+    printf("cpu exec time = %gms per iter\n", t*1000./niters/cv::getTickFrequency());
+    /*cv::imshow("result0", dst);
+    cv::imshow("result1", udst);
+    cv::waitKey();
+    cv::destroyWindow("result0");
+    cv::destroyWindow("result1");*/
+}
+
diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp

index 20b4128..734f121 100644 (file)
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@@ -195,8 +195,14 @@ public:
          return u;
      }
  
-    UMatData* allocate(int dims0, const int* sizes, int type, size_t* step) const
+    UMatData* allocate(int dims0, const int* sizes, int type, void* data, size_t* step, int flags) const
      {
+        if( data != 0 )
+        {
+            CV_Error(Error::StsAssert, "The data should normally be NULL!");
+            // probably this is safe to do in such extreme case
+            return stdAllocator->allocate(dims0, sizes, type, data, step, flags);
+        }
          PyEnsureGIL gil;
  
          int depth = CV_MAT_DEPTH(type);
@@ -229,43 +235,11 @@ public:
          {
              PyEnsureGIL gil;
              PyObject* o = (PyObject*)u->userdata;
-            Py_DECREF(o);
+            Py_XDECREF(o);
              delete u;
          }
      }
  
-    void map(UMatData*, int) const
-    {
-    }
-
-    void unmap(UMatData* u) const
-    {
-        if(u->urefcount == 0)
-            deallocate(u);
-    }
-
-    void download(UMatData* u, void* dstptr,
-                  int dims, const size_t sz[],
-                  const size_t srcofs[], const size_t srcstep[],
-                  const size_t dststep[]) const
-    {
-        stdAllocator->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
-    }
-
-    void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
-                const size_t dstofs[], const size_t dststep[],
-                const size_t srcstep[]) const
-    {
-        stdAllocator->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
-    }
-
-    void copy(UMatData* usrc, UMatData* udst, int dims, const size_t sz[],
-              const size_t srcofs[], const size_t srcstep[],
-              const size_t dstofs[], const size_t dststep[], bool sync) const
-    {
-        stdAllocator->copy(usrc, udst, dims, sz, srcofs, srcstep, dstofs, dststep, sync);
-    }
-
      const MatAllocator* stdAllocator;
  };
author	Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
	Sat, 30 Nov 2013 23:12:19 +0000 (03:12 +0400)
committer	Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
	Sat, 30 Nov 2013 23:12:19 +0000 (03:12 +0400)
modules/core/include/opencv2/core/mat.hpp		patch \| blob \| history
modules/core/include/opencv2/core/mat.inl.hpp		patch \| blob \| history
modules/core/src/matrix.cpp		patch \| blob \| history
modules/core/src/ocl.cpp		patch \| blob \| history
modules/core/src/umatrix.cpp		patch \| blob \| history
modules/core/test/test_umat.cpp		patch \| blob \| history
modules/imgproc/src/color.cpp		patch \| blob \| history
modules/imgproc/src/opencl/cvtcolor.cl		patch \| blob \| history
modules/ocl/test/test_api.cpp		patch \| blob \| history
modules/python/src2/cv2.cpp		patch \| blob \| history