modules/ocl/src/pyrlk.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //              Dachuan Zhao, dachuan@multicorewareinc.com
  19 //              Yao Wang, yao@multicorewareinc.com
  20 //      Nathan, liujun@multicorewareinc.com
  21 //
  22 // Redistribution and use in source and binary forms, with or without modification,
  23 // are permitted provided that the following conditions are met:
  24 //
  25 //   * Redistribution's of source code must retain the above copyright notice,
  26 //     this list of conditions and the following disclaimer.
  27 //
  28 //   * Redistribution's in binary form must reproduce the above copyright notice,
  29 //     this list of conditions and the following disclaimer in the documentation
   30 //     and/or other materials provided with the distribution.
  31 //
  32 //   * The name of the copyright holders may not be used to endorse or promote products
  33 //     derived from this software without specific prior written permission.
  34 //
  35 // This software is provided by the copyright holders and contributors "as is" and
  36 // any express or implied warranties, including, but not limited to, the implied
  37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  38 // In no event shall the Intel Corporation or contributors be liable for any direct,
  39 // indirect, incidental, special, exemplary, or consequential damages
  40 // (including, but not limited to, procurement of substitute goods or services;
  41 // loss of use, data, or profits; or business interruption) however caused
  42 // and on any theory of liability, whether in contract, strict liability,
  43 // or tort (including negligence or otherwise) arising in any way out of
  44 // the use of this software, even if advised of the possibility of such damage.
  45 //
  46 //M*/
  47
  48
  49 #include "precomp.hpp"
  50 using namespace cv;
  51 using namespace cv::ocl;
  52
  53 namespace cv
  54 {
  55 namespace ocl
  56 {
  57 ///////////////////////////OpenCL kernel strings///////////////////////////
  58 extern const char *pyrlk;
  59 extern const char *pyrlk_no_image;
  60 extern const char *operator_setTo;
  61 extern const char *operator_convertTo;
  62 extern const char *operator_copyToM;
  63 extern const char *arithm_mul;
  64 extern const char *pyr_down;
  65 }
  66 }
  67
  68 struct dim3
  69 {
  70     unsigned int x, y, z;
  71 };
  72
  73 struct float2
  74 {
  75     float x, y;
  76 };
  77
  78 struct int2
  79 {
  80     int x, y;
  81 };
  82
  83 namespace
  84 {
  85 void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
  86 {
  87     winSize.width *= cn;
  88
  89     if (winSize.width > 32 && winSize.width > 2 * winSize.height)
  90     {
  91         block.x = isDeviceArch11 ? 16 : 32;
  92         block.y = 8;
  93     }
  94     else
  95     {
  96         block.x = 16;
  97         block.y = isDeviceArch11 ? 8 : 16;
  98     }
  99
 100     patch.x = (winSize.width  + block.x - 1) / block.x;
 101     patch.y = (winSize.height + block.y - 1) / block.y;
 102
 103     block.z = patch.z = 1;
 104 }
 105 }
 106
 107 inline int divUp(int total, int grain)
 108 {
 109     return (total + grain - 1) / grain;
 110 }
 111
 112 ///////////////////////////////////////////////////////////////////////////
 113 //////////////////////////////// ConvertTo ////////////////////////////////
 114 ///////////////////////////////////////////////////////////////////////////
 115 static void convert_run_cus(const oclMat &src, oclMat &dst, double alpha, double beta)
 116 {
 117     String kernelName = "convert_to_S";
 118     std::stringstream idxStr;
 119     idxStr << src.depth();
 120     kernelName = kernelName + idxStr.str().c_str();
 121     float alpha_f = (float)alpha, beta_f = (float)beta;
 122     CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
 123     std::vector<std::pair<size_t , const void *> > args;
 124     size_t localThreads[3] = {16, 16, 1};
 125     size_t globalThreads[3];
 126     globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 127     globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
 128     globalThreads[2] = 1;
 129     int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
 130     int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
 131     if(dst.type() == CV_8UC1)
 132     {
 133         globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
 134     }
 135     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
 136     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 137     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
 138     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
 139     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
 140     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
 141     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
 142     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
 143     args.push_back( std::make_pair( sizeof(cl_float) , (void *)&alpha_f ));
 144     args.push_back( std::make_pair( sizeof(cl_float) , (void *)&beta_f ));
 145     openCLExecuteKernel2(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
 146                          localThreads, args, dst.oclchannels(), dst.depth(), CLFLUSH);
 147 }
 148 void convertTo( const oclMat &src, oclMat &m, int rtype, double alpha = 1, double beta = 0 );
 149 void convertTo( const oclMat &src, oclMat &dst, int rtype, double alpha, double beta )
 150 {
 151     //cout << "cv::ocl::oclMat::convertTo()" << endl;
 152
 153     bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
 154                    && fabs(beta) < std::numeric_limits<double>::epsilon();
 155
 156     if( rtype < 0 )
 157         rtype = src.type();
 158     else
 159         rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.oclchannels());
 160
 161     int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype);
 162     if( sdepth == ddepth && noScale )
 163     {
 164         src.copyTo(dst);
 165         return;
 166     }
 167
 168     oclMat temp;
 169     const oclMat *psrc = &src;
 170     if( sdepth != ddepth && psrc == &dst )
 171         psrc = &(temp = src);
 172
 173     dst.create( src.size(), rtype );
 174     convert_run_cus(*psrc, dst, alpha, beta);
 175 }
 176
 177 ///////////////////////////////////////////////////////////////////////////
 178 //////////////////////////////// setTo ////////////////////////////////////
 179 ///////////////////////////////////////////////////////////////////////////
 180 //oclMat &operator = (const Scalar &s)
 181 //{
 182 //    //cout << "cv::ocl::oclMat::=" << endl;
 183 //    setTo(s);
 184 //    return *this;
 185 //}
 186 static void set_to_withoutmask_run_cus(const oclMat &dst, const Scalar &scalar, String kernelName)
 187 {
 188     std::vector<std::pair<size_t , const void *> > args;
 189
 190     size_t localThreads[3] = {16, 16, 1};
 191     size_t globalThreads[3];
 192     globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 193     globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
 194     globalThreads[2] = 1;
 195     int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
 196     if(dst.type() == CV_8UC1)
 197     {
 198         globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 199     }
 200     char compile_option[32];
 201     union sc
 202     {
 203         cl_uchar4 uval;
 204         cl_char4  cval;
 205         cl_ushort4 usval;
 206         cl_short4 shval;
 207         cl_int4 ival;
 208         cl_float4 fval;
 209         cl_double4 dval;
 210     } val;
 211     switch(dst.depth())
 212     {
 213     case 0:
 214         val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
 215         val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
 216         val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
 217         val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
 218         switch(dst.oclchannels())
 219         {
 220         case 1:
 221             sprintf(compile_option, "-D GENTYPE=uchar");
 222             args.push_back( std::make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
 223             break;
 224         case 4:
 225             sprintf(compile_option, "-D GENTYPE=uchar4");
 226             args.push_back( std::make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
 227             break;
 228         default:
 229             CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
 230         }
 231         break;
 232     case 1:
 233         val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
 234         val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
 235         val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
 236         val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
 237         switch(dst.oclchannels())
 238         {
 239         case 1:
 240             sprintf(compile_option, "-D GENTYPE=char");
 241             args.push_back( std::make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
 242             break;
 243         case 4:
 244             sprintf(compile_option, "-D GENTYPE=char4");
 245             args.push_back( std::make_pair( sizeof(cl_char4) , (void *)&val.cval ));
 246             break;
 247         default:
 248             CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
 249         }
 250         break;
 251     case 2:
 252         val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
 253         val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
 254         val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
 255         val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
 256         switch(dst.oclchannels())
 257         {
 258         case 1:
 259             sprintf(compile_option, "-D GENTYPE=ushort");
 260             args.push_back( std::make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
 261             break;
 262         case 4:
 263             sprintf(compile_option, "-D GENTYPE=ushort4");
 264             args.push_back( std::make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
 265             break;
 266         default:
 267             CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
 268         }
 269         break;
 270     case 3:
 271         val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
 272         val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
 273         val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
 274         val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
 275         switch(dst.oclchannels())
 276         {
 277         case 1:
 278             sprintf(compile_option, "-D GENTYPE=short");
 279             args.push_back( std::make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
 280             break;
 281         case 4:
 282             sprintf(compile_option, "-D GENTYPE=short4");
 283             args.push_back( std::make_pair( sizeof(cl_short4) , (void *)&val.shval ));
 284             break;
 285         default:
 286             CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
 287         }
 288         break;
 289     case 4:
 290         val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
 291         val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
 292         val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
 293         val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
 294         switch(dst.oclchannels())
 295         {
 296         case 1:
 297             sprintf(compile_option, "-D GENTYPE=int");
 298             args.push_back( std::make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
 299             break;
 300         case 2:
 301             sprintf(compile_option, "-D GENTYPE=int2");
 302             cl_int2 i2val;
 303             i2val.s[0] = val.ival.s[0];
 304             i2val.s[1] = val.ival.s[1];
 305             args.push_back( std::make_pair( sizeof(cl_int2) , (void *)&i2val ));
 306             break;
 307         case 4:
 308             sprintf(compile_option, "-D GENTYPE=int4");
 309             args.push_back( std::make_pair( sizeof(cl_int4) , (void *)&val.ival ));
 310             break;
 311         default:
 312             CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
 313         }
 314         break;
 315     case 5:
 316         val.fval.s[0] = (float)scalar.val[0];
 317         val.fval.s[1] = (float)scalar.val[1];
 318         val.fval.s[2] = (float)scalar.val[2];
 319         val.fval.s[3] = (float)scalar.val[3];
 320         switch(dst.oclchannels())
 321         {
 322         case 1:
 323             sprintf(compile_option, "-D GENTYPE=float");
 324             args.push_back( std::make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
 325             break;
 326         case 4:
 327             sprintf(compile_option, "-D GENTYPE=float4");
 328             args.push_back( std::make_pair( sizeof(cl_float4) , (void *)&val.fval ));
 329             break;
 330         default:
 331             CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
 332         }
 333         break;
 334     case 6:
 335         val.dval.s[0] = scalar.val[0];
 336         val.dval.s[1] = scalar.val[1];
 337         val.dval.s[2] = scalar.val[2];
 338         val.dval.s[3] = scalar.val[3];
 339         switch(dst.oclchannels())
 340         {
 341         case 1:
 342             sprintf(compile_option, "-D GENTYPE=double");
 343             args.push_back( std::make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
 344             break;
 345         case 4:
 346             sprintf(compile_option, "-D GENTYPE=double4");
 347             args.push_back( std::make_pair( sizeof(cl_double4) , (void *)&val.dval ));
 348             break;
 349         default:
 350             CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
 351         }
 352         break;
 353     default:
 354         CV_Error(Error::StsUnsupportedFormat, "unknown depth");
 355     }
 356 #ifdef CL_VERSION_1_2
 357     if(dst.offset == 0 && dst.cols == dst.wholecols)
 358     {
 359         clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
 360     }
 361     else
 362     {
 363         args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 364         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
 365         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
 366         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
 367         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
 368         openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads,
 369                              localThreads, args, -1, -1, compile_option, CLFLUSH);
 370     }
 371 #else
 372     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 373     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
 374     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
 375     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
 376     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
 377     openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads,
 378                          localThreads, args, -1, -1, compile_option, CLFLUSH);
 379 #endif
 380 }
 381
 382 static oclMat &setTo(oclMat &src, const Scalar &scalar)
 383 {
 384     CV_Assert( src.depth() >= 0 && src.depth() <= 6 );
 385     CV_DbgAssert( !src.empty());
 386
 387     if(src.type() == CV_8UC1)
 388     {
 389         set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask_C1_D0");
 390     }
 391     else
 392     {
 393         set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask");
 394     }
 395
 396     return src;
 397 }
 398
 399 ///////////////////////////////////////////////////////////////////////////
 400 ////////////////////////////////// CopyTo /////////////////////////////////
 401 ///////////////////////////////////////////////////////////////////////////
 402 // static void copy_to_with_mask_cus(const oclMat &src, oclMat &dst, const oclMat &mask, String kernelName)
 403 // {
 404 //     CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
 405 //                   src.rows == dst.rows && src.cols == dst.cols
 406 //                   && mask.type() == CV_8UC1);
 407
 408 //     std::vector<std::pair<size_t , const void *> > args;
 409
 410 //     String string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
 411 //         {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
 412 //         {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
 413 //         {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
 414 //     };
 415 //     char compile_option[32];
 416 //     sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
 417 //     size_t localThreads[3] = {16, 16, 1};
 418 //     size_t globalThreads[3];
 419
 420 //     globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
 421 //     globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
 422 //     globalThreads[2] = 1;
 423
 424 //     int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
 425 //     int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
 426
 427 //     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
 428 //     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 429 //     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
 430 //     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
 431 //     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
 432 //     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
 433 //     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
 434 //     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
 435 //     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
 436 //     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
 437 //     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
 438
 439 //     openCLExecuteKernel2(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
 440 //                          localThreads, args, -1, -1, compile_option, CLFLUSH);
 441 // }
 442
 443 static void copyTo(const oclMat &src, oclMat &m )
 444 {
 445     CV_DbgAssert(!src.empty());
 446     m.create(src.size(), src.type());
 447     openCLCopyBuffer2D(src.clCxt, m.data, m.step, m.offset,
 448                        src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset);
 449 }
 450
 451 // static void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask)
 452 // {
 453 //     if (mask.empty())
 454 //     {
 455 //         copyTo(src, mat);
 456 //     }
 457 //     else
 458 //     {
 459 //         mat.create(src.size(), src.type());
 460 //         copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask");
 461 //     }
 462 // }
 463
 464 static void arithmetic_run(const oclMat &src1, oclMat &dst, String kernelName, const char **kernelString, void *_scalar)
 465 {
 466     if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
 467     {
 468         CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
 469         return;
 470     }
 471
 472     //dst.create(src1.size(), src1.type());
 473     //CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
 474     //          src1.rows == src2.rows && src2.rows == dst.rows);
 475     CV_Assert(src1.cols == dst.cols &&
 476               src1.rows == dst.rows);
 477
 478     CV_Assert(src1.type() == dst.type());
 479     CV_Assert(src1.depth() != CV_8S);
 480
 481     Context  *clCxt = src1.clCxt;
 482     //int channels = dst.channels();
 483     //int depth = dst.depth();
 484
 485     //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
 486     //    {4, 0, 4, 4, 1, 1, 1},
 487     //    {4, 0, 4, 4, 1, 1, 1},
 488     //    {4, 0, 4, 4, 1, 1, 1}
 489     //};
 490
 491     //size_t vector_length = vector_lengths[channels-1][depth];
 492     //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
 493     //int cols = divUp(dst.cols * channels + offset_cols, vector_length);
 494
 495     size_t localThreads[3]  = { 16, 16, 1 };
 496     //size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
 497     //                               divUp(dst.rows, localThreads[1]) * localThreads[1],
 498     //                               1
 499     //                             };
 500     size_t globalThreads[3] = { src1.cols,
 501                                 src1.rows,
 502                                 1
 503                               };
 504
 505     int dst_step1 = dst.cols * dst.elemSize();
 506     std::vector<std::pair<size_t , const void *> > args;
 507     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
 508     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
 509     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
 510     //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
 511     //args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
 512     //args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
 513     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
 514     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
 515     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
 516     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
 517     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
 518     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 519
 520     //if(_scalar != NULL)
 521     //{
 522     float scalar1 = *((float *)_scalar);
 523     args.push_back( std::make_pair( sizeof(float), (float *)&scalar1 ));
 524     //}
 525
 526     openCLExecuteKernel2(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, src1.depth(), CLFLUSH);
 527 }
 528
 529 static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar)
 530 {
 531     arithmetic_run(src1, dst, "arithm_muls", &arithm_mul, (void *)(&scalar));
 532 }
 533
 534 static void pyrdown_run_cus(const oclMat &src, const oclMat &dst)
 535 {
 536
 537     CV_Assert(src.type() == dst.type());
 538     CV_Assert(src.depth() != CV_8S);
 539
 540     Context  *clCxt = src.clCxt;
 541
 542     String kernelName = "pyrDown";
 543
 544     size_t localThreads[3]  = { 256, 1, 1 };
 545     size_t globalThreads[3] = { src.cols, dst.rows, 1};
 546
 547     std::vector<std::pair<size_t , const void *> > args;
 548     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
 549     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
 550     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
 551     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
 552     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
 553     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
 554     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols));
 555
 556     openCLExecuteKernel2(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth(), CLFLUSH);
 557 }
 558
 559 static void pyrDown_cus(const oclMat &src, oclMat &dst)
 560 {
 561     CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
 562
 563     dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
 564
 565     pyrdown_run_cus(src, dst);
 566 }
 567
 568 static void lkSparse_run(oclMat &I, oclMat &J,
 569                   const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount,
 570                   int level, /*dim3 block, */dim3 patch, Size winSize, int iters)
 571 {
 572     Context  *clCxt = I.clCxt;
 573     int elemCntPerRow = I.step / I.elemSize();
 574     String kernelName = "lkSparse";
 575     bool isImageSupported = support_image2d();
 576     size_t localThreads[3]  = { 8, isImageSupported ? 8 : 32, 1 };
 577     size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
 578     int cn = I.oclchannels();
 579     char calcErr;
 580     if (level == 0)
 581     {
 582         calcErr = 1;
 583     }
 584     else
 585     {
 586         calcErr = 0;
 587     }
 588
 589     std::vector<std::pair<size_t , const void *> > args;
 590
 591     cl_mem ITex = isImageSupported ? bindTexture(I) : (cl_mem)I.data;
 592     cl_mem JTex = isImageSupported ? bindTexture(J) : (cl_mem)J.data;
 593
 594     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ITex ));
 595     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&JTex ));
 596     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&prevPts.data ));
 597     args.push_back( std::make_pair( sizeof(cl_int), (void *)&prevPts.step ));
 598     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nextPts.data ));
 599     args.push_back( std::make_pair( sizeof(cl_int), (void *)&nextPts.step ));
 600     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&status.data ));
 601     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&err.data ));
 602     args.push_back( std::make_pair( sizeof(cl_int), (void *)&level ));
 603     args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.rows ));
 604     args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.cols ));
 605     if (!isImageSupported)
 606         args.push_back( std::make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
 607     args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.x ));
 608     args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.y ));
 609     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cn ));
 610     args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.width ));
 611     args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.height ));
 612     args.push_back( std::make_pair( sizeof(cl_int), (void *)&iters ));
 613     args.push_back( std::make_pair( sizeof(cl_char), (void *)&calcErr ));
 614
 615     if(isImageSupported)
 616     {
 617         openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
 618         releaseTexture(ITex);
 619         releaseTexture(JTex);
 620     }
 621     else
 622     {
 623         openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
 624     }
 625 }
 626
 627 void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat *err)
 628 {
 629     if (prevPts.empty())
 630     {
 631         nextPts.release();
 632         status.release();
 633         //if (err) err->release();
 634         return;
 635     }
 636
 637     derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
 638
 639     iters = std::min(std::max(iters, 0), 100);
 640
 641     const int cn = prevImg.oclchannels();
 642
 643     dim3 block, patch;
 644     calcPatchSize(winSize, cn, block, patch, isDeviceArch11_);
 645
 646     CV_Assert(derivLambda >= 0);
 647     CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);
 648     CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
 649     CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
 650     CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
 651
 652     if (useInitialFlow)
 653         CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
 654     else
 655         ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
 656
 657     oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
 658     oclMat temp2 = nextPts.reshape(1);
 659     //oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f));
 660     multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f);
 661     //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
 662
 663     ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
 664     //status.setTo(Scalar::all(1));
 665     setTo(status, Scalar::all(1));
 666
 667     bool errMat = false;
 668     if (!err)
 669     {
 670         err = new oclMat(1, prevPts.cols, CV_32FC1);
 671         errMat = true;
 672     }
 673     else
 674         ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
 675     //ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err);
 676
 677     // build the image pyramids.
 678
 679     prevPyr_.resize(maxLevel + 1);
 680     nextPyr_.resize(maxLevel + 1);
 681
 682     if (cn == 1 || cn == 4)
 683     {
 684         //prevImg.convertTo(prevPyr_[0], CV_32F);
 685         //nextImg.convertTo(nextPyr_[0], CV_32F);
 686         convertTo(prevImg, prevPyr_[0], CV_32F);
 687         convertTo(nextImg, nextPyr_[0], CV_32F);
 688     }
 689     else
 690     {
 691         //oclMat buf_;
 692         //      cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
 693         //      buf_.convertTo(prevPyr_[0], CV_32F);
 694
 695         //      cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
 696         //      buf_.convertTo(nextPyr_[0], CV_32F);
 697     }
 698
 699     for (int level = 1; level <= maxLevel; ++level)
 700     {
 701         pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]);
 702         pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]);
 703     }
 704
 705     // dI/dx ~ Ix, dI/dy ~ Iy
 706
 707     for (int level = maxLevel; level >= 0; level--)
 708     {
 709         lkSparse_run(prevPyr_[level], nextPyr_[level],
 710                      prevPts, nextPts, status, *err, getMinEigenVals, prevPts.cols,
 711                      level, /*block, */patch, winSize, iters);
 712     }
 713
 714     clFinish((cl_command_queue)prevImg.clCxt->oclCommandQueue());
 715
 716     if(errMat)
 717         delete err;
 718 }
 719
 720 static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v,
 721                  oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters)
 722 {
 723     Context  *clCxt = I.clCxt;
 724     bool isImageSupported = support_image2d();
 725     int elemCntPerRow = I.step / I.elemSize();
 726
 727     String kernelName = "lkDense";
 728
 729     size_t localThreads[3]  = { 16, 16, 1 };
 730     size_t globalThreads[3] = { I.cols, I.rows, 1};
 731
 732     bool calcErr;
 733     if (err)
 734     {
 735         calcErr = true;
 736     }
 737     else
 738     {
 739         calcErr = false;
 740     }
 741
 742     cl_mem ITex;
 743     cl_mem JTex;
 744
 745     if (isImageSupported)
 746     {
 747         ITex = bindTexture(I);
 748         JTex = bindTexture(J);
 749     }
 750     else
 751     {
 752         ITex = (cl_mem)I.data;
 753         JTex = (cl_mem)J.data;
 754     }
 755
 756     //int2 halfWin = {(winSize.width - 1) / 2, (winSize.height - 1) / 2};
 757     //const int patchWidth  = 16 + 2 * halfWin.x;
 758     //const int patchHeight = 16 + 2 * halfWin.y;
 759     //size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
 760
 761     std::vector<std::pair<size_t , const void *> > args;
 762
 763     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ITex ));
 764     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&JTex ));
 765
 766     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&u.data ));
 767     args.push_back( std::make_pair( sizeof(cl_int), (void *)&u.step ));
 768     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&v.data ));
 769     args.push_back( std::make_pair( sizeof(cl_int), (void *)&v.step ));
 770     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&prevU.data ));
 771     args.push_back( std::make_pair( sizeof(cl_int), (void *)&prevU.step ));
 772     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&prevV.data ));
 773     args.push_back( std::make_pair( sizeof(cl_int), (void *)&prevV.step ));
 774     args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.rows ));
 775     args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.cols ));
 776     //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&(*err).data ));
 777     //args.push_back( std::make_pair( sizeof(cl_int), (void *)&(*err).step ));
 778     if (!isImageSupported)
 779     {
 780         args.push_back( std::make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
 781     }
 782     args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.width ));
 783     args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.height ));
 784     args.push_back( std::make_pair( sizeof(cl_int), (void *)&iters ));
 785     args.push_back( std::make_pair( sizeof(cl_char), (void *)&calcErr ));
 786
 787     if (isImageSupported)
 788     {
 789         openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
 790
 791         releaseTexture(ITex);
 792         releaseTexture(JTex);
 793     }
 794     else
 795     {
 796         //printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n");
 797         openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
 798     }
 799 }
 800
 801 void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err)
 802 {
 803     CV_Assert(prevImg.type() == CV_8UC1);
 804     CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
 805     CV_Assert(maxLevel >= 0);
 806     CV_Assert(winSize.width > 2 && winSize.height > 2);
 807
 808     if (err)
 809         err->create(prevImg.size(), CV_32FC1);
 810
 811     prevPyr_.resize(maxLevel + 1);
 812     nextPyr_.resize(maxLevel + 1);
 813
 814     prevPyr_[0] = prevImg;
 815     //nextImg.convertTo(nextPyr_[0], CV_32F);
 816     convertTo(nextImg, nextPyr_[0], CV_32F);
 817
 818     for (int level = 1; level <= maxLevel; ++level)
 819     {
 820         pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]);
 821         pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]);
 822     }
 823
 824     ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
 825     ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
 826     ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
 827     ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
 828     //uPyr_[1].setTo(Scalar::all(0));
 829     //vPyr_[1].setTo(Scalar::all(0));
 830     setTo(uPyr_[1], Scalar::all(0));
 831     setTo(vPyr_[1], Scalar::all(0));
 832
 833     Size winSize2i(winSize.width, winSize.height);
 834
 835     int idx = 0;
 836
 837     for (int level = maxLevel; level >= 0; level--)
 838     {
 839         int idx2 = (idx + 1) & 1;
 840
 841         lkDense_run(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
 842                     level == 0 ? err : 0, winSize2i, iters);
 843
 844         if (level > 0)
 845             idx = idx2;
 846     }
 847
 848     //uPyr_[idx].copyTo(u);
 849     //vPyr_[idx].copyTo(v);
 850     copyTo(uPyr_[idx], u);
 851     copyTo(vPyr_[idx], v);
 852
 853     clFinish((cl_command_queue)prevImg.clCxt->oclCommandQueue());
 854 }