modules/softcascade/src/cuda/icf-sc.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include <cuda_invoker.hpp>
  44 #include <float.h>
  45 #include <stdio.h>
  46 #include "opencv2/core/cuda/common.hpp"
  47
  48 namespace cv { namespace softcascade { namespace cudev {
  49
  50 typedef unsigned char uchar;
  51
  52     template <int FACTOR>
  53     __device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x)
  54     {
  55         int out = 0;
  56 #pragma unroll
  57         for(int dy = 0; dy < FACTOR; ++dy)
  58 #pragma unroll
  59             for(int dx = 0; dx < FACTOR; ++dx)
  60             {
  61                 out += ptr[dy * pitch + dx];
  62             }
  63
  64         return static_cast<uchar>(out / (FACTOR * FACTOR));
  65     }
  66
  67     template<int FACTOR>
  68     __global__ void shrink(const uchar* __restrict__ hogluv, const size_t inPitch,
  69                                  uchar* __restrict__ shrank, const size_t outPitch )
  70     {
  71         const int y = blockIdx.y * blockDim.y + threadIdx.y;
  72         const int x = blockIdx.x * blockDim.x + threadIdx.x;
  73
  74         const uchar* ptr = hogluv + (FACTOR * y) * inPitch + (FACTOR * x);
  75
  76         shrank[ y * outPitch + x] = shrink<FACTOR>(ptr, inPitch, y, x);
  77     }
  78
  79     void shrink(const cv::cuda::PtrStepSzb& channels, cv::cuda::PtrStepSzb shrunk)
  80     {
  81         dim3 block(32, 8);
  82         dim3 grid(shrunk.cols / 32, shrunk.rows / 8);
  83         shrink<4><<<grid, block>>>((uchar*)channels.ptr(), channels.step, (uchar*)shrunk.ptr(), shrunk.step);
  84         cudaSafeCall(cudaDeviceSynchronize());
  85     }
  86
  87     __device__ __forceinline__ void luv(const float& b, const float& g, const float& r, uchar& __l, uchar& __u, uchar& __v)
  88     {
  89         // rgb -> XYZ
  90         float x = 0.412453f * r + 0.357580f * g + 0.180423f * b;
  91         float y = 0.212671f * r + 0.715160f * g + 0.072169f * b;
  92         float z = 0.019334f * r + 0.119193f * g + 0.950227f * b;
  93
  94         // computed for D65
  95         const float _ur = 0.19783303699678276f;
  96         const float _vr = 0.46833047435252234f;
  97
  98         const float divisor = fmax((x + 15.f * y + 3.f * z), FLT_EPSILON);
  99         const float _u = __fdividef(4.f * x, divisor);
 100         const float _v = __fdividef(9.f * y, divisor);
 101
 102         float hack = static_cast<float>(__float2int_rn(y * 2047)) / 2047;
 103         const float L = fmax(0.f, ((116.f * cbrtf(hack)) - 16.f));
 104         const float U = 13.f * L * (_u - _ur);
 105         const float V = 13.f * L * (_v - _vr);
 106
 107         // L in [0, 100], u in [-134, 220], v in [-140, 122]
 108         __l = static_cast<uchar>( L * (255.f / 100.f));
 109         __u = static_cast<uchar>((U + 134.f) * (255.f / (220.f + 134.f )));
 110         __v = static_cast<uchar>((V + 140.f) * (255.f / (122.f + 140.f )));
 111     }
 112
 113     __global__ void bgr2Luv_d(const uchar* rgb, const size_t rgbPitch, uchar* luvg, const size_t luvgPitch)
 114     {
 115         const int y = blockIdx.y * blockDim.y + threadIdx.y;
 116         const int x = blockIdx.x * blockDim.x + threadIdx.x;
 117
 118         uchar3 color = ((uchar3*)(rgb + rgbPitch * y))[x];
 119         uchar l, u, v;
 120         luv(color.x / 255.f, color.y / 255.f, color.z / 255.f, l, u, v);
 121
 122         luvg[luvgPitch *  y + x] = l;
 123         luvg[luvgPitch * (y + 480) + x] = u;
 124         luvg[luvgPitch * (y + 2 * 480) + x] = v;
 125     }
 126
 127     void bgr2Luv(const cv::cuda::PtrStepSzb& bgr, cv::cuda::PtrStepSzb luv)
 128     {
 129         dim3 block(32, 8);
 130         dim3 grid(bgr.cols / 32, bgr.rows / 8);
 131
 132         bgr2Luv_d<<<grid, block>>>((const uchar*)bgr.ptr(0), bgr.step, (uchar*)luv.ptr(0), luv.step);
 133
 134         cudaSafeCall(cudaDeviceSynchronize());
 135     }
 136
 137     template<bool isDefaultNum>
 138     __device__ __forceinline__ int fast_angle_bin(const float& dx, const float& dy)
 139     {
 140         const float angle_quantum = CV_PI_F / 6.f;
 141         float angle = atan2(dx, dy) + (angle_quantum / 2.f);
 142
 143         if (angle < 0) angle += CV_PI_F;
 144
 145         const float angle_scaling = 1.f / angle_quantum;
 146         return static_cast<int>(angle * angle_scaling) % 6;
 147     }
 148
 149     template<>
 150     __device__ __forceinline__ int fast_angle_bin<true>(const float& dy, const float& dx)
 151     {
 152         int index = 0;
 153
 154         float max_dot = fabs(dx);
 155
 156         {
 157             const float dot_product = fabs(dx * 0.8660254037844386f + dy * 0.5f);
 158
 159             if(dot_product > max_dot)
 160             {
 161                 max_dot = dot_product;
 162                 index = 1;
 163             }
 164         }
 165         {
 166             const float dot_product = fabs(dy * 0.8660254037844386f + dx * 0.5f);
 167
 168             if(dot_product > max_dot)
 169             {
 170                 max_dot = dot_product;
 171                 index = 2;
 172             }
 173         }
 174         {
 175             int i = 3;
 176             float2 bin_vector_i;
 177             bin_vector_i.x = ::cos(i * (CV_PI_F / 6.f));
 178             bin_vector_i.y = ::sin(i * (CV_PI_F / 6.f));
 179
 180             const float dot_product = fabs(dx * bin_vector_i.x + dy * bin_vector_i.y);
 181             if(dot_product > max_dot)
 182             {
 183                 max_dot = dot_product;
 184                 index = i;
 185             }
 186         }
 187         {
 188             const float dot_product = fabs(dx * (-0.4999999999999998f) + dy * 0.8660254037844387f);
 189             if(dot_product > max_dot)
 190             {
 191                 max_dot = dot_product;
 192                 index = 4;
 193             }
 194         }
 195         {
 196             const float dot_product = fabs(dx * (-0.8660254037844387f) + dy * 0.49999999999999994f);
 197             if(dot_product > max_dot)
 198             {
 199                 max_dot = dot_product;
 200                 index = 5;
 201             }
 202         }
 203         return index;
 204     }
 205
 206     texture<uchar,  cudaTextureType2D, cudaReadModeElementType> tgray;
 207
 208     template<bool isDefaultNum>
 209     __global__ void gray2hog(cv::cuda::PtrStepSzb mag)
 210     {
 211         const int x = blockIdx.x * blockDim.x + threadIdx.x;
 212         const int y = blockIdx.y * blockDim.y + threadIdx.y;
 213
 214         const float dx = tex2D(tgray, x + 1, y + 0) - tex2D(tgray, x - 1, y - 0);
 215         const float dy = tex2D(tgray, x + 0, y + 1) - tex2D(tgray, x - 0, y - 1);
 216
 217         const float magnitude = sqrtf((dx * dx) + (dy * dy)) * (1.0f / sqrtf(2));
 218         const uchar cmag = static_cast<uchar>(magnitude);
 219
 220         mag( 480 * 6 + y, x) = cmag;
 221         mag( 480 * fast_angle_bin<isDefaultNum>(dy, dx) + y, x) = cmag;
 222     }
 223
 224     void gray2hog(const cv::cuda::PtrStepSzb& gray, cv::cuda::PtrStepSzb mag, const int bins)
 225     {
 226         dim3 block(32, 8);
 227         dim3 grid(gray.cols / 32, gray.rows / 8);
 228
 229         cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
 230         cudaSafeCall( cudaBindTexture2D(0, tgray, gray.data, desc, gray.cols, gray.rows, gray.step) );
 231
 232         if (bins == 6)
 233             gray2hog<true><<<grid, block>>>(mag);
 234         else
 235             gray2hog<false><<<grid, block>>>(mag);
 236
 237         cudaSafeCall(cudaDeviceSynchronize());
 238     }
 239
 240     // ToDo: use textures or uncached load instruction.
 241     __global__ void magToHist(const uchar* __restrict__ mag,
 242                               const float* __restrict__ angle, const size_t angPitch,
 243                                     uchar* __restrict__ hog,   const size_t hogPitch, const int fh)
 244     {
 245         const int y = blockIdx.y * blockDim.y + threadIdx.y;
 246         const int x = blockIdx.x * blockDim.x + threadIdx.x;
 247
 248         const int bin = (int)(angle[y * angPitch + x]);
 249         const uchar val = mag[y * hogPitch + x];
 250         hog[((fh * bin) + y) * hogPitch + x] = val;
 251     }
 252
 253     void fillBins(cv::cuda::PtrStepSzb hogluv, const cv::cuda::PtrStepSzf& nangle,
 254                   const int fw,  const int fh, const int bins, cudaStream_t stream )
 255     {
 256         const uchar* mag = (const uchar*)hogluv.ptr(fh * bins);
 257         uchar* hog = (uchar*)hogluv.ptr();
 258         const float* angle = (const float*)nangle.ptr();
 259
 260         dim3 block(32, 8);
 261         dim3 grid(fw / 32, fh / 8);
 262
 263         magToHist<<<grid, block, 0, stream>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh);
 264         if (!stream)
 265         {
 266             cudaSafeCall( cudaGetLastError() );
 267             cudaSafeCall( cudaDeviceSynchronize() );
 268         }
 269     }
 270
 271     __device__ __forceinline__ float overlapArea(const Detection &a, const Detection &b)
 272     {
 273         int w = ::min(a.x + a.w, b.x + b.w) - ::max(a.x, b.x);
 274         int h = ::min(a.y + a.h, b.y + b.h) - ::max(a.y, b.y);
 275
 276         return (w < 0 || h < 0)? 0.f : (float)(w * h);
 277     }
 278
 279     texture<uint4,  cudaTextureType2D, cudaReadModeElementType> tdetections;
 280
 281     __global__ void overlap(const uint* n, uchar* overlaps)
 282     {
 283         const int idx = threadIdx.x;
 284         const int total = *n;
 285
 286         for (int i = idx + 1; i < total; i += 192)
 287         {
 288             const uint4 _a = tex2D(tdetections, i, 0);
 289             const Detection& a = *((Detection*)(&_a));
 290             bool excluded = false;
 291
 292             for (int j = i + 1; j < total; ++j)
 293             {
 294                 const uint4 _b = tex2D(tdetections, j, 0);
 295                 const Detection& b = *((Detection*)(&_b));
 296                 float ovl = overlapArea(a, b) / ::min(a.w * a.h, b.w * b.h);
 297
 298                 if (ovl > 0.65f)
 299                 {
 300                     int suppessed = (a.confidence > b.confidence)? j : i;
 301                     overlaps[suppessed] = 1;
 302                     excluded = excluded || (suppessed == i);
 303                 }
 304
 305             #if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 120)
 306                 if (__all(excluded)) break;
 307             #endif
 308             }
 309         }
 310     }
 311
 312     __global__ void collect(const uint* n, uchar* overlaps, uint* ctr, uint4* suppressed)
 313     {
 314         const int idx = threadIdx.x;
 315         const int total = *n;
 316
 317         for (int i = idx; i < total; i += 192)
 318         {
 319             if (!overlaps[i])
 320             {
 321                 int oidx = atomicInc(ctr, 50);
 322                 suppressed[oidx] = tex2D(tdetections, i + 1, 0);
 323             }
 324         }
 325     }
 326
 327     void suppress(const cv::cuda::PtrStepSzb& objects, cv::cuda::PtrStepSzb overlaps, cv::cuda::PtrStepSzi ndetections,
 328         cv::cuda::PtrStepSzb suppressed, cudaStream_t stream)
 329     {
 330         int block = 192;
 331         int grid = 1;
 332
 333         cudaChannelFormatDesc desc = cudaCreateChannelDesc<uint4>();
 334         size_t offset;
 335         cudaSafeCall( cudaBindTexture2D(&offset, tdetections, objects.data, desc, objects.cols / sizeof(uint4), objects.rows, objects.step));
 336
 337         overlap<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0));
 338         collect<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0), (uint*)suppressed.ptr(0), ((uint4*)suppressed.ptr(0)) + 1);
 339
 340         if (!stream)
 341         {
 342             cudaSafeCall( cudaGetLastError());
 343             cudaSafeCall( cudaDeviceSynchronize());
 344         }
 345     }
 346
 347     template<typename Policy>
 348     struct PrefixSum
 349     {
 350     __device_inline__ static void apply(float& impact)
 351         {
 352     #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
 353     #pragma unroll
 354             // scan on shuffle functions
 355             for (int i = 1; i < Policy::WARP; i *= 2)
 356             {
 357                 const float n = __shfl_up(impact, i, Policy::WARP);
 358
 359                 if (threadIdx.x >= i)
 360                     impact += n;
 361             }
 362     #else
 363             __shared__ volatile float ptr[Policy::STA_X * Policy::STA_Y];
 364
 365             const int idx = threadIdx.y * Policy::STA_X + threadIdx.x;
 366
 367             ptr[idx] = impact;
 368
 369             if ( threadIdx.x >=  1) ptr [idx ] = (ptr [idx -  1] + ptr [idx]);
 370             if ( threadIdx.x >=  2) ptr [idx ] = (ptr [idx -  2] + ptr [idx]);
 371             if ( threadIdx.x >=  4) ptr [idx ] = (ptr [idx -  4] + ptr [idx]);
 372             if ( threadIdx.x >=  8) ptr [idx ] = (ptr [idx -  8] + ptr [idx]);
 373             if ( threadIdx.x >= 16) ptr [idx ] = (ptr [idx - 16] + ptr [idx]);
 374
 375             impact = ptr[idx];
 376     #endif
 377         }
 378     };
 379
 380     texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
 381
 382     template<bool isUp>
 383     __device__ __forceinline__ float rescale(const Level& level, Node& node)
 384     {
 385         uchar4& scaledRect = node.rect;
 386         float relScale = level.relScale;
 387         float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 388
 389         // rescale
 390         scaledRect.x = __float2int_rn(relScale * scaledRect.x);
 391         scaledRect.y = __float2int_rn(relScale * scaledRect.y);
 392         scaledRect.z = __float2int_rn(relScale * scaledRect.z);
 393         scaledRect.w = __float2int_rn(relScale * scaledRect.w);
 394
 395         float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 396
 397         const float expected_new_area = farea * relScale * relScale;
 398         float approx = (sarea == 0)? 1: __fdividef(sarea, expected_new_area);
 399
 400         float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];
 401
 402         return rootThreshold;
 403     }
 404
 405     template<>
 406     __device__ __forceinline__ float rescale<true>(const Level& level, Node& node)
 407     {
 408         uchar4& scaledRect = node.rect;
 409         float relScale = level.relScale;
 410         float farea = scaledRect.z * scaledRect.w;
 411
 412         // rescale
 413         scaledRect.x = __float2int_rn(relScale * scaledRect.x);
 414         scaledRect.y = __float2int_rn(relScale * scaledRect.y);
 415         scaledRect.z = __float2int_rn(relScale * scaledRect.z);
 416         scaledRect.w = __float2int_rn(relScale * scaledRect.w);
 417
 418         float sarea = scaledRect.z * scaledRect.w;
 419
 420         const float expected_new_area = farea * relScale * relScale;
 421         float approx = __fdividef(sarea, expected_new_area);
 422
 423         float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];
 424
 425         return rootThreshold;
 426     }
 427
 428     template<bool isUp>
 429     __device__ __forceinline__ int get(int x, int y, uchar4 area)
 430     {
 431         int a = tex2D(thogluv, x + area.x, y + area.y);
 432         int b = tex2D(thogluv, x + area.z, y + area.y);
 433         int c = tex2D(thogluv, x + area.z, y + area.w);
 434         int d = tex2D(thogluv, x + area.x, y + area.w);
 435
 436         return (a - b + c - d);
 437     }
 438
 439     template<>
 440     __device__ __forceinline__ int get<true>(int x, int y, uchar4 area)
 441     {
 442         x += area.x;
 443         y += area.y;
 444
 445         int a = tex2D(thogluv, x, y);
 446         int b = tex2D(thogluv, x + area.z, y);
 447         int c = tex2D(thogluv, x + area.z, y + area.w);
 448         int d = tex2D(thogluv, x, y + area.w);
 449
 450         return (a - b + c - d);
 451     }
 452
 453     texture<float2,  cudaTextureType2D, cudaReadModeElementType> troi;
 454
 455 template<typename Policy>
 456 template<bool isUp>
 457 __device_inline__ void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
 458 {
 459     const int y = blockIdx.y * blockDim.y + threadIdx.y;
 460     const int x = blockIdx.x;
 461
 462     // load Level
 463     __shared__ Level level;
 464
 465     // check POI
 466     __shared__ volatile char roiCache[Policy::STA_Y];
 467
 468     if (!threadIdx.y && !threadIdx.x)
 469         ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
 470
 471     __syncthreads();
 472
 473     if (!roiCache[threadIdx.y]) return;
 474
 475     if (!threadIdx.x)
 476         level = levels[downscales + blockIdx.z];
 477
 478     if(x >= level.workRect.x || y >= level.workRect.y) return;
 479
 480     int st = level.octave * level.step;
 481     const int stEnd = st + level.step;
 482
 483     const int hogluvStep = gridDim.y * Policy::STA_Y;
 484     float confidence = 0.f;
 485     for(; st < stEnd; st += Policy::WARP)
 486     {
 487         const int nId = (st + threadIdx.x) * 3;
 488
 489         Node node = nodes[nId];
 490
 491         float threshold = rescale<isUp>(level, node);
 492         int sum = get<isUp>(x, y + (node.threshold >> 28) * hogluvStep, node.rect);
 493
 494         int next = 1 + (int)(sum >= threshold);
 495
 496         node = nodes[nId + next];
 497         threshold = rescale<isUp>(level, node);
 498         sum = get<isUp>(x, y + (node.threshold >> 28) * hogluvStep, node.rect);
 499
 500         const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
 501         float impact = leaves[(st + threadIdx.x) * 4 + lShift];
 502
 503         PrefixSum<Policy>::apply(impact);
 504
 505     #if __CUDA_ARCH__ >= 120
 506         if(__any((confidence + impact <= stages[(st + threadIdx.x)]))) st += 2048;
 507     #endif
 508     #if __CUDA_ARCH__ >= 300
 509         impact = __shfl(impact, 31);
 510     #endif
 511
 512         confidence += impact;
 513     }
 514
 515     if(!threadIdx.x && st == stEnd &&  ((confidence - FLT_EPSILON) >= 0))
 516     {
 517         int idx = atomicInc(ctr, ndetections);
 518         objects[idx] = Detection(__float2int_rn(x * Policy::SHRINKAGE),
 519             __float2int_rn(y * Policy::SHRINKAGE), level.objSize.x, level.objSize.y, confidence);
 520     }
 521 }
 522
 523 template<typename Policy, bool isUp>
 524 __global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* objects, const uint n, uint* ctr, const int downs)
 525 {
 526     invoker.template detect<isUp>(objects, n, ctr, downs);
 527 }
 528
 529 template<typename Policy>
 530 void CascadeInvoker<Policy>::operator()(const cv::cuda::PtrStepSzb& roi, const cv::cuda::PtrStepSzi& hogluv,
 531     cv::cuda::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
 532 {
 533     int fw = roi.rows;
 534     int fh = roi.cols;
 535
 536     dim3 grid(fw, fh / Policy::STA_Y, downscales);
 537
 538     uint* ctr = (uint*)(objects.ptr(0));
 539     Detection* det = ((Detection*)objects.ptr(0)) + 1;
 540     uint max_det = objects.cols / sizeof(Detection);
 541
 542     cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
 543     cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 544
 545     cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<typename Policy::roi_type>();
 546     cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / Policy::STA_Y, roi.rows, roi.step));
 547
 548     const CascadeInvoker<Policy> inv = *this;
 549
 550     soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, 0);
 551     cudaSafeCall( cudaGetLastError());
 552
 553     grid = dim3(fw, fh / Policy::STA_Y, min(38, scales) - downscales);
 554     soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, downscales);
 555
 556     if (!stream)
 557     {
 558         cudaSafeCall( cudaGetLastError());
 559         cudaSafeCall( cudaDeviceSynchronize());
 560     }
 561 }
 562
 563 template void CascadeInvoker<GK107PolicyX4>::operator()(const cv::cuda::PtrStepSzb& roi, const cv::cuda::PtrStepSzi& hogluv,
 564     cv::cuda::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
 565
 566 }}}