#include <algorithm>\r
#include "internal_shared.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace canny \r
+ namespace canny\r
{\r
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)\r
{\r
}\r
};\r
\r
- template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, \r
+ template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,\r
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)\r
{\r
__shared__ int sdx[18][16];\r
}\r
\r
//////////////////////////////////////////////////////////////////////////////////////////\r
- \r
+\r
#define CANNY_SHIFT 15\r
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)\r
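// 0.41421356... is tan(22.5 degrees) = sqrt(2) - 1; TG22 stores it in Q15 fixed point
// (scaled by 1 << CANNY_SHIFT) so gradient directions can be compared against the
// 22.5 degree sector boundary without floating-point arithmetic.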
\r
edge_type = 1 + (int)(m > high_thresh);\r
}\r
}\r
- \r
+\r
map.ptr(i + 1)[j + 1] = edge_type;\r
}\r
}\r
\r
const int tid = threadIdx.y * 16 + threadIdx.x;\r
const int lx = tid % 18;\r
- const int ly = tid / 18; \r
+ const int ly = tid / 18;\r
\r
if (ly < 14)\r
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];\r
n += smem[threadIdx.y ][threadIdx.x ] == 2;\r
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;\r
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;\r
- \r
+\r
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;\r
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;\r
- \r
+\r
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;\r
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;\r
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;\r
n += smem[threadIdx.y ][threadIdx.x ] == 1;\r
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;\r
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;\r
- \r
+\r
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;\r
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;\r
- \r
+\r
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;\r
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;\r
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;\r
#if __CUDA_ARCH__ >= 120\r
\r
const int stack_size = 512;\r
- \r
+\r
__shared__ unsigned int s_counter;\r
__shared__ unsigned int s_ind;\r
__shared__ ushort2 s_st[stack_size];\r
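// s_st acts as a per-block stack of up to stack_size pixel coordinates still to be processed;
// s_counter tracks how many entries it currently holds (popped in the loop below).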
if (subTaskIdx < portion)\r
pos = s_st[s_counter - 1 - subTaskIdx];\r
__syncthreads();\r
- \r
+\r
if (threadIdx.x == 0)\r
s_counter -= portion;\r
__syncthreads();\r
- \r
+\r
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)\r
{\r
pos.x += c_dx[threadIdx.x & 7];\r
{\r
void* counter_ptr;\r
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );\r
- \r
+\r
unsigned int count;\r
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
\r
#include <opencv2/gpu/device/color.hpp>\r
#include <cvt_colot_internal.h>\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)\r
{\r
{\r
enum { smart_block_dim_y = 8 };\r
enum { smart_shift = 4 };\r
- }; \r
+ };\r
\r
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)\r
{\r
#include "opencv2/gpu/device/border_interpolate.hpp"\r
#include "opencv2/gpu/device/static_check.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace column_filter \r
+ namespace column_filter\r
{\r
#define MAX_KERNEL_SIZE 32\r
\r
\r
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));\r
- \r
+\r
B<T> brd(src.rows);\r
\r
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);\r
{\r
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);\r
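// The table below is indexed as callers[border mode][kernel size]: 5 supported border types and
// kernel sizes up to MAX_KERNEL_SIZE, with index 0 left as a null entry (see the
// callers[brd_type][ksize] dispatch further down).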
\r
- static const caller_t callers[5][33] = \r
+ static const caller_t callers[5][33] =\r
{\r
{\r
0,\r
linearColumnFilter_caller<30, T, D, BrdColWrap>,\r
linearColumnFilter_caller<31, T, D, BrdColWrap>,\r
linearColumnFilter_caller<32, T, D, BrdColWrap>\r
- } \r
+ }\r
};\r
- \r
+\r
loadKernel(kernel, ksize);\r
\r
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);\r
#include "internal_shared.hpp"\r
#include "opencv2/gpu/device/border_interpolate.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace imgproc \r
+ namespace imgproc\r
{\r
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)\r
{\r
\r
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher\r
{\r
- static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left, \r
+ static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left,\r
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)\r
- { \r
+ {\r
dim3 block(32, 8);\r
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
\r
}\r
};\r
\r
- template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, \r
+ template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode,\r
const T* borderValue, cudaStream_t stream)\r
{\r
typedef typename TypeVec<T, cn>::vec_type vec_type;\r
\r
typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);\r
\r
- static const caller_t callers[5] = \r
+ static const caller_t callers[5] =\r
{\r
- CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call, \r
- CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call, \r
- CopyMakeBorderDispatcher<BrdConstant, vec_type>::call, \r
- CopyMakeBorderDispatcher<BrdReflect, vec_type>::call, \r
- CopyMakeBorderDispatcher<BrdWrap, vec_type>::call \r
+ CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,\r
+ CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,\r
+ CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,\r
+ CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,\r
+ CopyMakeBorderDispatcher<BrdWrap, vec_type>::call\r
};\r
\r
callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);\r
//\r
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong\r
//\r
-// The original code was written by Paul Furgale and Chi Hay Tong \r
+// The original code was written by Paul Furgale and Chi Hay Tong\r
// and later optimized and prepared for integration into OpenCV by Itseez.\r
//\r
//M*/\r
#include "opencv2/gpu/device/common.hpp"\r
#include "opencv2/gpu/device/utility.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace fast \r
+ namespace fast\r
{\r
__device__ unsigned int g_counter = 0;\r
\r
\r
\r
\r
- d1 = diffType(v, C[0] & 0xff, th); \r
+ d1 = diffType(v, C[0] & 0xff, th);\r
d2 = diffType(v, C[2] & 0xff, th);\r
\r
if ((d1 | d2) == 0)\r
return;\r
\r
mask1 |= (d1 & 1) << 0;\r
- mask2 |= ((d1 & 2) >> 1) << 0; \r
+ mask2 |= ((d1 & 2) >> 1) << 0;\r
\r
mask1 |= (d2 & 1) << 8;\r
mask2 |= ((d2 & 2) >> 1) << 8;\r
return;*/\r
\r
mask1 |= (d1 & 1) << 1;\r
- mask2 |= ((d1 & 2) >> 1) << 1; \r
+ mask2 |= ((d1 & 2) >> 1) << 1;\r
\r
mask1 |= (d2 & 1) << 9;\r
mask2 |= ((d2 & 2) >> 1) << 9;\r
return;*/\r
\r
mask1 |= (d1 & 1) << 5;\r
- mask2 |= ((d1 & 2) >> 1) << 5; \r
+ mask2 |= ((d1 & 2) >> 1) << 5;\r
\r
mask1 |= (d2 & 1) << 13;\r
mask2 |= ((d2 & 2) >> 1) << 13;\r
// 0 -> not a keypoint\r
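// mask1/mask2 are 16-bit masks of circle pixels that are respectively darker/brighter than the
// centre by the threshold; following the usual FAST test, a keypoint needs more than 8 such
// pixels forming a contiguous arc, which the precomputed c_table lookup below checks.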
__device__ __forceinline__ bool isKeyPoint(int mask1, int mask2)\r
{\r
- return (__popc(mask1) > 8 && (c_table[(mask1 >> 3) - 63] & (1 << (mask1 & 7)))) || \r
+ return (__popc(mask1) > 8 && (c_table[(mask1 >> 3) - 63] & (1 << (mask1 & 7)))) ||\r
(__popc(mask2) > 8 && (c_table[(mask2 >> 3) - 63] & (1 << (mask2 & 7))));\r
}\r
\r
calcMask(C, v, mid, mask1, mask2);\r
\r
int isKp = static_cast<int>(isKeyPoint(mask1, mask2));\r
- \r
+\r
min = isKp * (mid + 1) + (isKp ^ 1) * min;\r
max = (isKp ^ 1) * (mid - 1) + isKp * max;\r
}\r
\r
return min - 1;\r
}\r
- \r
+\r
template <bool calcScore, class Mask>\r
__global__ void calcKeypoints(const DevMem2Db img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)\r
{\r
C[2] |= static_cast<uint>(img(i - 1, j - 3)) << (3 * 8);\r
C[1] |= static_cast<uint>(img(i - 1, j + 3)) << 8;\r
\r
- C[3] |= static_cast<uint>(img(i, j - 3)); \r
+ C[3] |= static_cast<uint>(img(i, j - 3));\r
v = static_cast<int>(img(i, j));\r
C[1] |= static_cast<uint>(img(i, j + 3));\r
\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaDeviceSynchronize() );\r
- \r
+\r
unsigned int count;\r
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
\r
\r
int score = scoreMat(loc.y, loc.x);\r
\r
- bool ismax = \r
+ bool ismax =\r
score > scoreMat(loc.y - 1, loc.x - 1) &&\r
score > scoreMat(loc.y - 1, loc.x ) &&\r
score > scoreMat(loc.y - 1, loc.x + 1) &&\r
\r
score > scoreMat(loc.y , loc.x - 1) &&\r
score > scoreMat(loc.y , loc.x + 1) &&\r
- \r
+\r
score > scoreMat(loc.y + 1, loc.x - 1) &&\r
score > scoreMat(loc.y + 1, loc.x ) &&\r
score > scoreMat(loc.y + 1, loc.x + 1);\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaDeviceSynchronize() );\r
- \r
+\r
unsigned int new_count;\r
cudaSafeCall( cudaMemcpy(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
\r
//\r
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong\r
//\r
-// The original code was written by Paul Furgale and Chi Hay Tong \r
+// The original code was written by Paul Furgale and Chi Hay Tong\r
// and later optimized and prepared for integration into OpenCV by Itseez.\r
//\r
//M*/\r
#include "opencv2/gpu/device/common.hpp"\r
#include "opencv2/gpu/device/utility.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace gfft \r
+ namespace gfft\r
{\r
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaDeviceSynchronize() );\r
- \r
+\r
uint count;\r
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) );\r
\r
\r
class EigGreater\r
{\r
- public: \r
- __device__ __forceinline__ bool operator()(float2 a, float2 b) const \r
- { \r
+ public:\r
+ __device__ __forceinline__ bool operator()(float2 a, float2 b) const\r
+ {\r
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);\r
}\r
};\r
#include "opencv2/gpu/device/utility.hpp"\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
#define UINT_BITS 32U\r
\r
\r
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)\r
\r
- namespace hist \r
+ namespace hist\r
{\r
#if (!USE_SMEM_ATOMICS)\r
\r
{\r
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(\r
DevMem2D_<uint>(src),\r
- buf, \r
+ buf,\r
static_cast<uint>(src.rows * src.step / sizeof(uint)),\r
src.cols);\r
\r
\r
#include "internal_shared.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
// Other values are not supported\r
#define CELL_WIDTH 8\r
#define CELLS_PER_BLOCK_X 2\r
#define CELLS_PER_BLOCK_Y 2\r
\r
- namespace hog \r
+ namespace hog\r
{\r
__constant__ int cnbins;\r
__constant__ int cblock_stride_x;\r
__constant__ int cdescr_width;\r
\r
\r
- /* Returns the nearest upper power of two, works only for \r
+ /* Returns the nearest upper power of two, works only for\r
the typical GPU thread count (per block) values */
int power_2up(unsigned int n)\r
{\r
}\r
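// Illustrative sketch only (the body of power_2up is elided in this fragment): one simple way
// to round a typical per-block thread count up to the next power of two. The helper name is
// hypothetical and not part of the original code.
static inline int power_2up_sketch(unsigned int n)
{
    unsigned int p = 1;
    while (p < n)
        p <<= 1;            // keep doubling until p >= n
    return static_cast<int>(p);
}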
\r
\r
- void set_up_constants(int nbins, int block_stride_x, int block_stride_y, \r
+ void set_up_constants(int nbins, int block_stride_x, int block_stride_y,\r
int nblocks_win_x, int nblocks_win_y)\r
{\r
- cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) ); \r
- cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) ); \r
- cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) ); \r
- cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) ); \r
- cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) ); \r
+ cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );\r
\r
- int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; \r
- cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) ); \r
+ int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );\r
\r
- int block_hist_size_2up = power_2up(block_hist_size); \r
+ int block_hist_size_2up = power_2up(block_hist_size);\r
cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );\r
\r
int descr_width = nblocks_win_x * block_hist_size;\r
\r
\r
template <int nblocks> // Number of histogram blocks processed by a single GPU thread block
- __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrElemStepf grad, \r
+ __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrElemStepf grad,\r
const PtrElemStep qangle, float scale, float* block_hists)\r
{\r
const int block_x = threadIdx.z;\r
float* hists = smem;\r
float* final_hist = smem + cnbins * 48 * nblocks;\r
\r
- const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x + \r
+ const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +\r
4 * cell_x + cell_thread_x;\r
const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;\r
\r
// 12 means that 12 pixels affect a block's cell (in one row)
if (cell_thread_x < 12)\r
{\r
- float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y + \r
- cell_x + block_x * CELLS_PER_BLOCK_X) + \r
+ float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y +\r
+ cell_x + block_x * CELLS_PER_BLOCK_X) +\r
cell_thread_x;\r
for (int bin_id = 0; bin_id < cnbins; ++bin_id)\r
hist[bin_id * 48 * nblocks] = 0.f;\r
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);\r
int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);\r
\r
- float gaussian = ::expf(-(dist_center_y * dist_center_y + \r
+ float gaussian = ::expf(-(dist_center_y * dist_center_y +\r
dist_center_x * dist_center_x) * scale);\r
- float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) * \r
+ float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *\r
(8.f - ::fabs(dist_x + 0.5f)) / 64.f;\r
\r
hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;\r
{\r
if (cell_thread_x < 6) hist_[0] += hist_[6];\r
if (cell_thread_x < 3) hist_[0] += hist_[3];\r
- if (cell_thread_x == 0) \r
- final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id] \r
+ if (cell_thread_x == 0)\r
+ final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id]\r
= hist_[0] + hist_[1] + hist_[2];\r
}\r
}\r
\r
__syncthreads();\r
\r
- float* block_hist = block_hists + (blockIdx.y * img_block_width + \r
- blockIdx.x * blockDim.z + block_x) * \r
- cblock_hist_size; \r
+ float* block_hist = block_hists + (blockIdx.y * img_block_width +\r
+ blockIdx.x * blockDim.z + block_x) *\r
+ cblock_hist_size;\r
\r
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;\r
if (tid < cblock_hist_size)\r
- block_hist[tid] = final_hist[block_x * cblock_hist_size + tid]; \r
+ block_hist[tid] = final_hist[block_x * cblock_hist_size + tid];\r
}\r
\r
\r
- void compute_hists(int nbins, int block_stride_x, int block_stride_y, \r
- int height, int width, const DevMem2Df& grad, \r
- const DevMem2Db& qangle, float sigma, float* block_hists) \r
+ void compute_hists(int nbins, int block_stride_x, int block_stride_y,\r
+ int height, int width, const DevMem2Df& grad,\r
+ const DevMem2Db& qangle, float sigma, float* block_hists)\r
{\r
const int nblocks = 1;\r
\r
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / \r
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /\r
block_stride_x;\r
- int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / \r
+ int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) /\r
block_stride_y;\r
\r
dim3 grid(divUp(img_block_width, nblocks), img_block_height);\r
dim3 threads(32, 2, nblocks);\r
\r
- cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>, \r
+ cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>,\r
cudaFuncCachePreferL1));\r
- \r
+\r
// Precompute the Gaussian spatial window parameter
float scale = 1.f / (2.f * sigma * sigma);\r
\r
//\r
\r
\r
- template<int size> \r
+ template<int size>\r
__device__ float reduce_smem(volatile float* smem)\r
- { \r
+ {\r
unsigned int tid = threadIdx.x;\r
float sum = smem[tid];\r
\r
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }\r
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }\r
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }\r
- \r
+\r
if (tid < 32)\r
- { \r
+ {\r
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];\r
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];\r
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];\r
\r
__syncthreads();\r
sum = smem[0];\r
- \r
+\r
return sum;\r
}\r
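// Usage sketch for reduce_smem above (illustrative, not part of the original patch): each thread
// stores its element into shared memory before the call, and every thread receives the same total.
//
//     __shared__ float buf[nthreads];
//     buf[threadIdx.x] = my_value;
//     __syncthreads();
//     float total = reduce_smem<nthreads>(buf);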
\r
\r
- template <int nthreads, // Number of threads which process one block historgam \r
+ template <int nthreads, // Number of threads which process one block histogram
int nblocks> // Number of block histograms processed by one GPU thread block
__global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,\r
- const int img_block_width, \r
+ const int img_block_width,\r
float* block_hists, float threshold)\r
{\r
if (blockIdx.x * blockDim.z + threadIdx.z >= img_block_width)\r
return;\r
\r
- float* hist = block_hists + (blockIdx.y * img_block_width + \r
- blockIdx.x * blockDim.z + threadIdx.z) * \r
+ float* hist = block_hists + (blockIdx.y * img_block_width +\r
+ blockIdx.x * blockDim.z + threadIdx.z) *\r
block_hist_size + threadIdx.x;\r
- \r
+\r
__shared__ float sh_squares[nthreads * nblocks];\r
float* squares = sh_squares + threadIdx.z * nthreads;\r
- \r
+\r
float elem = 0.f;\r
if (threadIdx.x < block_hist_size)\r
elem = hist[0];\r
- \r
- squares[threadIdx.x] = elem * elem; \r
+\r
+ squares[threadIdx.x] = elem * elem;\r
\r
__syncthreads();\r
float sum = reduce_smem<nthreads>(squares);\r
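// The following is the standard L2-Hys block normalisation: L2-normalise with a small bias,
// clip each bin to `threshold`, then renormalise with a 1e-3 epsilon.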
- \r
- float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size); \r
+\r
+ float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);\r
elem = ::min(elem * scale, threshold);\r
- \r
+\r
__syncthreads();\r
squares[threadIdx.x] = elem * elem;\r
\r
__syncthreads();\r
sum = reduce_smem<nthreads>(squares);\r
scale = 1.0f / (::sqrtf(sum) + 1e-3f);\r
- \r
+\r
if (threadIdx.x < block_hist_size)\r
hist[0] = elem * scale;\r
}\r
\r
\r
- void normalize_hists(int nbins, int block_stride_x, int block_stride_y, \r
+ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,\r
int height, int width, float* block_hists, float threshold)\r
- { \r
+ {\r
const int nblocks = 1;\r
\r
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;\r
//\r
\r
\r
- template <int nthreads, // Number of threads per one histogram block \r
+ template <int nthreads, // Number of threads per one histogram block\r
int nblocks> // Number of histogram blocks processed by a single GPU thread block
- __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width, \r
+ __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,\r
const int win_block_stride_x, const int win_block_stride_y,\r
const float* block_hists, const float* coefs,\r
float free_coef, float threshold, unsigned char* labels)\r
- { \r
+ {\r
const int win_x = threadIdx.z;\r
if (blockIdx.x * blockDim.z + win_x >= img_win_width)\r
return;\r
\r
- const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + \r
- blockIdx.x * win_block_stride_x * blockDim.z + win_x) * \r
+ const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +\r
+ blockIdx.x * win_block_stride_x * blockDim.z + win_x) *\r
cblock_hist_size;\r
\r
float product = 0.f;\r
\r
__syncthreads();\r
\r
- if (nthreads >= 512) \r
- { \r
+ if (nthreads >= 512)\r
+ {\r
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];\r
- __syncthreads(); \r
+ __syncthreads();\r
}\r
- if (nthreads >= 256) \r
- { \r
- if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128]; \r
- __syncthreads(); \r
+ if (nthreads >= 256)\r
+ {\r
+ if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];\r
+ __syncthreads();\r
}\r
- if (nthreads >= 128) \r
- { \r
- if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64]; \r
- __syncthreads(); \r
+ if (nthreads >= 128)\r
+ {\r
+ if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];\r
+ __syncthreads();\r
}\r
- \r
+\r
if (threadIdx.x < 32)\r
- { \r
+ {\r
volatile float* smem = products;\r
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];\r
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];\r
}\r
\r
\r
- void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
- int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
+ void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,\r
+ int win_stride_y, int win_stride_x, int height, int width, float* block_hists,\r
float* coefs, float free_coef, float threshold, unsigned char* labels)\r
- { \r
+ {\r
const int nthreads = 256;\r
const int nblocks = 1;\r
\r
\r
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;\r
classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(\r
- img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, \r
+ img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,\r
block_hists, coefs, free_coef, threshold, labels);\r
cudaSafeCall( cudaGetLastError() );\r
\r
\r
\r
template <int nthreads>\r
- __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, \r
+ __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,\r
const float* block_hists, PtrElemStepf descriptors)\r
{\r
// Get the top-left corner of the window in src
- const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + \r
+ const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +\r
blockIdx.x * win_block_stride_x) * cblock_hist_size;\r
\r
// Get the top-left corner of the window in dst
}\r
\r
\r
- void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, \r
+ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,\r
int height, int width, float* block_hists, DevMem2Df descriptors)\r
{\r
const int nthreads = 256;\r
\r
\r
template <int nthreads>\r
- __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x, \r
- const int win_block_stride_y, const float* block_hists, \r
+ __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,\r
+ const int win_block_stride_y, const float* block_hists,\r
PtrElemStepf descriptors)\r
{\r
// Get the top-left corner of the window in src
- const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + \r
+ const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +\r
blockIdx.x * win_block_stride_x) * cblock_hist_size;\r
\r
// Get the top-left corner of the window in dst
int y = block_idx / cnblocks_win_x;\r
int x = block_idx - y * cnblocks_win_x;\r
\r
- descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] \r
+ descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block]\r
= hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];\r
}\r
}\r
\r
\r
- void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
- int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
+ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,\r
+ int win_stride_y, int win_stride_x, int height, int width, float* block_hists,\r
DevMem2Df descriptors)\r
{\r
const int nthreads = 256;\r
\r
\r
template <int nthreads, int correct_gamma>\r
- __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrElemStep img, \r
+ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrElemStep img,\r
float angle_scale, PtrElemStepf grad, PtrElemStep qangle)\r
{\r
const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
__shared__ float sh_row[(nthreads + 2) * 3];\r
\r
uchar4 val;\r
- if (x < width) \r
- val = row[x]; \r
- else \r
+ if (x < width)\r
+ val = row[x];\r
+ else\r
val = row[width - 2];\r
\r
sh_row[threadIdx.x + 1] = val.x;\r
\r
float3 dx;\r
if (correct_gamma)\r
- dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z)); \r
+ dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));\r
else\r
- dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z); \r
+ dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);\r
\r
float3 dy = make_float3(0.f, 0.f, 0.f);\r
\r
\r
float mag0 = dx.x * dx.x + dy.x * dy.x;\r
float mag1 = dx.y * dx.y + dy.y * dy.y;\r
- if (mag0 < mag1) \r
+ if (mag0 < mag1)\r
{\r
best_dx = dx.y;\r
best_dy = dy.y;\r
}\r
\r
\r
- void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2Db& img, \r
+ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2Db& img,\r
float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)\r
{\r
const int nthreads = 256;\r
}\r
\r
template <int nthreads, int correct_gamma>\r
- __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img, \r
+ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img,\r
float angle_scale, PtrElemStepf grad, PtrElemStep qangle)\r
{\r
const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
\r
__shared__ float sh_row[nthreads + 2];\r
\r
- if (x < width) \r
- sh_row[threadIdx.x + 1] = row[x]; \r
- else \r
+ if (x < width)\r
+ sh_row[threadIdx.x + 1] = row[x];\r
+ else\r
sh_row[threadIdx.x + 1] = row[width - 2];\r
\r
if (threadIdx.x == 0)\r
}\r
\r
\r
- void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2Db& img, \r
+ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2Db& img,\r
float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)\r
{\r
const int nthreads = 256;\r
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
if (x < dst.cols && y < dst.rows)\r
- { \r
+ {\r
float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);\r
dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);\r
}\r
}\r
\r
- template<class T, class TEX> \r
+ template<class T, class TEX>\r
static void resize_for_hog(const DevMem2Db& src, DevMem2Db dst, TEX& tex)\r
{\r
tex.filterMode = cudaFilterModeLinear;\r
size_t texOfs = 0;\r
int colOfs = 0;\r
\r
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>(); \r
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );\r
\r
- if (texOfs != 0) \r
+ if (texOfs != 0)\r
{\r
colOfs = static_cast<int>( texOfs/sizeof(T) );\r
cudaSafeCall( cudaUnbindTexture(tex) );\r
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );\r
- } \r
+ }\r
\r
dim3 threads(32, 8);\r
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));\r
- \r
+\r
float sx = static_cast<float>(src.cols) / dst.cols;\r
float sy = static_cast<float>(src.rows) / dst.rows;\r
\r
\r
void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }\r
void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }\r
- } // namespace hog \r
+ } // namespace hog\r
}}} // namespace cv { namespace gpu { namespace device\r
#undef IMPLEMENT_FILTER2D_TEX_READER\r
\r
template <typename T, typename D>\r
- void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, \r
- int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, \r
+ void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,\r
+ int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,\r
int borderMode, const float* borderValue, cudaStream_t stream)\r
{\r
typedef void (*func_t)(const DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);\r
- static const func_t funcs[] = \r
+ static const func_t funcs[] =\r
{\r
Filter2DCaller<T, D, BrdReflect101>::call,\r
Filter2DCaller<T, D, BrdReplicate>::call,\r
#include "safe_call.hpp"\r
#include "opencv2/gpu/device/common.hpp"\r
\r
-namespace cv { namespace gpu \r
+namespace cv { namespace gpu\r
{\r
- enum \r
+ enum\r
{\r
BORDER_REFLECT101_GPU = 0,\r
BORDER_REPLICATE_GPU,\r
BORDER_REFLECT_GPU,\r
BORDER_WRAP_GPU\r
};\r
- \r
+\r
// Converts CPU border extrapolation mode into GPU internal analogue.\r
// Returns true if the GPU analogue exists, false otherwise.\r
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);\r
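// Illustrative usage sketch (not part of the original patch): how a caller might translate a
// public cv:: border flag before indexing the per-border dispatch tables. The helper name and
// the fallback choice are assumptions made for illustration only.
inline int toGpuBorderTypeOrDefault(int cpuBorderType, int defaultGpuBorderType = BORDER_REPLICATE_GPU)
{
    int gpuBorderType;
    return tryConvertToGpuBorderType(cpuBorderType, gpuBorderType) ? gpuBorderType : defaultGpuBorderType;
}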
#include "internal_shared.hpp"\r
#include "opencv2/gpu/device/vec_math.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace match_template \r
+ namespace match_template\r
{\r
__device__ __forceinline__ float sum(float v) { return v; }\r
__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }\r
//////////////////////////////////////////////////////////////////////\r
// Naive_CCORR\r
\r
- template <typename T, int cn> \r
+ template <typename T, int cn>\r
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)\r
{\r
typedef typename TypeVec<T, cn>::vec_type Type;\r
{\r
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
\r
- static const caller_t callers[] = \r
+ static const caller_t callers[] =\r
{\r
0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>\r
};\r
{\r
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
\r
- static const caller_t callers[] = \r
+ static const caller_t callers[] =\r
{\r
0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>\r
};\r
{\r
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
\r
- static const caller_t callers[] = \r
+ static const caller_t callers[] =\r
{\r
0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>\r
};\r
{\r
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
\r
- static const caller_t callers[] = \r
+ static const caller_t callers[] =\r
{\r
0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>\r
};\r
{\r
typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream);\r
\r
- static const caller_t callers[] = \r
+ static const caller_t callers[] =\r
{\r
0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>\r
};\r
DevMem2Df result, int cn, cudaStream_t stream)\r
{\r
typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream);\r
- static const caller_t callers[] = \r
+ static const caller_t callers[] =\r
{\r
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>\r
};\r
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
float ccorr = result.ptr(y)[x];\r
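// CCOEFF correction: subtract (per-channel window sum) * (per-channel template mean) from the
// raw cross-correlation already accumulated in `result`.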
- result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r \r
+ result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r\r
- image_sum_g_ * templ_sum_scale_g;\r
}\r
}\r
\r
void matchTemplatePrepared_CCOFF_8UC2(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, \r
+ int w, int h,\r
+ const DevMem2D_<unsigned int> image_sum_r,\r
const DevMem2D_<unsigned int> image_sum_g,\r
- unsigned int templ_sum_r, unsigned int templ_sum_g, \r
+ unsigned int templ_sum_r, unsigned int templ_sum_g,\r
DevMem2Df result, cudaStream_t stream)\r
{\r
dim3 threads(32, 8);\r
\r
\r
__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(\r
- int w, int h, \r
+ int w, int h,\r
float templ_sum_scale_r,\r
float templ_sum_scale_g,\r
float templ_sum_scale_b,\r
}\r
\r
void matchTemplatePrepared_CCOFF_8UC3(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, \r
+ int w, int h,\r
+ const DevMem2D_<unsigned int> image_sum_r,\r
const DevMem2D_<unsigned int> image_sum_g,\r
const DevMem2D_<unsigned int> image_sum_b,\r
- unsigned int templ_sum_r, \r
- unsigned int templ_sum_g, \r
- unsigned int templ_sum_b, \r
+ unsigned int templ_sum_r,\r
+ unsigned int templ_sum_g,\r
+ unsigned int templ_sum_b,\r
DevMem2Df result, cudaStream_t stream)\r
{\r
dim3 threads(32, 8);\r
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(\r
- w, h, \r
+ w, h,\r
(float)templ_sum_r / (w * h),\r
(float)templ_sum_g / (w * h),\r
(float)templ_sum_b / (w * h),\r
\r
\r
__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(\r
- int w, int h, \r
- float templ_sum_scale_r, \r
+ int w, int h,\r
+ float templ_sum_scale_r,\r
float templ_sum_scale_g,\r
float templ_sum_scale_b,\r
float templ_sum_scale_a,\r
(image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -\r
(image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));\r
float ccorr = result.ptr(y)[x];\r
- result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r \r
+ result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r\r
- image_sum_g_ * templ_sum_scale_g\r
- image_sum_b_ * templ_sum_scale_b\r
- image_sum_a_ * templ_sum_scale_a;\r
}\r
\r
void matchTemplatePrepared_CCOFF_8UC4(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, \r
+ int w, int h,\r
+ const DevMem2D_<unsigned int> image_sum_r,\r
const DevMem2D_<unsigned int> image_sum_g,\r
const DevMem2D_<unsigned int> image_sum_b,\r
const DevMem2D_<unsigned int> image_sum_a,\r
- unsigned int templ_sum_r, \r
- unsigned int templ_sum_g, \r
- unsigned int templ_sum_b, \r
- unsigned int templ_sum_a, \r
+ unsigned int templ_sum_r,\r
+ unsigned int templ_sum_g,\r
+ unsigned int templ_sum_b,\r
+ unsigned int templ_sum_a,\r
DevMem2Df result, cudaStream_t stream)\r
{\r
dim3 threads(32, 8);\r
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(\r
- w, h, \r
- (float)templ_sum_r / (w * h), \r
- (float)templ_sum_g / (w * h), \r
+ w, h,\r
+ (float)templ_sum_r / (w * h),\r
+ (float)templ_sum_g / (w * h),\r
(float)templ_sum_b / (w * h),\r
(float)templ_sum_a / (w * h),\r
image_sum_r, image_sum_g, image_sum_b, image_sum_a,\r
// Prepared_CCOFF_NORMED\r
\r
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(\r
- int w, int h, float weight, \r
+ int w, int h, float weight,\r
float templ_sum_scale, float templ_sqsum_scale,\r
- const PtrStep<unsigned int> image_sum, \r
+ const PtrStep<unsigned int> image_sum,\r
const PtrStep<unsigned long long> image_sqsum,\r
DevMem2Df result)\r
{\r
}\r
\r
void matchTemplatePrepared_CCOFF_NORMED_8U(\r
- int w, int h, const DevMem2D_<unsigned int> image_sum, \r
+ int w, int h, const DevMem2D_<unsigned int> image_sum,\r
const DevMem2D_<unsigned long long> image_sqsum,\r
unsigned int templ_sum, unsigned long long templ_sqsum,\r
DevMem2Df result, cudaStream_t stream)\r
float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;\r
\r
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(\r
- w, h, weight, templ_sum_scale, templ_sqsum_scale, \r
+ w, h, weight, templ_sum_scale, templ_sqsum_scale,\r
image_sum, image_sqsum, result);\r
cudaSafeCall( cudaGetLastError() );\r
\r
\r
\r
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(\r
- int w, int h, float weight, \r
- float templ_sum_scale_r, float templ_sum_scale_g, \r
+ int w, int h, float weight,\r
+ float templ_sum_scale_r, float templ_sum_scale_g,\r
float templ_sqsum_scale,\r
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,\r
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,\r
}\r
\r
void matchTemplatePrepared_CCOFF_NORMED_8UC2(\r
- int w, int h, \r
+ int w, int h,\r
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,\r
float weight = 1.f / (w * h);\r
float templ_sum_scale_r = templ_sum_r * weight;\r
float templ_sum_scale_g = templ_sum_g * weight;\r
- float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r \r
+ float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r\r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g;\r
\r
matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(\r
- w, h, weight, \r
+ w, h, weight,\r
templ_sum_scale_r, templ_sum_scale_g,\r
templ_sqsum_scale,\r
- image_sum_r, image_sqsum_r, \r
- image_sum_g, image_sqsum_g, \r
+ image_sum_r, image_sqsum_r,\r
+ image_sum_g, image_sqsum_g,\r
result);\r
cudaSafeCall( cudaGetLastError() );\r
\r
\r
\r
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(\r
- int w, int h, float weight, \r
- float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, \r
+ int w, int h, float weight,\r
+ float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,\r
float templ_sqsum_scale,\r
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,\r
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,\r
}\r
\r
void matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
- int w, int h, \r
+ int w, int h,\r
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
float templ_sum_scale_r = templ_sum_r * weight;\r
float templ_sum_scale_g = templ_sum_g * weight;\r
float templ_sum_scale_b = templ_sum_b * weight;\r
- float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r \r
+ float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r\r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g\r
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b;\r
\r
matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(\r
- w, h, weight, \r
- templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, \r
- templ_sqsum_scale, \r
- image_sum_r, image_sqsum_r, \r
- image_sum_g, image_sqsum_g, \r
- image_sum_b, image_sqsum_b, \r
+ w, h, weight,\r
+ templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,\r
+ templ_sqsum_scale,\r
+ image_sum_r, image_sqsum_r,\r
+ image_sum_g, image_sqsum_g,\r
+ image_sum_b, image_sqsum_b,\r
result);\r
cudaSafeCall( cudaGetLastError() );\r
\r
\r
\r
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(\r
- int w, int h, float weight, \r
- float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, \r
+ int w, int h, float weight,\r
+ float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,\r
float templ_sum_scale_a, float templ_sqsum_scale,\r
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,\r
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,\r
}\r
\r
void matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
- int w, int h, \r
+ int w, int h,\r
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
+ templ_sqsum_a - weight * templ_sum_a * templ_sum_a;\r
\r
matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(\r
- w, h, weight, \r
- templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a, \r
- templ_sqsum_scale, \r
- image_sum_r, image_sqsum_r, \r
- image_sum_g, image_sqsum_g, \r
- image_sum_b, image_sqsum_b, \r
- image_sum_a, image_sqsum_a, \r
+ w, h, weight,\r
+ templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,\r
+ templ_sqsum_scale,\r
+ image_sum_r, image_sqsum_r,\r
+ image_sum_g, image_sqsum_g,\r
+ image_sum_b, image_sqsum_b,\r
+ image_sum_a, image_sqsum_a,\r
result);\r
cudaSafeCall( cudaGetLastError() );\r
\r
\r
template <int cn>\r
__global__ void normalizeKernel_8U(\r
- int w, int h, const PtrStep<unsigned long long> image_sqsum, \r
+ int w, int h, const PtrStep<unsigned long long> image_sqsum,\r
unsigned long long templ_sqsum, DevMem2Df result)\r
{\r
const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
}\r
}\r
\r
- void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
+ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,\r
unsigned long long templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)\r
{\r
dim3 threads(32, 8);\r
\r
#include "internal_shared.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace mathfunc \r
+ namespace mathfunc\r
{\r
//////////////////////////////////////////////////////////////////////////////////////\r
// Cart <-> Polar\r
}\r
};\r
template <typename Mag, typename Angle>\r
- __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step, \r
+ __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,\r
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)\r
{\r
const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
\r
grid.x = divUp(x.cols, threads.x);\r
grid.y = divUp(x.rows, threads.y);\r
- \r
+\r
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;\r
\r
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(\r
- x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), \r
+ x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),\r
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);\r
cudaSafeCall( cudaGetLastError() );\r
\r
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)\r
{\r
typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);\r
- static const caller_t callers[2][2][2] = \r
+ static const caller_t callers[2][2][2] =\r
{\r
{\r
{\r
\r
grid.x = divUp(mag.cols, threads.x);\r
grid.y = divUp(mag.rows, threads.y);\r
- \r
+\r
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;\r
\r
- polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(), \r
+ polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),\r
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);\r
cudaSafeCall( cudaGetLastError() );\r
\r
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)\r
{\r
typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);\r
- static const caller_t callers[2] = \r
+ static const caller_t callers[2] =\r
{\r
polarToCart_caller<NonEmptyMag>,\r
polarToCart_caller<EmptyMag>\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/vec_math.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace matrix_reductions \r
+ namespace matrix_reductions\r
{\r
// Performs reduction in shared memory\r
template <int size, typename T>\r
{\r
explicit Mask8U(PtrStepb mask): mask(mask) {}\r
\r
- __device__ __forceinline__ bool operator()(int y, int x) const \r
- { \r
- return mask.ptr(y)[x]; \r
+ __device__ __forceinline__ bool operator()(int y, int x) const\r
+ {\r
+ return mask.ptr(y)[x];\r
}\r
\r
PtrStepb mask;\r
};\r
\r
- struct MaskTrue \r
- { \r
- __device__ __forceinline__ bool operator()(int y, int x) const \r
- { \r
- return true; \r
+ struct MaskTrue\r
+ {\r
+ __device__ __forceinline__ bool operator()(int y, int x) const\r
+ {\r
+ return true;\r
}\r
__device__ __forceinline__ MaskTrue(){}\r
__device__ __forceinline__ MaskTrue(const MaskTrue& mask_){}\r
//////////////////////////////////////////////////////////////////////////////\r
// Min max\r
\r
- // To avoid shared bank conflicts we convert each value into value of \r
+ // To avoid shared memory bank conflicts we convert each value into a value of
// an appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits {};\r
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };\r
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };\r
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };\r
\r
- namespace minmax \r
+ namespace minmax\r
{\r
__constant__ int ctwidth;\r
__constant__ int ctheight;\r
{\r
dim3 threads, grid;\r
estimateThreadCfg(cols, rows, threads, grid);\r
- bufcols = grid.x * grid.y * elem_size; \r
+ bufcols = grid.x * grid.y * elem_size;\r
bufrows = 2;\r
}\r
\r
\r
// Estimates the device constants used by the kernels for the specified thread configuration
void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
- { \r
+ {\r
int twidth = divUp(divUp(cols, grid.x), threads.x);\r
int theight = divUp(divUp(rows, grid.y), threads.y);\r
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); \r
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); \r
- } \r
+ cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));\r
+ }\r
\r
\r
// Does min and max in shared memory\r
for (uint x = x0; x < x_end; x += blockDim.x)\r
{\r
T val = src_row[x];\r
- if (mask(y, x)) \r
- { \r
- mymin = ::min(mymin, val); \r
- mymax = ::max(mymax, val); \r
+ if (mask(y, x))\r
+ {\r
+ mymin = ::min(mymin, val);\r
+ mymax = ::max(mymax, val);\r
}\r
}\r
}\r
\r
findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
\r
findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
minval[0] = (T)sminval[0];\r
maxval[0] = (T)smaxval[0];\r
}\r
}\r
#else\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
#endif\r
}\r
\r
- \r
+\r
template <typename T>\r
void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)\r
{\r
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
*minval = minval_;\r
*maxval = maxval_;\r
- } \r
+ }\r
\r
template void minMaxMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
template void minMaxMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
*minval = minval_;\r
*maxval = maxval_;\r
- } \r
+ }\r
\r
template void minMaxCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);\r
template void minMaxCaller<char>(const DevMem2Db, double*, double*, PtrStepb);\r
typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
__shared__ best_type sminval[nthreads];\r
__shared__ best_type smaxval[nthreads];\r
- \r
+\r
uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
uint idx = ::min(tid, size - 1);\r
\r
\r
findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
minval[0] = (T)sminval[0];\r
maxval[0] = (T)smaxval[0];\r
///////////////////////////////////////////////////////////////////////////////\r
// minMaxLoc\r
\r
- namespace minmaxloc \r
+ namespace minmaxloc\r
{\r
__constant__ int ctwidth;\r
__constant__ int ctheight;\r
\r
\r
// Returns required buffer sizes\r
- void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, \r
+ void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,\r
int& b1rows, int& b2cols, int& b2rows)\r
{\r
dim3 threads, grid;\r
\r
// Estimates the device constants used by the kernels for the specified thread configuration
void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
- { \r
+ {\r
int twidth = divUp(divUp(cols, grid.x), threads.x);\r
int theight = divUp(divUp(rows, grid.y), threads.y);\r
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); \r
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); \r
- } \r
+ cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));\r
+ }\r
\r
\r
template <typename T>\r
- __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval, \r
+ __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,\r
volatile uint* minloc, volatile uint* maxloc)\r
{\r
T val = minval[tid + offset];\r
\r
\r
template <int size, typename T>\r
- __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc, \r
+ __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc,\r
volatile uint* maxloc, const uint tid)\r
{\r
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
\r
\r
template <int nthreads, typename T, typename Mask>\r
- __global__ void minMaxLocKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval, \r
+ __global__ void minMaxLocKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval,\r
uint* minloc, uint* maxloc)\r
{\r
typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
T mymin = numeric_limits<T>::max();\r
- T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min(); \r
+ T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();\r
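// -max() rather than min() for signed types: for floating point, numeric_limits::min() is the
// smallest positive value, so -max() is the usable lower bound here.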
uint myminloc = 0;\r
uint mymaxloc = 0;\r
uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);\r
}\r
}\r
\r
- sminval[tid] = mymin; \r
+ sminval[tid] = mymin;\r
smaxval[tid] = mymax;\r
sminloc[tid] = myminloc;\r
smaxloc[tid] = mymaxloc;\r
\r
findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
minval[0] = (T)sminval[0];\r
maxval[0] = (T)smaxval[0];\r
}\r
}\r
#else\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
\r
\r
template <typename T>\r
- void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+ void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,\r
int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
{\r
dim3 threads, grid;\r
uint* minloc_buf = (uint*)locbuf.ptr(0);\r
uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
\r
- minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, \r
+ minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,\r
minloc_buf, maxloc_buf);\r
cudaSafeCall( cudaGetLastError() );\r
\r
\r
\r
template <typename T>\r
- void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval, \r
+ void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,\r
int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
{\r
dim3 threads, grid;\r
uint* minloc_buf = (uint*)locbuf.ptr(0);\r
uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
\r
- minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, \r
+ minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,\r
minloc_buf, maxloc_buf);\r
cudaSafeCall( cudaGetLastError() );\r
\r
\r
findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
minval[0] = (T)sminval[0];\r
maxval[0] = (T)smaxval[0];\r
\r
\r
template <typename T>\r
- void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+ void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,\r
int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
{\r
dim3 threads, grid;\r
uint* minloc_buf = (uint*)locbuf.ptr(0);\r
uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
\r
- minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, \r
+ minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,\r
minloc_buf, maxloc_buf);\r
cudaSafeCall( cudaGetLastError() );\r
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
\r
\r
template <typename T>\r
- void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval, \r
+ void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,\r
int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
{\r
dim3 threads, grid;\r
uint* minloc_buf = (uint*)locbuf.ptr(0);\r
uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
\r
- minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, \r
+ minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,\r
minloc_buf, maxloc_buf);\r
cudaSafeCall( cudaGetLastError() );\r
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
//////////////////////////////////////////////////////////////////////////////////////////////////////////\r
// countNonZero\r
\r
- namespace countnonzero \r
+ namespace countnonzero\r
{\r
__constant__ int ctwidth;\r
__constant__ int ctheight;\r
\r
\r
void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
- { \r
+ {\r
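+ // each thread covers a ctwidth x ctheight tile of elements; the tile size is kept in constant memory for the kernels below\r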
int twidth = divUp(divUp(cols, grid.x), threads.x);\r
int theight = divUp(divUp(rows, grid.y), threads.y);\r
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); \r
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); \r
+ cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));\r
}\r
\r
\r
\r
sumInSmem<nthreads, uint>(scount, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
count[0] = scount[0];\r
blocks_finished = 0;\r
#endif\r
}\r
\r
- \r
+\r
template <typename T>\r
int countNonZeroCaller(const DevMem2Db src, PtrStepb buf)\r
{\r
\r
uint count;\r
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- \r
+\r
return count;\r
- } \r
+ }\r
\r
template int countNonZeroCaller<uchar>(const DevMem2Db, PtrStepb);\r
template int countNonZeroCaller<char>(const DevMem2Db, PtrStepb);\r
\r
sumInSmem<nthreads, uint>(scount, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
count[0] = scount[0];\r
}\r
\r
\r
uint count;\r
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- \r
+\r
return count;\r
- } \r
+ }\r
\r
template int countNonZeroMultipassCaller<uchar>(const DevMem2Db, PtrStepb);\r
template int countNonZeroMultipassCaller<char>(const DevMem2Db, PtrStepb);\r
template <> struct SumType<float> { typedef float R; };\r
template <> struct SumType<double> { typedef double R; };\r
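+ // The functors below select the per-element transform applied before accumulation: IdentityOp for sum, AbsOp for absSum, SqrOp for sqrSum.\r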
\r
- template <typename R> \r
+ template <typename R>\r
struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };\r
\r
- template <typename R> \r
+ template <typename R>\r
struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } };\r
\r
template <>\r
struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };\r
\r
- template <typename R> \r
+ template <typename R>\r
struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } };\r
\r
__constant__ int ctwidth;\r
void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)\r
{\r
threads = dim3(threads_x, threads_y);\r
- grid = dim3(divUp(cols, threads.x * threads.y), \r
+ grid = dim3(divUp(cols, threads.x * threads.y),\r
divUp(rows, threads.y * threads.x));\r
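+ // cap the grid at threads.x * threads.y blocks; each thread then loops over a tile of elements\r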
grid.x = std::min(grid.x, threads.x);\r
grid.y = std::min(grid.y, threads.y);\r
\r
\r
void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
- { \r
+ {\r
int twidth = divUp(divUp(cols, grid.x), threads.x);\r
int theight = divUp(divUp(rows, grid.y), threads.y);\r
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); \r
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); \r
+ cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));\r
}\r
\r
template <typename T, typename R, typename Op, int nthreads>\r
\r
sumInSmem<nthreads, R>(smem, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
result[0] = smem[0];\r
blocks_finished = 0;\r
\r
sumInSmem<nthreads, R>(smem, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
result[0] = smem[0];\r
}\r
\r
sumInSmem<nthreads, R>(smem, tid);\r
sumInSmem<nthreads, R>(smem + nthreads, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
res.x = smem[0];\r
res.y = smem[nthreads];\r
}\r
}\r
#else\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
DstType res;\r
res.x = smem[0];\r
sumInSmem<nthreads, R>(smem, tid);\r
sumInSmem<nthreads, R>(smem + nthreads, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
res.x = smem[0];\r
res.y = smem[nthreads];\r
sumInSmem<nthreads, R>(smem + nthreads, tid);\r
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
res.x = smem[0];\r
res.y = smem[nthreads];\r
}\r
}\r
#else\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
DstType res;\r
res.x = smem[0];\r
sumInSmem<nthreads, R>(smem + nthreads, tid);\r
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
res.x = smem[0];\r
res.y = smem[nthreads];\r
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
{\r
val = ptr[x0 + x * blockDim.x];\r
- sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), \r
+ sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),\r
Op::call(val.z), Op::call(val.w));\r
}\r
}\r
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
res.x = smem[0];\r
res.y = smem[nthreads];\r
}\r
}\r
#else\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
DstType res;\r
res.x = smem[0];\r
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);\r
\r
- if (tid == 0) \r
+ if (tid == 0)\r
{\r
res.x = smem[0];\r
res.y = smem[nthreads];\r
sum[1] = result[1];\r
sum[2] = result[2];\r
sum[3] = result[3];\r
- } \r
+ }\r
\r
template void sumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
template void sumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
sum[1] = result[1];\r
sum[2] = result[2];\r
sum[3] = result[3];\r
- } \r
+ }\r
\r
template void sumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
template void sumCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
sum[1] = result[1];\r
sum[2] = result[2];\r
sum[3] = result[3];\r
- } \r
+ }\r
\r
template void absSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
template void absSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
sum[1] = result[1];\r
sum[2] = result[2];\r
sum[3] = result[3];\r
- } \r
+ }\r
\r
template void sqrSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
template void sqrSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
{\r
for (int y = threadIdx.y; y < src.rows; y += 16)\r
myVal = op(myVal, src.ptr(y)[x]);\r
- } \r
+ }\r
\r
smem[threadIdx.x * 16 + threadIdx.y] = myVal;\r
__syncthreads();\r
{\r
typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
\r
- static const caller_t callers[] = \r
+ static const caller_t callers[] =\r
{\r
- reduceRows_caller<SumReductor, T, S, D>, \r
- reduceRows_caller<AvgReductor, T, S, D>, \r
- reduceRows_caller<MaxReductor, T, S, D>, \r
+ reduceRows_caller<SumReductor, T, S, D>,\r
+ reduceRows_caller<AvgReductor, T, S, D>,\r
+ reduceRows_caller<MaxReductor, T, S, D>,\r
reduceRows_caller<MinReductor, T, S, D>\r
};\r
\r
\r
template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
- template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
- template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
- template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
{\r
typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
\r
- static const caller_t callers[4][4] = \r
+ static const caller_t callers[4][4] =\r
{\r
{reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},\r
{reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},\r
template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
- template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
- template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
- template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
- template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
- template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
- template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
#include "opencv2/gpu/device/common.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
namespace optical_flow\r
{\r
#define NUM_VERTS_PER_ARROW 6\r
\r
__global__ void NeedleMapAverageKernel(const DevMem2Df u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)\r
- { \r
+ {\r
__shared__ float smem[2 * NEEDLE_MAP_SCALE];\r
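+ // first NEEDLE_MAP_SCALE floats hold the per-column sums of u, the second half the per-column sums of v\r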
\r
volatile float* u_col_sum = smem;\r
}\r
\r
if (threadIdx.x < 8)\r
- { \r
+ {\r
// now add the column sums\r
const uint X = threadIdx.x;\r
\r
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];\r
}\r
\r
- if (X | 0xfe == 0xfc) // bits 0 & 1 == 0 \r

- { \r
+ if ((X & 3) == 0) // bits 0 & 1 == 0\r
+ {\r
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];\r
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];\r
}\r
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];\r
}\r
}\r
- \r
+\r
void NeedleMapAverage_gpu(DevMem2Df u, DevMem2Df v, DevMem2Df u_avg, DevMem2Df v_avg)\r
{\r
const dim3 block(NEEDLE_MAP_SCALE);\r
//\r
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong\r
//\r
-// The original code was written by Paul Furgale and Chi Hay Tong \r
+// The original code was written by Paul Furgale and Chi Hay Tong\r
// and later optimized and prepared for integration into OpenCV by Itseez.\r
//\r
//M*/\r
#include "opencv2/gpu/device/utility.hpp"\r
#include "opencv2/gpu/device/functional.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
namespace orb\r
{\r
// cull\r
\r
int cull_gpu(int* loc, float* response, int size, int n_points)\r
- { \r
+ {\r
thrust::device_ptr<int> loc_ptr(loc);\r
thrust::device_ptr<float> response_ptr(response);\r
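+ // wrap the raw buffers for Thrust so the keypoints can be ordered by response and culled to the best n_points\r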
\r
{\r
const short2 loc = loc_[ptidx];\r
\r
- const int r = blockSize / 2; \r
+ const int r = blockSize / 2;\r
const int x0 = loc.x - r;\r
const int y0 = loc.y - r;\r
- \r
+\r
int a = 0, b = 0, c = 0;\r
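+ // a, b, c accumulate products of the Sobel responses Ix and Iy over the blockSize x blockSize patch\r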
\r
for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)\r
const int i = ind / blockSize;\r
const int j = ind % blockSize;\r
\r
- int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 + \r
- (img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) + \r
+ int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +\r
+ (img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +\r
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));\r
\r
- int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 + \r
- (img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) + \r
+ int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +\r
+ (img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +\r
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));\r
\r
a += Ix * Ix;\r
int m_01 = 0, m_10 = 0;\r
\r
const short2 loc = loc_[ptidx];\r
- \r
+\r
// Treat the center line differently, v=0\r
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)\r
m_10 += u * image(loc.y, loc.x + u);\r
int v_sum = 0;\r
int m_sum = 0;\r
const int d = c_u_max[v];\r
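+ // c_u_max[v] gives the horizontal extent of the circular patch at vertical offset v\r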
- \r
+\r
for (int u = threadIdx.x - d; u <= d; u += blockDim.x)\r
{\r
int val_plus = image(loc.y + v, loc.x + u);\r
{\r
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)\r
{\r
- pattern_x += 16 * i; \r
+ pattern_x += 16 * i;\r
pattern_y += 16 * i;\r
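+ // compare 8 point pairs sampled from the rotated pattern; each comparison sets one bit of the descriptor byte\r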
\r
int t0, t1, val;\r
\r
t0 = GET_VALUE(14); t1 = GET_VALUE(15);\r
val |= (t0 < t1) << 7;\r
- \r
+\r
return val;\r
}\r
};\r
{\r
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)\r
{\r
- pattern_x += 12 * i; \r
+ pattern_x += 12 * i;\r
pattern_y += 12 * i;\r
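+ // each triple of pattern points yields a 2-bit code (index of the brightest of the three); four triples fill the byte\r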
- \r
+\r
int t0, t1, t2, val;\r
\r
t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);\r
val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);\r
- \r
+\r
t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);\r
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;\r
- \r
+\r
t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);\r
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;\r
- \r
+\r
t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);\r
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;\r
- \r
+\r
return val;\r
}\r
};\r
{\r
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)\r
{\r
- pattern_x += 16 * i; \r
+ pattern_x += 16 * i;\r
pattern_y += 16 * i;\r
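+ // each group of four pattern points yields a 2-bit code (index of the brightest of the four); four groups fill the byte\r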
- \r
+\r
int t0, t1, t2, t3, k, val;\r
int a, b;\r
\r
if( t3 > t2 ) t2 = t3, b = 3;\r
k = t0 > t2 ? a : b;\r
val = k;\r
- \r
+\r
t0 = GET_VALUE(4); t1 = GET_VALUE(5);\r
t2 = GET_VALUE(6); t3 = GET_VALUE(7);\r
a = 0, b = 2;\r
if( t3 > t2 ) t2 = t3, b = 3;\r
k = t0 > t2 ? a : b;\r
val |= k << 2;\r
- \r
+\r
t0 = GET_VALUE(8); t1 = GET_VALUE(9);\r
t2 = GET_VALUE(10); t3 = GET_VALUE(11);\r
a = 0, b = 2;\r
if( t3 > t2 ) t2 = t3, b = 3;\r
k = t0 > t2 ? a : b;\r
val |= k << 4;\r
- \r
+\r
t0 = GET_VALUE(12); t1 = GET_VALUE(13);\r
t2 = GET_VALUE(14); t3 = GET_VALUE(15);\r
a = 0, b = 2;\r
if( t3 > t2 ) t2 = t3, b = 3;\r
k = t0 > t2 ? a : b;\r
val |= k << 6;\r
- \r
+\r
return val;\r
}\r
};\r
y[ptidx] = loc.y * scale;\r
}\r
}\r
- \r
+\r
void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)\r
{\r
dim3 block(256);\r
{\r
static void call(DevMem2D_<T> src, DevMem2Df mapx, DevMem2Df mapy, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int)\r
{\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;\r
\r
dim3 block(32, 8);\r
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
cudaSafeCall( cudaDeviceSynchronize() ); \\r
} \\r
};\r
- \r
+\r
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)\r
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)\r
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)\r
\r
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher\r
{\r
- static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy, \r
+ static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy,\r
DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc)\r
{\r
if (stream == 0)\r
}\r
};\r
\r
- template <typename T> void remap_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, \r
+ template <typename T> void remap_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap,\r
DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)\r
{\r
- typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, \r
+ typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap,\r
DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc);\r
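+ // 3 x 5 dispatch table indexed by [interpolation][borderMode]\r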
\r
- static const caller_t callers[3][5] = \r
+ static const caller_t callers[3][5] =\r
{\r
{\r
RemapDispatcher<PointFilter, BrdReflect101, T>::call,\r
}\r
};\r
\r
- callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff, xmap, ymap, \r
+ callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff, xmap, ymap,\r
static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);\r
}\r
\r
}\r
};\r
\r
- template <typename T> void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, \r
+ template <typename T> void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,\r
DevMem2Db dst, int interpolation, cudaStream_t stream)\r
{\r
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream);\r
if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))\r
interpolation = 1;\r
\r
- callers[interpolation](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff, fx, fy, \r
+ callers[interpolation](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff, fx, fy,\r
static_cast< DevMem2D_<T> >(dst), stream);\r
}\r
\r
#include "opencv2/gpu/device/common.hpp"\r
#include "opencv2/gpu/device/vec_traits.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
namespace video_encoding\r
{\r
void YV12_gpu(const DevMem2Db src, int cn, DevMem2Db dst)\r
{\r
typedef void (*func_t)(const DevMem2Db src, PtrStepb dst);\r
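+ // dispatch on the channel count: index 1 = Gray_to_YV12, 3 and 4 = BGR_to_YV12; entries 0 and 2 are unused\r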
- \r
- static const func_t funcs[] = \r
+\r
+ static const func_t funcs[] =\r
{\r
0, Gray_to_YV12_caller, 0, BGR_to_YV12_caller<3>, BGR_to_YV12_caller<4>\r
};\r
- \r
+\r
funcs[cn](src, dst);\r
}\r
}\r
#include "opencv2/gpu/device/border_interpolate.hpp"\r
#include "opencv2/gpu/device/static_check.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace row_filter \r
+ namespace row_filter\r
{\r
#define MAX_KERNEL_SIZE 32\r
\r
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
\r
__shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];\r
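+ // shared tile: each of the BLOCK_DIM_Y rows holds PATCH_PER_BLOCK + 2 * HALO_SIZE groups of BLOCK_DIM_X pixels (output plus left/right halo)\r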
- \r
+\r
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
\r
if (y >= src.rows)\r
{\r
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);\r
\r
- static const caller_t callers[5][33] = \r
+ static const caller_t callers[5][33] =\r
{\r
{\r
0,\r
linearRowFilter_caller<30, T, D, BrdRowWrap>,\r
linearRowFilter_caller<31, T, D, BrdRowWrap>,\r
linearRowFilter_caller<32, T, D, BrdRowWrap>\r
- } \r
+ }\r
};\r
- \r
+\r
loadKernel(kernel, ksize);\r
\r
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);\r
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)\r
#endif\r
\r
-namespace cv { namespace gpu \r
+namespace cv { namespace gpu\r
{\r
void nppError(int err, const char *file, const int line, const char *func = "");\r
void ncvError(int err, const char *file, const int line, const char *func = "");\r
\r
#include "internal_shared.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace split_merge \r
+ namespace split_merge\r
{\r
template <typename T, size_t elem_size = sizeof(T)>\r
- struct TypeTraits \r
+ struct TypeTraits\r
{\r
typedef T type;\r
typedef T type2;\r
};\r
\r
template <typename T>\r
- struct TypeTraits<T, 4> \r
+ struct TypeTraits<T, 4>\r
{\r
typedef int type;\r
typedef int2 type2;\r
};\r
\r
template <typename T>\r
- struct TypeTraits<T, 8> \r
+ struct TypeTraits<T, 8>\r
{\r
typedef double type;\r
typedef double2 type2;\r
typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);\r
\r
//------------------------------------------------------------\r
- // Merge \r
+ // Merge\r
\r
template <typename T>\r
- __global__ void mergeC2_(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
+ __global__ void mergeC2_(const uchar* src0, size_t src0_step,\r
+ const uchar* src1, size_t src1_step,\r
int rows, int cols, uchar* dst, size_t dst_step)\r
{\r
typedef typename TypeTraits<T>::type2 dst_type;\r
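+ // pack one pixel from each source plane into a two-element vector so the interleaved result is written with a single store\r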
const T* src1_y = (const T*)(src1 + y * src1_step);\r
dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
\r
- if (x < cols && y < rows) \r
- { \r
+ if (x < cols && y < rows)\r
+ {\r
dst_type dst_elem;\r
dst_elem.x = src0_y[x];\r
dst_elem.y = src1_y[x];\r
\r
\r
template <typename T>\r
- __global__ void mergeC3_(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
- const uchar* src2, size_t src2_step, \r
+ __global__ void mergeC3_(const uchar* src0, size_t src0_step,\r
+ const uchar* src1, size_t src1_step,\r
+ const uchar* src2, size_t src2_step,\r
int rows, int cols, uchar* dst, size_t dst_step)\r
{\r
typedef typename TypeTraits<T>::type3 dst_type;\r
const T* src2_y = (const T*)(src2 + y * src2_step);\r
dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
\r
- if (x < cols && y < rows) \r
- { \r
+ if (x < cols && y < rows)\r
+ {\r
dst_type dst_elem;\r
dst_elem.x = src0_y[x];\r
dst_elem.y = src1_y[x];\r
\r
\r
template <>\r
- __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
- const uchar* src2, size_t src2_step, \r
+ __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,\r
+ const uchar* src1, size_t src1_step,\r
+ const uchar* src2, size_t src2_step,\r
int rows, int cols, uchar* dst, size_t dst_step)\r
{\r
const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
const double* src2_y = (const double*)(src2 + y * src2_step);\r
double* dst_y = (double*)(dst + y * dst_step);\r
\r
- if (x < cols && y < rows) \r
- { \r
+ if (x < cols && y < rows)\r
+ {\r
dst_y[3 * x] = src0_y[x];\r
dst_y[3 * x + 1] = src1_y[x];\r
dst_y[3 * x + 2] = src2_y[x];\r
\r
\r
template <typename T>\r
- __global__ void mergeC4_(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
- const uchar* src2, size_t src2_step, \r
- const uchar* src3, size_t src3_step, \r
+ __global__ void mergeC4_(const uchar* src0, size_t src0_step,\r
+ const uchar* src1, size_t src1_step,\r
+ const uchar* src2, size_t src2_step,\r
+ const uchar* src3, size_t src3_step,\r
int rows, int cols, uchar* dst, size_t dst_step)\r
{\r
typedef typename TypeTraits<T>::type4 dst_type;\r
const T* src3_y = (const T*)(src3 + y * src3_step);\r
dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
\r
- if (x < cols && y < rows) \r
- { \r
+ if (x < cols && y < rows)\r
+ {\r
dst_type dst_elem;\r
dst_elem.x = src0_y[x];\r
dst_elem.y = src1_y[x];\r
\r
\r
template <>\r
- __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
- const uchar* src2, size_t src2_step, \r
- const uchar* src3, size_t src3_step, \r
+ __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,\r
+ const uchar* src1, size_t src1_step,\r
+ const uchar* src2, size_t src2_step,\r
+ const uchar* src3, size_t src3_step,\r
int rows, int cols, uchar* dst, size_t dst_step)\r
{\r
const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
const double* src3_y = (const double*)(src3 + y * src3_step);\r
double2* dst_y = (double2*)(dst + y * dst_step);\r
\r
- if (x < cols && y < rows) \r
- { \r
+ if (x < cols && y < rows)\r
+ {\r
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);\r
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);\r
}\r
\r
\r
template <typename T>\r
- __global__ void splitC2_(const uchar* src, size_t src_step, \r
+ __global__ void splitC2_(const uchar* src, size_t src_step,\r
int rows, int cols,\r
uchar* dst0, size_t dst0_step,\r
uchar* dst1, size_t dst1_step)\r
T* dst0_y = (T*)(dst0 + y * dst0_step);\r
T* dst1_y = (T*)(dst1 + y * dst1_step);\r
\r
- if (x < cols && y < rows) \r
+ if (x < cols && y < rows)\r
{\r
src_type src_elem = src_y[x];\r
dst0_y[x] = src_elem.x;\r
\r
\r
template <typename T>\r
- __global__ void splitC3_(const uchar* src, size_t src_step, \r
+ __global__ void splitC3_(const uchar* src, size_t src_step,\r
int rows, int cols,\r
uchar* dst0, size_t dst0_step,\r
uchar* dst1, size_t dst1_step,\r
T* dst1_y = (T*)(dst1 + y * dst1_step);\r
T* dst2_y = (T*)(dst2 + y * dst2_step);\r
\r
- if (x < cols && y < rows) \r
+ if (x < cols && y < rows)\r
{\r
src_type src_elem = src_y[x];\r
dst0_y[x] = src_elem.x;\r
double* dst1_y = (double*)(dst1 + y * dst1_step);\r
double* dst2_y = (double*)(dst2 + y * dst2_step);\r
\r
- if (x < cols && y < rows) \r
+ if (x < cols && y < rows)\r
{\r
dst0_y[x] = src_y[3 * x];\r
dst1_y[x] = src_y[3 * x + 1];\r
T* dst2_y = (T*)(dst2 + y * dst2_step);\r
T* dst3_y = (T*)(dst3 + y * dst3_step);\r
\r
- if (x < cols && y < rows) \r
+ if (x < cols && y < rows)\r
{\r
src_type src_elem = src_y[x];\r
dst0_y[x] = src_elem.x;\r
double* dst2_y = (double*)(dst2 + y * dst2_step);\r
double* dst3_y = (double*)(dst3 + y * dst3_step);\r
\r
- if (x < cols && y < rows) \r
+ if (x < cols && y < rows)\r
{\r
double2 src_elem1 = src_y[2 * x];\r
double2 src_elem2 = src_y[2 * x + 1];\r
\r
#include "internal_shared.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace stereobm \r
+ namespace stereobm\r
{\r
//////////////////////////////////////////////////////////////////////////////////////////////////\r
/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////\r
\r
template<int RADIUS>\r
__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)\r
- { \r
+ {\r
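+ // sum the per-column SSD values over the correlation window to obtain the window SSD for this thread\r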
unsigned int cache = 0;\r
unsigned int cache2 = 0;\r
\r
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);\r
cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0) \r
- cudaSafeCall( cudaDeviceSynchronize() ); \r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
cudaSafeCall( cudaUnbindTexture (texForSobel ) );\r
}\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/limits.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace stereobp \r
+ namespace stereobp\r
{\r
///////////////////////////////////////////////////////////////\r
/////////////////////// load constants ////////////////////////\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/limits.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace stereocsbp \r
+ namespace stereocsbp\r
{\r
///////////////////////////////////////////////////////////////\r
/////////////////////// load constants ////////////////////////\r
__constant__ int cth;\r
\r
__constant__ size_t cimg_step;\r
- __constant__ size_t cmsg_step; \r
+ __constant__ size_t cmsg_step;\r
__constant__ size_t cdisp_step1;\r
__constant__ size_t cdisp_step2;\r
\r
get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
else\r
get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
- \r
+\r
cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );\r
- \r
+\r
callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);\r
cudaSafeCall( cudaGetLastError() );\r
\r
\r
template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step,\r
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
- \r
+\r
\r
///////////////////////////////////////////////////////////////\r
//////////////////////// init message /////////////////////////\r
///////////////////////////////////////////////////////////////\r
\r
- \r
+\r
template <typename T>\r
__device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,\r
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );\r
- \r
+\r
dim3 threads(32, 8, 1);\r
dim3 grid(1, 1, 1);\r
\r
const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,\r
float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,\r
float* data_cost_selected, const float* data_cost, size_t msg_step,\r
- int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream); \r
+ int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);\r
\r
///////////////////////////////////////////////////////////////\r
//////////////////// calc all iterations /////////////////////\r
for(int t = 0; t < iters; ++t)\r
{\r
compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);\r
- cudaSafeCall( cudaGetLastError() ); \r
+ cudaSafeCall( cudaGetLastError() );\r
}\r
if (stream == 0)\r
cudaSafeCall( cudaDeviceSynchronize() );\r
template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,\r
int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
\r
- template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step, \r
+ template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,\r
int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
\r
\r
cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
\r
- template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step, \r
+ template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,\r
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
\r
template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,\r
{\r
dim3 block(32, 8);\r
dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));\r
- \r
+\r
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);\r
cudaSafeCall( cudaGetLastError() );\r
\r
{\r
static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, int)\r
{\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;\r
\r
dim3 block(32, 8);\r
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
#undef OPENCV_GPU_IMPLEMENT_WARP_TEX\r
\r
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher\r
- { \r
+ {\r
static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc)\r
{\r
if (stream == 0)\r
}\r
};\r
\r
- template <class Transform, typename T> \r
+ template <class Transform, typename T>\r
void warp_caller(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Db dst, int interpolation,\r
int borderMode, const float* borderValue, cudaStream_t stream, int cc)\r
{\r