if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
message(STATUS "WITH_CUDA flag requires CMake 2.8.3. CUDA support is disabled.")
- return()
+ return()
endif()
-
+
find_package(CUDA 4.1)
if(CUDA_FOUND)
else()
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
endif()
-
+
set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
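# -Xcompiler forwards the flag below to the host compiler invoked by nvcc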
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
endif()
- # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
- set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
+ # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
+ set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
string(REPLACE "-ggdb3" "" CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
CUDA_COMPILE(${VAR} ${ARGN})
set(CMAKE_CXX_DEBUG_FLAGS ${CMAKE_CXX_FLAGS_DEBUG_})
Interpolation(cv::INTER_CUBIC), Interpolation(cv::INTER_AREA)),\r
testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));\r
\r
+GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)\r
+{\r
+ cv::gpu::DeviceInfo devInfo = GET_PARAM(0);\r
+ cv::gpu::setDevice(devInfo.deviceID());\r
+\r
+ cv::Size size = GET_PARAM(1);\r
+ int type = GET_PARAM(2);\r
+ int interpolation = cv::INTER_AREA;\r
+ double f = GET_PARAM(3);\r
+\r
+ cv::Mat src_host(size, type);\r
+ fill(src_host, 0, 255);\r
+\r
+ cv::gpu::GpuMat src(src_host);\r
+ cv::gpu::GpuMat dst;\r
+\r
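+    // untimed warm-up call: allocates dst and loads the kernel before the measured TEST_CYCLE loop\r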
+ cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);\r
+\r
+ declare.time(1.0);\r
+\r
+ TEST_CYCLE()\r
+ {\r
+ cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);\r
+ }\r
+}\r
+\r
+INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(\r
+ ALL_DEVICES,\r
+ testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),\r
+ testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),\r
+ MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),\r
+ MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),\r
+ testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));\r
+\r
//////////////////////////////////////////////////////////////////////\r
// WarpAffine\r
\r
\r
struct Mask8U\r
{\r
- explicit Mask8U(PtrStepb mask): mask(mask) {}\r
+ explicit Mask8U(PtrStepb mask_): mask(mask_) {}\r
\r
__device__ __forceinline__ bool operator()(int y, int x) const\r
{\r
#include "opencv2/gpu/device/vec_math.hpp"\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/filters.hpp"\r
-# include <cfloat>\r
+#include <cfloat>\r
+#include <opencv2/gpu/device/scan.hpp>\r
\r
namespace cv { namespace gpu { namespace device\r
{\r
template <typename T>\r
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
- mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+ mergeC2_<T><<<grid, block, 0, stream>>>(\r
src[0].data, src[0].step,\r
src[1].data, src[1].step,\r
dst.rows, dst.cols, dst.data, dst.step);\r
template <typename T>\r
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
- mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+ mergeC3_<T><<<grid, block, 0, stream>>>(\r
src[0].data, src[0].step,\r
src[1].data, src[1].step,\r
src[2].data, src[2].step,\r
template <typename T>\r
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
- mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+ mergeC4_<T><<<grid, block, 0, stream>>>(\r
src[0].data, src[0].step,\r
src[1].data, src[1].step,\r
src[2].data, src[2].step,\r
template <typename T>\r
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
- splitC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
+ splitC2_<T><<<grid, block, 0, stream>>>(\r
src.data, src.step, src.rows, src.cols,\r
dst[0].data, dst[0].step,\r
dst[1].data, dst[1].step);\r
template <typename T>\r
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
- splitC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
+ splitC3_<T><<<grid, block, 0, stream>>>(\r
src.data, src.step, src.rows, src.cols,\r
dst[0].data, dst[0].step,\r
dst[1].data, dst[1].step,\r
template <typename T>\r
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
- splitC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
+ splitC4_<T><<<grid, block, 0, stream>>>(\r
src.data, src.step, src.rows, src.cols,\r
dst[0].data, dst[0].step,\r
dst[1].data, dst[1].step,\r
/*M///////////////////////////////////////////////////////////////////////////////////////\r
//\r
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
-// \r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
// By downloading, copying, installing or using the software you agree to this license.\r
// If you do not agree to this license, do not download, install,\r
// copy or use the software.\r
__global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)\r
{\r
const int pos = blockIdx.x*blockDim.x + threadIdx.x;\r
- \r
+\r
if(pos >= len) return;\r
- \r
+\r
d_res[pos] = d_op1[pos] + d_op2[pos];\r
}\r
\r
///////////////////////////////////////////////////////////////////////////////\r
template<int tex_id>\r
__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)\r
-{ \r
+{\r
//position within shared memory array\r
const int ijs = js * PSOR_PITCH + is;\r
//mirror reflection across borders\r
///\param h number of rows in global memory array\r
///\param p global memory array pitch in floats\r
///////////////////////////////////////////////////////////////////////////////\r
-template<int tex> \r
+template<int tex>\r
__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)\r
{\r
const int i = threadIdx.x + 2;\r
/// \param gamma (in) gamma in Brox model (edge importance)\r
///////////////////////////////////////////////////////////////////////////////\r
\r
-__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, \r
+__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,\r
float *denominator_u, float *denominator_v,\r
float *numerator_dudv,\r
float *numerator_u, float *numerator_v,\r
// Red-Black SOR\r
/////////////////////////////////////////////////////////////////////////////////////////\r
\r
-template<int isBlack> __global__ void sor_pass(float *new_du, \r
- float *new_dv, \r
- const float *g_inv_denominator_u, \r
+template<int isBlack> __global__ void sor_pass(float *new_du,\r
+ float *new_dv,\r
+ const float *g_inv_denominator_u,\r
const float *g_inv_denominator_v,\r
- const float *g_numerator_u, \r
- const float *g_numerator_v, \r
- const float *g_numerator_dudv, \r
- float omega, \r
- int width, \r
- int height, \r
+ const float *g_numerator_u,\r
+ const float *g_numerator_v,\r
+ const float *g_numerator_dudv,\r
+ float omega,\r
+ int width,\r
+ int height,\r
int stride)\r
{\r
int i = blockIdx.x * blockDim.x + threadIdx.x;\r
if((i+j)%2 == isBlack)\r
{\r
// update du\r
- float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) - \r
+ float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -\r
u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);\r
\r
du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;\r
initTexture2D(tex_I1);\r
initTexture2D(tex_fine); // for downsampling\r
initTexture2D(tex_coarse); // for prolongation\r
- \r
+\r
initTexture2D(tex_Ix);\r
initTexture2D(tex_Ixx);\r
initTexture2D(tex_Ix0);\r
const Ncv32u kSourceHeight = frame0.height();\r
\r
ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);\r
- ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth && \r
+ ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&\r
uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);\r
\r
ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);\r
SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned);\r
\r
// temporary storage\r
- SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator, \r
+ SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,\r
alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));\r
\r
// image derivatives\r
{\r
const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};\r
\r
- ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, \r
+ ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,\r
cudaMemcpyHostToDevice), NCV_CUDA_ERROR);\r
\r
InitTextures();\r
size_t src_width_in_bytes = kSourceWidth * sizeof(float);\r
size_t src_pitch_in_bytes = frame0.pitch();\r
\r
- ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(), \r
+ ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(),\r
src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );\r
\r
- ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(), \r
+ ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(),\r
src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );\r
}\r
\r
NcvRect32u dstROI (0, 0, level_width, level_height);\r
\r
// frame 0\r
- ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI, \r
+ ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,\r
level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );\r
\r
// frame 1\r
- ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI, \r
+ ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,\r
level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );\r
}\r
\r
dim3 dThreads(32, 6);\r
\r
const int kPitchTex = kLevelStride * sizeof(float);\r
- \r
+\r
NcvSize32u srcSize(kLevelWidth, kLevelHeight);\r
Ncv32u nSrcStep = kLevelStride * sizeof(float);\r
NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);\r
\r
// Ix0\r
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,\r
- nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); \r
+ nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );\r
\r
// Iy0\r
ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,\r
\r
// Ixy\r
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,\r
- nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); \r
- \r
+ nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );\r
+\r
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);\r
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);\r
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);\r
{\r
//compute coefficients\r
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>\r
- (diffusivity_x.ptr(), \r
- diffusivity_y.ptr(), \r
- denom_u.ptr(), \r
- denom_v.ptr(), \r
- num_dudv.ptr(), \r
- num_u.ptr(), \r
- num_v.ptr(), \r
- kLevelWidth, \r
- kLevelHeight, \r
- kLevelStride, \r
- alpha, \r
+ (diffusivity_x.ptr(),\r
+ diffusivity_y.ptr(),\r
+ denom_u.ptr(),\r
+ denom_v.ptr(),\r
+ num_dudv.ptr(),\r
+ num_u.ptr(),\r
+ num_v.ptr(),\r
+ kLevelWidth,\r
+ kLevelHeight,\r
+ kLevelStride,\r
+ alpha,\r
gamma);\r
\r
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);\r
- \r
+\r
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);\r
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);\r
\r
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);\r
\r
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);\r
- \r
+\r
// linear system coefficients\r
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);\r
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);\r
\r
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);\r
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);\r
- \r
+\r
//solve linear system\r
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)\r
{\r
float omega = 1.99f;\r
- \r
+\r
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);\r
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);\r
\r
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>\r
- (du_new.ptr(), \r
- dv_new.ptr(), \r
- denom_u.ptr(), \r
+ (du_new.ptr(),\r
+ dv_new.ptr(),\r
+ denom_u.ptr(),\r
denom_v.ptr(),\r
- num_u.ptr(), \r
- num_v.ptr(), \r
- num_dudv.ptr(), \r
- omega, \r
- kLevelWidth, \r
- kLevelHeight, \r
+ num_u.ptr(),\r
+ num_v.ptr(),\r
+ num_dudv.ptr(),\r
+ omega,\r
+ kLevelWidth,\r
+ kLevelHeight,\r
kLevelStride);\r
\r
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);\r
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);\r
\r
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>\r
- (du.ptr(), \r
- dv.ptr(), \r
- denom_u.ptr(), \r
+ (du.ptr(),\r
+ dv.ptr(),\r
+ denom_u.ptr(),\r
denom_v.ptr(),\r
- num_u.ptr(), \r
- num_v.ptr(), \r
- num_dudv.ptr(), \r
- omega, \r
- kLevelWidth, \r
- kLevelHeight, \r
+ num_u.ptr(),\r
+ num_v.ptr(),\r
+ num_dudv.ptr(),\r
+ omega,\r
+ kLevelWidth,\r
+ kLevelHeight,\r
kLevelStride);\r
\r
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);\r
\r
dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));\r
dim3 p_threads(32, 8);\r
- \r
- NcvSize32u srcSize (kLevelWidth, kLevelHeight);\r
+\r
+ NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight);\r
NcvSize32u dstSize (nw, nh);\r
NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);\r
NcvRect32u dstROI (0, 0, nw, nh);\r
\r
- ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI, \r
+ ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,\r
ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );\r
\r
ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);\r
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);\r
\r
- ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI, \r
+ ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,\r
ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );\r
\r
ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);\r
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);\r
\r
ncvAssertCUDAReturn( cudaMemcpy2DAsync\r
- (uOut.ptr(), uOut.pitch(), ptrU->ptr(), \r
+ (uOut.ptr(), uOut.pitch(), ptrU->ptr(),\r
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );\r
\r
ncvAssertCUDAReturn( cudaMemcpy2DAsync\r
- (vOut.ptr(), vOut.pitch(), ptrV->ptr(), \r
+ (vOut.ptr(), vOut.pitch(), ptrV->ptr(),\r
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );\r
\r
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);\r
//===================================================================\r
\r
\r
-NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)\r
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_)\r
:\r
currentSize(0),\r
_maxSize(0),\r
begin(NULL),\r
end(NULL),\r
_memType(NCVMemoryTypeNone),\r
- _alignment(alignment),\r
+ _alignment(alignment_),\r
bReusesMemory(false)\r
{\r
- NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;\r
+ NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;\r
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");\r
}\r
\r
\r
-NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)\r
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr)\r
:\r
currentSize(0),\r
_maxSize(0),\r
allocBegin(NULL),\r
_memType(memT),\r
- _alignment(alignment)\r
+ _alignment(alignment_)\r
{\r
- NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;\r
+ NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;\r
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");\r
ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");\r
\r
//===================================================================\r
\r
\r
-NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)\r
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_)\r
:\r
currentSize(0),\r
_maxSize(0),\r
_memType(memT),\r
- _alignment(alignment)\r
+ _alignment(alignment_)\r
{\r
ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );\r
}\r
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)\r
#endif\r
\r
-namespace cv { namespace gpu \r
+namespace cv { namespace gpu\r
{\r
void error(const char *error_string, const char *file, const int line, const char *func);\r
\r
\r
#ifdef __CUDACC__\r
\r
-namespace cv { namespace gpu \r
-{ \r
- __host__ __device__ __forceinline__ int divUp(int total, int grain) \r
- { \r
- return (total + grain - 1) / grain; \r
+namespace cv { namespace gpu\r
+{\r
+ __host__ __device__ __forceinline__ int divUp(int total, int grain)\r
+ {\r
+ return (total + grain - 1) / grain;\r
}\r
\r
- namespace device \r
+ namespace device\r
{\r
typedef unsigned char uchar;\r
typedef unsigned short ushort;\r
\r
#include "common.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200\r
\r
{\r
__device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; }\r
};\r
- \r
- #else // __CUDA_ARCH__ >= 200 \r
\r
- #if defined(_WIN64) || defined(__LP64__) \r
+ #else // __CUDA_ARCH__ >= 200\r
+\r
+ #if defined(_WIN64) || defined(__LP64__)\r
// 64-bit register modifier for inlined asm\r
#define OPENCV_GPU_ASM_PTR "l"\r
- #else \r
+ #else\r
// 32-bit register modifier for inlined asm\r
#define OPENCV_GPU_ASM_PTR "r"\r
#endif\r
asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \\r
} \\r
};\r
- \r
+\r
OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)\r
OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)\r
OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)\r
OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)\r
OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)\r
OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)\r
- OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r) \r
- OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f) \r
- OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d) \r
+ OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)\r
+ OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)\r
+ OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)\r
\r
#undef OPENCV_GPU_DEFINE_FORCE_GLOB\r
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B\r
#undef OPENCV_GPU_ASM_PTR\r
- \r
+\r
#endif // __CUDA_ARCH__ >= 200\r
}}} // namespace cv { namespace gpu { namespace device\r
\r
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__\r
\r
namespace cv { namespace gpu { namespace device\r
-{ \r
+{\r
template<class T> struct DynamicSharedMem\r
{\r
__device__ __forceinline__ operator T*()\r
\r
#include "warp_reduce.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
struct Emulation\r
{\r
- static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)\r
- {\r
+ static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)\r
+ {\r
#if __CUDA_ARCH__ >= 200\r
- (void)cta_buffer;\r
- return __ballot(predicate);\r
+ (void)cta_buffer;\r
+ return __ballot(predicate);\r
#else\r
- int tid = threadIdx.x; \r
- cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;\r
- return warp_reduce(cta_buffer);\r
+ int tid = threadIdx.x;\r
+ cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;\r
+ return warp_reduce(cta_buffer);\r
#endif\r
- }\r
+ }\r
};\r
}}} // namespace cv { namespace gpu { namespace device\r
\r
\r
#include <cstdio>\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- template<class Func> \r
+ template<class Func>\r
void printFuncAttrib(Func& func)\r
{\r
\r
cudaFuncAttributes attrs;\r
- cudaFuncGetAttributes(&attrs, func); \r
+ cudaFuncGetAttributes(&attrs, func);\r
\r
printf("=== Function stats ===\n");\r
printf("Name: \n");\r
printf("ptxVersion = %d\n", attrs.ptxVersion);\r
printf("binaryVersion = %d\n", attrs.binaryVersion);\r
printf("\n");\r
- fflush(stdout); \r
+ fflush(stdout);\r
}\r
}}} // namespace cv { namespace gpu { namespace device\r
\r
#include "vec_traits.hpp"\r
#include "type_traits.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
// Function Objects\r
\r
\r
template <typename T> struct bit_not : unary_function<T, T>\r
{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const \r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const\r
{\r
return ~v;\r
}\r
// Generalized Identity Operations\r
template <typename T> struct identity : unary_function<T, T>\r
{\r
- __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const \r
+ __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const\r
{\r
return x;\r
}\r
\r
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>\r
{\r
- __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
+ __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const\r
{\r
return lhs;\r
}\r
\r
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>\r
{\r
- __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
+ __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const\r
{\r
return rhs;\r
}\r
\r
template <typename T> struct maximum : binary_function<T, T, T>\r
{\r
- __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
+ __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const\r
{\r
return lhs < rhs ? rhs : lhs;\r
}\r
\r
template <typename T> struct minimum : binary_function<T, T, T>\r
{\r
- __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
+ __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const\r
{\r
return lhs < rhs ? lhs : rhs;\r
}\r
#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR\r
#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR\r
\r
- template<typename T> struct hypot_sqr_func : binary_function<T, T, float> \r
+ template<typename T> struct hypot_sqr_func : binary_function<T, T, float>\r
{\r
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const\r
{\r
return src1 * src1 + src2 * src2;\r
}\r
+ __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}\r
+ __device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}\r
};\r
\r
// Saturate Cast Functor\r
{\r
return (src > thresh) * maxVal;\r
}\r
+\r
__device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)\r
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}\r
\r
{\r
return (src <= thresh) * maxVal;\r
}\r
+\r
__device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)\r
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}\r
\r
explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}\r
\r
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const\r
- { \r
- return !pred(x); \r
+ {\r
+ return !pred(x);\r
}\r
\r
+ __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}\r
+ __device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}\r
+\r
const Predicate pred;\r
};\r
+\r
template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)\r
{\r
return unary_negate<Predicate>(pred);\r
{\r
explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}\r
\r
- __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const\r
- { \r
- return !pred(x,y); \r
+ __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,\r
+ typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const\r
+ {\r
+ return !pred(x,y);\r
}\r
+ __device__ __forceinline__ binary_negate(const binary_negate& other)\r
+ : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}\r
+\r
+ __device__ __forceinline__ binary_negate() :\r
+ binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}\r
\r
const Predicate pred;\r
};\r
+\r
template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)\r
{\r
return binary_negate<BinaryPredicate>(pred);\r
}\r
\r
- template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> \r
+ template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>\r
{\r
__host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}\r
\r
return op(arg1, a);\r
}\r
\r
+ __device__ __forceinline__ binder1st(const binder1st& other) :\r
+ unary_function<typename Op::second_argument_type, typename Op::result_type>(){}\r
+\r
const Op op;\r
const typename Op::first_argument_type arg1;\r
};\r
+\r
template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)\r
{\r
return binder1st<Op>(op, typename Op::first_argument_type(x));\r
}\r
\r
- template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type> \r
+ template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>\r
{\r
__host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}\r
\r
return op(a, arg2);\r
}\r
\r
+ __device__ __forceinline__ binder2nd(const binder2nd& other) :\r
+ unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}\r
+\r
const Op op;\r
const typename Op::second_argument_type arg2;\r
};\r
+\r
template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)\r
{\r
return binder2nd<Op>(op, typename Op::second_argument_type(x));\r
}\r
\r
// Functor Traits\r
-\r
template <typename F> struct IsUnaryFunction\r
{\r
typedef char Yes;\r
{\r
enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };\r
};\r
- \r
+\r
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };\r
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };\r
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };\r
#include <limits>\r
#include "common.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
template<class T> struct numeric_limits\r
{\r
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }\r
\r
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)\r
- { \r
- return (uchar) ::max((int)v, 0); \r
+ {\r
+ return (uchar) ::max((int)v, 0);\r
}\r
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)\r
- { \r
- return (uchar) ::min((uint)v, (uint)UCHAR_MAX); \r
+ {\r
+ return (uchar) ::min((uint)v, (uint)UCHAR_MAX);\r
}\r
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)\r
- { \r
- return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); \r
+ {\r
+ return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);\r
}\r
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)\r
- { \r
- return (uchar) ::min(v, (uint)UCHAR_MAX); \r
+ {\r
+ return (uchar) ::min(v, (uint)UCHAR_MAX);\r
}\r
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)\r
- { \r
- return saturate_cast<uchar>((uint)v); \r
+ {\r
+ return saturate_cast<uchar>((uint)v);\r
}\r
\r
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)\r
- { \r
- int iv = __float2int_rn(v); \r
- return saturate_cast<uchar>(iv); \r
+ {\r
+ int iv = __float2int_rn(v);\r
+ return saturate_cast<uchar>(iv);\r
}\r
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)\r
{\r
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130\r
- int iv = __double2int_rn(v); \r
+ int iv = __double2int_rn(v);\r
return saturate_cast<uchar>(iv);\r
#else\r
return saturate_cast<uchar>((float)v);\r
}\r
\r
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)\r
- { \r
- return (schar) ::min((int)v, SCHAR_MAX); \r
+ {\r
+ return (schar) ::min((int)v, SCHAR_MAX);\r
}\r
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)\r
- { \r
- return (schar) ::min((uint)v, (uint)SCHAR_MAX); \r
+ {\r
+ return (schar) ::min((uint)v, (uint)SCHAR_MAX);\r
}\r
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)\r
{\r
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);\r
}\r
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)\r
- { \r
- return saturate_cast<schar>((int)v); \r
+ {\r
+ return saturate_cast<schar>((int)v);\r
}\r
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)\r
- { \r
- return (schar) ::min(v, (uint)SCHAR_MAX); \r
+ {\r
+ return (schar) ::min(v, (uint)SCHAR_MAX);\r
}\r
\r
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)\r
- { \r
- int iv = __float2int_rn(v); \r
- return saturate_cast<schar>(iv); \r
+ {\r
+ int iv = __float2int_rn(v);\r
+ return saturate_cast<schar>(iv);\r
}\r
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)\r
- { \r
+ {\r
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130\r
- int iv = __double2int_rn(v); \r
+ int iv = __double2int_rn(v);\r
return saturate_cast<schar>(iv);\r
#else\r
return saturate_cast<schar>((float)v);\r
}\r
\r
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)\r
- { \r
- return (ushort) ::max((int)v, 0); \r
+ {\r
+ return (ushort) ::max((int)v, 0);\r
}\r
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)\r
- { \r
- return (ushort) ::max((int)v, 0); \r
+ {\r
+ return (ushort) ::max((int)v, 0);\r
}\r
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)\r
- { \r
- return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); \r
+ {\r
+ return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);\r
}\r
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)\r
- { \r
- return (ushort) ::min(v, (uint)USHRT_MAX); \r
+ {\r
+ return (ushort) ::min(v, (uint)USHRT_MAX);\r
}\r
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)\r
{\r
- int iv = __float2int_rn(v); \r
- return saturate_cast<ushort>(iv); \r
+ int iv = __float2int_rn(v);\r
+ return saturate_cast<ushort>(iv);\r
}\r
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)\r
- { \r
+ {\r
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130\r
- int iv = __double2int_rn(v); \r
+ int iv = __double2int_rn(v);\r
return saturate_cast<ushort>(iv);\r
#else\r
return saturate_cast<ushort>((float)v);\r
}\r
\r
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)\r
- { \r
- return (short) ::min((int)v, SHRT_MAX); \r
+ {\r
+ return (short) ::min((int)v, SHRT_MAX);\r
}\r
template<> __device__ __forceinline__ short saturate_cast<short>(int v)\r
{\r
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);\r
}\r
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)\r
- { \r
- return (short) ::min(v, (uint)SHRT_MAX); \r
+ {\r
+ return (short) ::min(v, (uint)SHRT_MAX);\r
}\r
template<> __device__ __forceinline__ short saturate_cast<short>(float v)\r
- { \r
- int iv = __float2int_rn(v); \r
- return saturate_cast<short>(iv); \r
+ {\r
+ int iv = __float2int_rn(v);\r
+ return saturate_cast<short>(iv);\r
}\r
template<> __device__ __forceinline__ short saturate_cast<short>(double v)\r
- { \r
+ {\r
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130\r
- int iv = __double2int_rn(v); \r
+ int iv = __double2int_rn(v);\r
return saturate_cast<short>(iv);\r
#else\r
return saturate_cast<short>((float)v);\r
#endif\r
}\r
\r
- template<> __device__ __forceinline__ int saturate_cast<int>(float v) \r
- { \r
- return __float2int_rn(v); \r
+ template<> __device__ __forceinline__ int saturate_cast<int>(float v)\r
+ {\r
+ return __float2int_rn(v);\r
}\r
- template<> __device__ __forceinline__ int saturate_cast<int>(double v) \r
+ template<> __device__ __forceinline__ int saturate_cast<int>(double v)\r
{\r
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130\r
return __double2int_rn(v);\r
}\r
\r
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)\r
- { \r
- return __float2uint_rn(v); \r
+ {\r
+ return __float2uint_rn(v);\r
}\r
- template<> __device__ __forceinline__ uint saturate_cast<uint>(double v) \r
- { \r
+ template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)\r
+ {\r
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130\r
return __double2uint_rn(v);\r
#else\r
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_SCAN_HPP__
+#define __OPENCV_GPU_SCAN_HPP__
+
+ enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
+
+ template <ScanKind Kind, typename T, typename F> struct WarpScan
+ {
+ __device__ __forceinline__ WarpScan() {}
+ __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }
+
+ __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+ {
+ const unsigned int lane = idx & 31;
+ F op;
+
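+            // Hillis-Steele style inclusive warp scan: each step folds in the element 1, 2, 4, 8, then 16 lanes to the left, guarded by the lane index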
+ if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
+ if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
+ if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
+ if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
+ if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+
+ if( Kind == INCLUSIVE )
+ return ptr [idx];
+ else
+ return (lane > 0) ? ptr [idx - 1] : 0;
+ }
+
+ __device__ __forceinline__ unsigned int index(const unsigned int tid)
+ {
+ return tid;
+ }
+
+ __device__ __forceinline__ void init(volatile T *ptr){}
+
+ static const int warp_offset = 0;
+
+ typedef WarpScan<INCLUSIVE, T, F> merge;
+ };
+
+ template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
+ {
+ __device__ __forceinline__ WarpScanNoComp() {}
+ __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }
+
+ __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+ {
+ const unsigned int lane = threadIdx.x & 31;
+ F op;
+
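+            // no lane guards: this variant relies on a zero-padded 16-element apron before each warp's segment (reserved by index()), so the shifted reads below stay in bounds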
+ ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
+ ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
+ ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
+ ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
+ ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+
+ if( Kind == INCLUSIVE )
+ return ptr [idx];
+ else
+ return (lane > 0) ? ptr [idx - 1] : 0;
+ }
+
+ __device__ __forceinline__ unsigned int index(const unsigned int tid)
+ {
+ return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
+ }
+
+ __device__ __forceinline__ void init(volatile T *ptr)
+ {
+ ptr[threadIdx.x] = 0;
+ }
+
+ static const int warp_smem_stride = 32 + 16 + 1;
+ static const int warp_offset = 16;
+ static const int warp_log = 5;
+ static const int warp_mask = 31;
+
+ typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
+ };
+
+ template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
+ {
+ __device__ __forceinline__ BlockScan() {}
+ __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }
+
+ __device__ __forceinline__ T operator()(volatile T *ptr)
+ {
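+            // two-level block scan: scan each warp's segment, publish per-warp totals, scan those totals with warp 0, then add the warp offset back to every element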
+ const unsigned int tid = threadIdx.x;
+ const unsigned int lane = tid & warp_mask;
+ const unsigned int warp = tid >> warp_log;
+
+ Sc scan;
+ typename Sc::merge merge_scan;
+ const unsigned int idx = scan.index(tid);
+
+ T val = scan(ptr, idx);
+ __syncthreads ();
+
+ if( warp == 0)
+ scan.init(ptr);
+ __syncthreads ();
+
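+            // the last lane of each warp publishes its warp total so warp 0 can scan the per-warp sums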
+ if( lane == 31 )
+ ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
+ __syncthreads ();
+
+ if( warp == 0 )
+ merge_scan(ptr, idx);
+ __syncthreads();
+
+ if ( warp > 0)
+ val = ptr [scan.warp_offset + warp - 1] + val;
+ __syncthreads ();
+
+ ptr[idx] = val;
+ __syncthreads ();
+
+ return val ;
+ }
+
+ static const int warp_log = 5;
+ static const int warp_mask = 31;
+ };
+
+#endif
\ No newline at end of file
#ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__\r
#define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__\r
\r
-#if defined(__CUDACC__) \r
- #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ \r
+#if defined(__CUDACC__)\r
+ #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__\r
#else\r
#define __OPENCV_GPU_HOST_DEVICE__\r
-#endif \r
+#endif\r
\r
-namespace cv { namespace gpu \r
-{ \r
+namespace cv { namespace gpu\r
+{\r
namespace device\r
{\r
template<bool expr> struct Static {};\r
- \r
- template<> struct Static<true> \r
- { \r
- __OPENCV_GPU_HOST_DEVICE__ static void check() {}; \r
+\r
+ template<> struct Static<true>\r
+ {\r
+ __OPENCV_GPU_HOST_DEVICE__ static void check() {};\r
};\r
- } \r
+ }\r
\r
using ::cv::gpu::device::Static;\r
}}\r
\r
#undef __OPENCV_GPU_HOST_DEVICE__\r
\r
-#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
+#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
#include "utility.hpp"\r
#include "detail/transform_detail.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
template <typename T, typename D, typename UnOp, typename Mask>\r
static inline void transform(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)\r
\r
#include "detail/type_traits_detail.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
template <typename T> struct IsSimpleParameter\r
{\r
- enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value || \r
+ enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||\r
type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};\r
};\r
\r
enum { isVolatile = type_traits_detail::UnVolatile<T>::value };\r
\r
enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };\r
- enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value }; \r
+ enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };\r
\r
- enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };\r
- enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };\r
- enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };\r
- enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };\r
- enum { isArith = isIntegral || isFloat };\r
- enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };\r
- \r
- typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value, \r
+ enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };\r
+ enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };\r
+ enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };\r
+ enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };\r
+ enum { isArith = isIntegral || isFloat };\r
+ enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };\r
+\r
+ typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,\r
T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;\r
};\r
}}}\r
#include "datamov_utils.hpp"\r
#include "detail/utility_detail.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- #define OPENCV_GPU_LOG_WARP_SIZE (5)\r
- #define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)\r
+ #define OPENCV_GPU_LOG_WARP_SIZE (5)\r
+ #define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)\r
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla\r
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)\r
\r
///////////////////////////////////////////////////////////////////////////////\r
// swap\r
\r
- template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b) \r
+ template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)\r
{\r
const T temp = a;\r
a = b;\r
{\r
explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}\r
__host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){}\r
- \r
+\r
__device__ __forceinline__ bool operator()(int y, int x) const\r
- { \r
+ {\r
return mask.ptr(y)[x] != 0;\r
}\r
\r
\r
struct SingleMaskChannels\r
{\r
- __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
+ __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)\r
: mask(mask_), channels(channels_) {}\r
__host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)\r
:mask(mask_.mask), channels(mask_.channels){}\r
- \r
+\r
__device__ __forceinline__ bool operator()(int y, int x) const\r
- { \r
+ {\r
return mask.ptr(y)[x / channels] != 0;\r
}\r
\r
{\r
curMask = maskCollection[z];\r
}\r
- \r
+\r
__device__ __forceinline__ bool operator()(int y, int x) const\r
{\r
uchar val;\r
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);\r
}\r
\r
- template <int n, typename T, typename V, typename Pred> \r
+ template <int n, typename T, typename V, typename Pred>\r
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)\r
{\r
StaticAssert<n >= 8 && n <= 512>::check();\r
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);\r
}\r
\r
- template <int n, typename T, typename V1, typename V2, typename Pred> \r
+ template <int n, typename T, typename V1, typename V2, typename Pred>\r
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)\r
{\r
StaticAssert<n >= 8 && n <= 512>::check();\r
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);\r
}\r
- \r
+\r
///////////////////////////////////////////////////////////////////////////////\r
// Solve linear system\r
\r
{\r
double invdet = 1.0 / det;\r
\r
- x[0] = saturate_cast<T>(invdet * \r
+ x[0] = saturate_cast<T>(invdet *\r
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -\r
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +\r
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )));\r
\r
- x[1] = saturate_cast<T>(invdet * \r
+ x[1] = saturate_cast<T>(invdet *\r
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -\r
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +\r
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])));\r
\r
- x[2] = saturate_cast<T>(invdet * \r
+ x[2] = saturate_cast<T>(invdet *\r
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -\r
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +\r
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));\r
#include "functional.hpp"\r
#include "detail/vec_distance_detail.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
template <typename T> struct L1Dist\r
{\r
};\r
\r
// calc distance between two vectors in global memory\r
- template <int THREAD_DIM, typename Dist, typename T1, typename T2> \r
+ template <int THREAD_DIM, typename Dist, typename T1, typename T2>\r
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
{\r
for (int i = tid; i < len; i += THREAD_DIM)\r
// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory\r
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>\r
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
- { \r
+ {\r
vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);\r
- \r
+\r
dist.reduceAll<THREAD_DIM>(smem, tid);\r
}\r
\r
#include "vec_traits.hpp"\r
#include "functional.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
namespace vec_math_detail\r
{\r
}\r
\r
namespace vec_math_detail\r
- { \r
+ {\r
template <typename T1, typename T2> struct BinOpTraits\r
{\r
typedef int argument_type;\r
#undef OPENCV_GPU_IMPLEMENT_VEC_OP\r
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP\r
}}} // namespace cv { namespace gpu { namespace device\r
- \r
+\r
#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
\r
#include "common.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
template<typename T, int N> struct TypeVec;\r
\r
\r
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS\r
\r
- template<> struct VecTraits<char> \r
- { \r
+ template<> struct VecTraits<char>\r
+ {\r
typedef char elem_type;\r
- enum {cn=1}; \r
+ enum {cn=1};\r
static __device__ __host__ __forceinline__ char all(char v) {return v;}\r
static __device__ __host__ __forceinline__ char make(char x) {return x;}\r
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}\r
};\r
- template<> struct VecTraits<schar> \r
- { \r
+ template<> struct VecTraits<schar>\r
+ {\r
typedef schar elem_type;\r
- enum {cn=1}; \r
+ enum {cn=1};\r
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}\r
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}\r
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}\r
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__\r
#define __OPENCV_GPU_DEVICE_WARP_HPP__\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
struct Warp\r
{\r
\r
template<typename It, typename T>\r
static __device__ __forceinline__ void fill(It beg, It end, const T& value)\r
- { \r
+ {\r
for(It t = beg + laneId(); t < end; t += STRIDE)\r
*t = value;\r
- } \r
+ }\r
\r
template<typename InIt, typename OutIt>\r
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)\r
- { \r
+ {\r
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)\r
*out = *t;\r
return out;\r
- } \r
+ }\r
\r
template<typename InIt, typename OutIt, class UnOp>\r
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)\r
{\r
unsigned int lane = laneId();\r
\r
- InIt1 t1 = beg1 + lane; \r
+ InIt1 t1 = beg1 + lane;\r
InIt2 t2 = beg2 + lane;\r
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)\r
*out = op(*t1, *t2);\r
template<typename OutIt, typename T>\r
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)\r
{\r
- unsigned int lane = laneId(); \r
+ unsigned int lane = laneId();\r
value += lane;\r
\r
for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)\r
\r
#ifndef HAVE_CUDA\r
\r
-void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }\r
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)\r
+{\r
+ (void)src;\r
+ (void)dst;\r
+ (void)dsize;\r
+ (void)fx;\r
+ (void)fy;\r
+ (void)interpolation;\r
+ (void)s;\r
+\r
+ throw_nogpu();\r
+}\r
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst,GpuMat& buffer, Size dsize,\r
+ double fx, double fy, int interpolation, Stream& s)\r
+{\r
+ (void)src;\r
+ (void)dst;\r
+ (void)dsize;\r
+ (void)fx;\r
+ (void)fy;\r
+ (void)interpolation;\r
+ (void)buffer;\r
+ (void)s;\r
+\r
+ throw_nogpu();\r
+}\r
\r
#else // HAVE_CUDA\r
\r