\r
void enqueueCopy(const GpuMat& src, GpuMat& dst);\r
\r
- void enqueueMemSet(const GpuMat& src, Scalar val);\r
- void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);\r
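+ // asynchronously fills the matrix with val, optionally only where mask is non-zero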
+ void enqueueMemSet(GpuMat& src, Scalar val);\r
+ void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);\r
\r
// converts matrix type, e.g. from float to uchar, according to the destination type
void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);\r
{\r
template <typename T>\r
void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,\r
- const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ bool cc_12);\r
template <typename T>\r
void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,\r
- const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ bool cc_12);\r
template <typename T>\r
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,\r
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,\r
- const DevMem2Df& distance);\r
+ const DevMem2Df& distance, \r
+ bool cc_12);\r
template <typename T>\r
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,\r
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,\r
- const DevMem2Df& distance);\r
+ const DevMem2Df& distance, \r
+ bool cc_12);\r
\r
template <typename T>\r
void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
using namespace cv::gpu::bfmatcher;\r
\r
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs,\r
- const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ bool cc_12);\r
\r
static const match_caller_t match_callers[2][8] =\r
{\r
{\r
- matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<char>, matchSingleL1_gpu<unsigned short>,\r
- matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0\r
+ matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>, \r
+ matchSingleL1_gpu<unsigned short>, matchSingleL1_gpu<short>, \r
+ matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0\r
},\r
{\r
- matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<char>, matchSingleL2_gpu<unsigned short>,\r
- matchSingleL2_gpu<short>, matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0\r
+ matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>, \r
+ matchSingleL2_gpu<unsigned short>, matchSingleL2_gpu<short>, \r
+ matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0\r
}\r
};\r
\r
match_caller_t func = match_callers[distType][queryDescs.depth()];\r
CV_Assert(func != 0);\r
\r
+ bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);\r
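+ // (compute capability >= 1.2 enables the fully cached kernel for 256-element descriptors; see match_chooser)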
+\r
// For a single train image there is no need to save imgIdx, so we just write imgIdx into the trainIdx buffer.
// trainIdx is written after imgIdx, so its value is not lost.
- func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance);\r
+ func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12);\r
}\r
\r
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance,\r
\r
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection,\r
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,\r
- const DevMem2Df& distance);\r
+ const DevMem2Df& distance, bool cc_12);\r
\r
static const match_caller_t match_callers[2][8] =\r
{\r
{\r
- matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<char>,\r
+ matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>,\r
matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>,\r
matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0\r
},\r
{\r
- matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<char>,\r
+ matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>,\r
matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>,\r
matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0\r
}\r
match_caller_t func = match_callers[distType][queryDescs.depth()];\r
CV_Assert(func != 0);\r
\r
- func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance);\r
+ bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);\r
+\r
+ func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12);\r
}\r
\r
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx,\r
static const match_caller_t match_callers[2][8] =\r
{\r
{\r
- knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<char>, knnMatchL1_gpu<unsigned short>,\r
+ knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>,\r
knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0\r
},\r
{\r
- knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<char>, knnMatchL2_gpu<unsigned short>,\r
+ knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>,\r
knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0\r
}\r
};\r
static const radiusMatch_caller_t radiusMatch_callers[2][8] =\r
{\r
{\r
- radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<char>, radiusMatchL1_gpu<unsigned short>,\r
+ radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>,\r
radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0\r
},\r
{\r
- radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<char>, radiusMatchL2_gpu<unsigned short>,\r
+ radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>,\r
radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0\r
}\r
};\r
match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>\r
<<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
imgIdx.data, distance.data);\r
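+ // the launch is asynchronous, so query launch errors explicitly before synchronizing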
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
Dist, T>\r
<<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
imgIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
\r
template <typename Dist, typename T, typename Train, typename Mask>\r
void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train, \r
- const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,\r
+ bool cc_12)\r
{\r
if (queryDescs.cols < 64)\r
matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
else if (queryDescs.cols < 256)\r
matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
- else if (queryDescs.cols == 256)\r
+ else if (queryDescs.cols == 256 && cc_12)\r
matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
else\r
matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
\r
template <typename T>\r
void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
- const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,\r
+ bool cc_12)\r
{\r
SingleTrain<T> train((DevMem2D_<T>)trainDescs);\r
if (mask.data)\r
{\r
SingleMask m(mask);\r
- match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);\r
+ match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);\r
}\r
else\r
{\r
- match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+ match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
}\r
}\r
\r
- template void matchSingleL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
\r
template <typename T>\r
void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
- const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ bool cc_12)\r
{\r
SingleTrain<T> train((DevMem2D_<T>)trainDescs);\r
if (mask.data)\r
{\r
SingleMask m(mask);\r
- match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);\r
+ match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);\r
}\r
else\r
{\r
- match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+ match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
}\r
}\r
\r
- template void matchSingleL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
\r
template <typename T>\r
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
- const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
+ const DevMem2Df& distance, bool cc_12)\r
{\r
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);\r
if (maskCollection.data)\r
{\r
MaskCollection mask(maskCollection.data);\r
- match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);\r
}\r
else\r
{\r
- match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+ match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
}\r
}\r
\r
- template void matchCollectionL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
\r
template <typename T>\r
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
- const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
+ const DevMem2Df& distance, bool cc_12)\r
{\r
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);\r
if (maskCollection.data)\r
{\r
MaskCollection mask(maskCollection.data);\r
- match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);\r
}\r
else\r
{\r
- match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+ match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
}\r
}\r
\r
- template void matchCollectionL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
- template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
\r
///////////////////////////////////////////////////////////////////////////////////\r
//////////////////////////////////// Knn Match ////////////////////////////////////\r
\r
calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(\r
queryDescs, trainDescs, mask, distance);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
dim3 grid(trainIdx.rows, 1, 1);\r
\r
for (int i = 0; i < knn; ++i)\r
+ {\r
findBestMatch<BLOCK_SIZE><<<grid, threads>>>(allDist, i, trainIdx, distance);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ }\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);\r
}\r
\r
- template void knnMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
\r
template <typename T>\r
void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);\r
}\r
\r
- template void knnMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
- template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
\r
///////////////////////////////////////////////////////////////////////////////////\r
/////////////////////////////////// Radius Match //////////////////////////////////\r
\r
radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(\r
queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
}\r
}\r
\r
- template void radiusMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
\r
template <typename T>\r
void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
}\r
}\r
\r
- template void radiusMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
- template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
}}}\r
#include "internal_shared.hpp"\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/vecmath.hpp"\r
+#include "opencv2/gpu/device/limits_gpu.hpp"\r
\r
using namespace cv::gpu;\r
using namespace cv::gpu::device;\r
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))\r
#endif\r
\r
-#ifndef FLT_EPSILON\r
- #define FLT_EPSILON 1.192092896e-07F\r
-#endif\r
-\r
namespace cv { namespace gpu { namespace color\r
{\r
- template<typename T> struct ColorChannel {};\r
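+ // declared but intentionally left undefined: only the specializations below are valid, so unsupported channel types fail to compile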
+ template<typename T> struct ColorChannel;\r
template<> struct ColorChannel<uchar>\r
{\r
typedef float worktype_f;\r
\r
RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols, bidx);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols, bidx);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols, bidx);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols, bidx);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols, bidx);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols, bidx);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
vmin = fmin(vmin, b);\r
\r
diff = v - vmin;\r
- s = diff / (float)(fabs(v) + FLT_EPSILON);\r
- diff = (float)(60. / (diff + FLT_EPSILON));\r
+ s = diff / (float)(fabs(v) + numeric_limits_gpu<float>::epsilon());\r
+ diff = (float)(60. / (diff + numeric_limits_gpu<float>::epsilon()));\r
\r
if (v == r)\r
h = (g - b) * diff;\r
RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols, bidx);\r
\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols, bidx);\r
\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
diff = vmax - vmin;\r
l = (vmax + vmin) * 0.5f;\r
\r
- if (diff > FLT_EPSILON)\r
+ if (diff > numeric_limits_gpu<float>::epsilon())\r
{\r
s = l < 0.5f ? diff / (vmax + vmin) : diff / (2.0f - vmax - vmin);\r
diff = 60.f / diff;\r
HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step, \r
dst.data, dst.step, src.rows, src.cols, bidx);\r
\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
divUp(rows, threads.y));\r
\r
bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0) \r
cudaSafeCall(cudaThreadSynchronize());\r
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst); \r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0) \r
cudaSafeCall(cudaThreadSynchronize());\r
dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));\r
\r
bitwiseBinOpKernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0) \r
cudaSafeCall(cudaThreadSynchronize());\r
dim3 threads(16, 16);\r
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
- bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst); \r
+ bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0) \r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
};\r
\r
- struct ScalarMinOp\r
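+ // per-type scalar min functor: integer types use min(), the float and double specializations use fmin()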
+ template <typename T> struct ScalarMinOp\r
+ {\r
+ T s;\r
+\r
+ explicit ScalarMinOp(T s_) : s(s_) {}\r
+\r
+ __device__ T operator()(T a)\r
+ {\r
+ return min(a, s);\r
+ }\r
+ };\r
+ template <> struct ScalarMinOp<float>\r
+ {\r
+ float s;\r
+\r
+ explicit ScalarMinOp(float s_) : s(s_) {}\r
+\r
+ __device__ float operator()(float a)\r
+ {\r
+ return fmin(a, s);\r
+ }\r
+ };\r
+ template <> struct ScalarMinOp<double>\r
{\r
double s;\r
\r
explicit ScalarMinOp(double s_) : s(s_) {}\r
\r
- template <typename T>\r
- __device__ T operator()(T a)\r
+ __device__ double operator()(double a)\r
{\r
- return saturate_cast<T>(fmin((double)a, s));\r
+ return fmin(a, s);\r
}\r
};\r
\r
- struct ScalarMaxOp\r
+ template <typename T> struct ScalarMaxOp\r
+ {\r
+ T s;\r
+\r
+ explicit ScalarMaxOp(T s_) : s(s_) {}\r
+\r
+ __device__ T operator()(T a)\r
+ {\r
+ return max(a, s);\r
+ }\r
+ };\r
+ template <> struct ScalarMaxOp<float>\r
+ {\r
+ float s;\r
+\r
+ explicit ScalarMaxOp(float s_) : s(s_) {}\r
+\r
+ __device__ float operator()(float a)\r
+ {\r
+ return fmax(a, s);\r
+ }\r
+ };\r
+ template <> struct ScalarMaxOp<double>\r
{\r
double s;\r
\r
explicit ScalarMaxOp(double s_) : s(s_) {}\r
\r
- template <typename T>\r
- __device__ T operator()(T a)\r
+ __device__ double operator()(double a)\r
{\r
- return saturate_cast<T>(fmax((double)a, s));\r
+ return fmax(a, s);\r
}\r
};\r
\r
}\r
\r
template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);\r
- template void min_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);\r
+ template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
}\r
\r
template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);\r
- template void max_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);\r
+ template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
\r
template <typename T>\r
- void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
{\r
- ScalarMinOp op(src2);\r
+ ScalarMinOp<T> op(src2);\r
transform(src1, dst, op, stream); \r
}\r
\r
- template void min_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);\r
- template void min_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);\r
- template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
- template void min_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
- template void min_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
- template void min_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+ template void min_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);\r
+ template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+ template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+ template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+ template void min_gpu<int >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+ template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
\r
template <typename T>\r
- void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
{\r
- ScalarMaxOp op(src2);\r
+ ScalarMaxOp<T> op(src2);\r
transform(src1, dst, op, stream); \r
}\r
\r
- template void max_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);\r
- template void max_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);\r
- template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
- template void max_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
- template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
- template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+ template void max_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);\r
+ template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+ template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+ template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+ template void max_gpu<int >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+ template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
\r
\r
//////////////////////////////////////////////////////////////////////////\r
// threshold\r
\r
- class ThreshOp\r
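+ // threshold functors are templated on the element type, so comparison and clamping happen directly in T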
+ template <typename T> struct ThreshBinary\r
{\r
- public:\r
- ThreshOp(float thresh_, float maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
+ ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
\r
- protected:\r
- float thresh;\r
- float maxVal;\r
+ __device__ T operator()(const T& src) const\r
+ {\r
+ return src > thresh ? maxVal : 0;\r
+ }\r
+\r
+ private:\r
+ T thresh;\r
+ T maxVal;\r
};\r
\r
- class ThreshBinary : public ThreshOp\r
+ template <typename T> struct ThreshBinaryInv\r
{\r
- public:\r
- ThreshBinary(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}\r
+ ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
\r
- template<typename T>\r
__device__ T operator()(const T& src) const\r
{\r
- return (float)src > thresh ? saturate_cast<T>(maxVal) : 0;\r
+ return src > thresh ? 0 : maxVal;\r
}\r
+\r
+ private:\r
+ T thresh;\r
+ T maxVal;\r
};\r
\r
- class ThreshBinaryInv : public ThreshOp\r
+ template <typename T> struct ThreshTrunc\r
{\r
- public:\r
- ThreshBinaryInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}\r
+ ThreshTrunc(T thresh_, T) : thresh(thresh_) {}\r
\r
- template<typename T>\r
__device__ T operator()(const T& src) const\r
{\r
- return (float)src > thresh ? 0 : saturate_cast<T>(maxVal);\r
+ return min(src, thresh);\r
}\r
+\r
+ private:\r
+ T thresh;\r
};\r
+ template <> struct ThreshTrunc<float>\r
+ {\r
+ ThreshTrunc(float thresh_, float) : thresh(thresh_) {}\r
+\r
+ __device__ float operator()(const float& src) const\r
+ {\r
+ return fmin(src, thresh);\r
+ }\r
\r
- class ThreshTrunc : public ThreshOp\r
+ private:\r
+ float thresh;\r
+ };\r
+ template <> struct ThreshTrunc<double>\r
{\r
- public:\r
- ThreshTrunc(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}\r
+ ThreshTrunc(double thresh_, double) : thresh(thresh_) {}\r
\r
- template<typename T>\r
- __device__ T operator()(const T& src) const\r
+ __device__ double operator()(const double& src) const\r
{\r
- return saturate_cast<T>(fmin((float)src, thresh));\r
+ return fmin(src, thresh);\r
}\r
+\r
+ private:\r
+ double thresh;\r
};\r
\r
- class ThreshToZero : public ThreshOp\r
+ template <typename T> struct ThreshToZero\r
{\r
public:\r
- ThreshToZero(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}\r
+ ThreshToZero(T thresh_, T) : thresh(thresh_) {}\r
\r
- template<typename T>\r
__device__ T operator()(const T& src) const\r
{\r
- return (float)src > thresh ? src : 0;\r
+ return src > thresh ? src : 0;\r
}\r
+\r
+ private:\r
+ T thresh;\r
};\r
\r
- class ThreshToZeroInv : public ThreshOp\r
+ template <typename T> struct ThreshToZeroInv\r
{\r
public:\r
- ThreshToZeroInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}\r
+ ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {}\r
\r
- template<typename T>\r
__device__ T operator()(const T& src) const\r
{\r
- return (float)src > thresh ? 0 : src;\r
+ return src > thresh ? 0 : src;\r
}\r
+\r
+ private:\r
+ T thresh;\r
};\r
\r
- template <class Op, typename T>\r
- void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, float thresh, float maxVal, \r
+ template <template <typename> class Op, typename T>\r
+ void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
cudaStream_t stream)\r
{\r
- Op op(thresh, maxVal);\r
+ Op<T> op(thresh, maxVal);\r
transform(src, dst, op, stream);\r
}\r
\r
template <typename T>\r
- void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,\r
+ void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,\r
cudaStream_t stream)\r
{\r
- typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, float thresh, float maxVal, \r
+ typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
cudaStream_t stream);\r
\r
static const caller_t callers[] = \r
callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);\r
}\r
\r
- template void threshold_gpu<uchar>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
- template void threshold_gpu<schar>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
- template void threshold_gpu<ushort>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
- template void threshold_gpu<short>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
- template void threshold_gpu<int>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<uchar>(const DevMem2D& src, const DevMem2D& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<schar>(const DevMem2D& src, const DevMem2D& dst, schar thresh, schar maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<ushort>(const DevMem2D& src, const DevMem2D& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<short>(const DevMem2D& src, const DevMem2D& dst, short thresh, short maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<int>(const DevMem2D& src, const DevMem2D& dst, int thresh, int maxVal, int type, cudaStream_t stream);\r
template void threshold_gpu<float>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<double>(const DevMem2D& src, const DevMem2D& dst, double thresh, double maxVal, int type, cudaStream_t stream);\r
}}}\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/vecmath.hpp"\r
#include "opencv2/gpu/device/limits_gpu.hpp"\r
+#include "opencv2/gpu/device/border_interpolate.hpp"\r
\r
#include "safe_call.hpp"\r
#include "internal_shared.hpp"\r
using namespace cv::gpu;\r
using namespace cv::gpu::device;\r
\r
-namespace cv \r
-{ \r
- namespace gpu \r
- {\r
- namespace device\r
- {\r
- struct BrdReflect101 \r
- {\r
- explicit BrdReflect101(int len): last(len - 1) {}\r
-\r
- __device__ int idx_low(int i) const\r
- {\r
- return abs(i);\r
- }\r
-\r
- __device__ int idx_high(int i) const \r
- {\r
- return last - abs(last - i);\r
- }\r
-\r
- __device__ int idx(int i) const\r
- {\r
- return abs(idx_high(i));\r
- }\r
-\r
- bool is_range_safe(int mini, int maxi) const \r
- {\r
- return -last <= mini && maxi <= 2 * last;\r
- }\r
-\r
- int last;\r
- };\r
- template <typename D>\r
- struct BrdRowReflect101: BrdReflect101\r
- {\r
- explicit BrdRowReflect101(int len): BrdReflect101(len) {}\r
-\r
- template <typename T>\r
- __device__ D at_low(int i, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_low(i)]);\r
- }\r
-\r
- template <typename T>\r
- __device__ D at_high(int i, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_high(i)]);\r
- }\r
- };\r
- template <typename D>\r
- struct BrdColReflect101: BrdReflect101\r
- {\r
- BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}\r
-\r
- template <typename T>\r
- __device__ D at_low(int i, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_low(i) * step]);\r
- }\r
-\r
- template <typename T>\r
- __device__ D at_high(int i, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_high(i) * step]);\r
- }\r
-\r
- int step;\r
- };\r
-\r
- struct BrdReplicate\r
- {\r
- explicit BrdReplicate(int len): last(len - 1) {}\r
-\r
- __device__ int idx_low(int i) const\r
- {\r
- return max(i, 0);\r
- }\r
-\r
- __device__ int idx_high(int i) const \r
- {\r
- return min(i, last);\r
- }\r
-\r
- __device__ int idx(int i) const\r
- {\r
- return max(min(i, last), 0);\r
- }\r
-\r
- bool is_range_safe(int mini, int maxi) const \r
- {\r
- return true;\r
- }\r
-\r
- int last;\r
- };\r
- template <typename D>\r
- struct BrdRowReplicate: BrdReplicate\r
- {\r
- explicit BrdRowReplicate(int len): BrdReplicate(len) {}\r
-\r
- template <typename T>\r
- __device__ D at_low(int i, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_low(i)]);\r
- }\r
-\r
- template <typename T>\r
- __device__ D at_high(int i, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_high(i)]);\r
- }\r
- };\r
- template <typename D>\r
- struct BrdColReplicate: BrdReplicate\r
- {\r
- BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}\r
-\r
- template <typename T>\r
- __device__ D at_low(int i, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_low(i) * step]);\r
- }\r
-\r
- template <typename T>\r
- __device__ D at_high(int i, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_high(i) * step]);\r
- }\r
- int step;\r
- };\r
-\r
- template <typename D>\r
- struct BrdRowConstant\r
- {\r
- explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}\r
-\r
- template <typename T>\r
- __device__ D at_low(int i, const T* data) const \r
- {\r
- return i >= 0 ? saturate_cast<D>(data[i]) : val;\r
- }\r
-\r
- template <typename T>\r
- __device__ D at_high(int i, const T* data) const \r
- {\r
- return i < len ? saturate_cast<D>(data[i]) : val;\r
- }\r
-\r
- bool is_range_safe(int mini, int maxi) const \r
- {\r
- return true;\r
- }\r
-\r
- int len;\r
- D val;\r
- };\r
- template <typename D>\r
- struct BrdColConstant\r
- {\r
- BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}\r
-\r
- template <typename T>\r
- __device__ D at_low(int i, const T* data) const \r
- {\r
- return i >= 0 ? saturate_cast<D>(data[i * step]) : val;\r
- }\r
-\r
- template <typename T>\r
- __device__ D at_high(int i, const T* data) const \r
- {\r
- return i < len ? saturate_cast<D>(data[i * step]) : val;\r
- }\r
-\r
- bool is_range_safe(int mini, int maxi) const \r
- {\r
- return true;\r
- }\r
-\r
- int len;\r
- int step;\r
- D val;\r
- };\r
- }\r
- }\r
-}\r
-\r
/////////////////////////////////////////////////////////////////////////////////////////////////\r
// Linear filters\r
\r
}\r
\r
filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
}\r
\r
filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
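// The hunks above and below all apply the same hardening: each kernel launch is now
// followed by cudaGetLastError() (reports invalid launch configurations immediately)
// before the blocking cudaThreadSynchronize() (surfaces errors raised while the kernel
// executed). A minimal sketch of the pattern, assuming the module's existing
// cudaSafeCall and divUp helpers; the kernel and launcher names here are illustrative
// only and are not part of the patch.

__global__ void example_kernel(int* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = i;
}

void example_launch(int* d_out, int n)
{
    dim3 threads(256);
    dim3 grid(divUp(n, threads.x));

    example_kernel<<<grid, threads>>>(d_out, n);
    cudaSafeCall( cudaGetLastError() );       // launch-time failures
    cudaSafeCall( cudaThreadSynchronize() );  // execution-time failures
}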
for (int i = 0; i < iters; ++i)\r
{\r
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
}\r
break;\r
case 3:\r
for (int i = 0; i < iters; ++i)\r
{\r
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
}\r
break;\r
default:\r
int smem = hists_size + final_hists_size;\r
compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(\r
img_block_width, grad, qangle, scale, block_hists);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
else\r
cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__);\r
\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(\r
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, \r
block_hists, coefs, free_coef, threshold, labels);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
block_stride_x;\r
extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(\r
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
block_stride_x;\r
extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(\r
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(\r
height, width, img, angle_scale, grad, qangle);\r
\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(\r
height, width, img, angle_scale, grad, qangle);\r
\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
float sx = (float)src.cols / dst.cols;\r
float sy = (float)src.rows / dst.rows;\r
resize_8UC4_kernel<<<grid, threads>>>(sx, sy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
cudaSafeCall(cudaUnbindTexture(resize8UC4_tex));\r
float sx = (float)src.cols / dst.cols;\r
float sy = (float)src.rows / dst.rows;\r
resize_8UC1_kernel<<<grid, threads>>>(sx, sy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
cudaSafeCall(cudaUnbindTexture(resize8UC1_tex));\r
cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );\r
\r
remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() ); \r
cudaSafeCall( cudaUnbindTexture(tex_remap) );\r
grid.y = divUp(dst.rows, threads.y);\r
\r
remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() ); \r
}\r
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
\r
meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall( cudaThreadSynchronize() );\r
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); \r
}\r
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
\r
meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall( cudaThreadSynchronize() );\r
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); \r
}\r
grid.y = divUp(src.rows, threads.y);\r
\r
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() ); \r
grid.y = divUp(src.rows, threads.y);\r
\r
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );\r
\r
reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));\r
\r
extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
break;\r
}\r
\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
cudaSafeCall(cudaUnbindTexture(harrisDxTex));\r
cudaSafeCall(cudaUnbindTexture(harrisDyTex));\r
break;\r
}\r
\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));\r
cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));\r
dim3 grid(divUp(src.cols, threads.x));\r
\r
column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
\r
mulSpectrumsKernel<<<grid, threads>>>(a, b, c);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
\r
mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
\r
mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
\r
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
templ.cols, templ.rows, image, templ, result);\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
templ.cols, templ.rows, image, templ, result);\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
templ.cols, templ.rows, image, templ, result);\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
templ.cols, templ.rows, image, templ, result);\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
w, h, image_sqsum, templ_sqsum, result);\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
w, h, image_sqsum, templ_sqsum, result);\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads>>>(\r
w, h, (float)templ_sum / (w * h), image_sum, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads>>>(\r
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),\r
image_sum_r, image_sum_g, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
(float)templ_sum_g / (w * h), \r
(float)templ_sum_b / (w * h),\r
image_sum_r, image_sum_g, image_sum_b, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
(float)templ_sum_a / (w * h),\r
image_sum_r, image_sum_g, image_sum_b, image_sum_a,\r
result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads>>>(\r
w, h, weight, templ_sum_scale, templ_sqsum_scale, \r
image_sum, image_sqsum, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
image_sum_r, image_sqsum_r, \r
image_sum_g, image_sqsum_g, \r
result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
image_sum_g, image_sqsum_g, \r
image_sum_b, image_sqsum_b, \r
result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
image_sum_b, image_sqsum_b, \r
image_sum_a, image_sqsum_a, \r
result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
normalizeKernel_8U<4><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
extractFirstChannel_32F<4><<<grid, threads>>>(image, result);\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(\r
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), \r
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(), \r
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "internal_shared.hpp"
-#include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/transform.hpp"
-
-using namespace cv::gpu::device;
-
-namespace cv { namespace gpu { namespace matrix_operations {
-
- template <typename T> struct shift_and_sizeof;
- template <> struct shift_and_sizeof<char> { enum { shift = 0 }; };
- template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
- template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
- template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
- template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
- template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
- template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
-
-///////////////////////////////////////////////////////////////////////////
-////////////////////////////////// CopyTo /////////////////////////////////
-///////////////////////////////////////////////////////////////////////////
-
- template<typename T>
- __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
- {
- size_t x = blockIdx.x * blockDim.x + threadIdx.x;
- size_t y = blockIdx.y * blockDim.y + threadIdx.y;
-
- if ((x < cols * channels ) && (y < rows))
- if (mask[y * step_mask + x / channels] != 0)
- {
- size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
- mat_dst[idx] = mat_src[idx];
- }
- }
- typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
-
- template<typename T>
- void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
- {
- dim3 threadsPerBlock(16,16, 1);
- dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
-
- copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
- ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
-
- if (stream == 0)
- cudaSafeCall ( cudaThreadSynchronize() );
- }
-
- void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
- {
- static CopyToFunc tab[8] =
- {
- copy_to_with_mask_run<unsigned char>,
- copy_to_with_mask_run<char>,
- copy_to_with_mask_run<unsigned short>,
- copy_to_with_mask_run<short>,
- copy_to_with_mask_run<int>,
- copy_to_with_mask_run<float>,
- copy_to_with_mask_run<double>,
- 0
- };
-
- CopyToFunc func = tab[depth];
-
- if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
-
- func(mat_src, mat_dst, mask, channels, stream);
- }
-
-///////////////////////////////////////////////////////////////////////////
-////////////////////////////////// SetTo //////////////////////////////////
-///////////////////////////////////////////////////////////////////////////
-
- __constant__ double scalar_d[4];
-
- template<typename T>
- __global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels)
- {
- size_t x = blockIdx.x * blockDim.x + threadIdx.x;
- size_t y = blockIdx.y * blockDim.y + threadIdx.y;
-
- if ((x < cols * channels ) && (y < rows))
- {
- size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
- mat[idx] = scalar_d[ x % channels ];
- }
- }
-
- template<typename T>
- __global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask)
- {
- size_t x = blockIdx.x * blockDim.x + threadIdx.x;
- size_t y = blockIdx.y * blockDim.y + threadIdx.y;
-
- if ((x < cols * channels ) && (y < rows))
- if (mask[y * step_mask + x / channels] != 0)
- {
- size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
- mat[idx] = scalar_d[ x % channels ];
- }
- }
- typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
- typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
-
- template <typename T>
- void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
- {
- dim3 threadsPerBlock(32, 8, 1);
- dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-
- set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, (unsigned char *)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
- if (stream == 0)
- cudaSafeCall ( cudaThreadSynchronize() );
- }
-
- template <typename T>
- void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
- {
- dim3 threadsPerBlock(32, 8, 1);
- dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-
- set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
-
- if (stream == 0)
- cudaSafeCall ( cudaThreadSynchronize() );
- }
-
- void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream)
- {
- cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
-
- static SetToFunc_without_mask tab[8] =
- {
- set_to_without_mask_run<unsigned char>,
- set_to_without_mask_run<char>,
- set_to_without_mask_run<unsigned short>,
- set_to_without_mask_run<short>,
- set_to_without_mask_run<int>,
- set_to_without_mask_run<float>,
- set_to_without_mask_run<double>,
- 0
- };
-
- SetToFunc_without_mask func = tab[depth];
-
- if (func == 0)
- cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__);
-
- func(mat, channels, stream);
- }
-
- void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
- {
- cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
-
- static SetToFunc_with_mask tab[8] =
- {
- set_to_with_mask_run<unsigned char>,
- set_to_with_mask_run<char>,
- set_to_with_mask_run<unsigned short>,
- set_to_with_mask_run<short>,
- set_to_with_mask_run<int>,
- set_to_with_mask_run<float>,
- set_to_with_mask_run<double>,
- 0
- };
-
- SetToFunc_with_mask func = tab[depth];
-
- if (func == 0)
- cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__);
-
- func(mat, mask, channels, stream);
- }
-
-///////////////////////////////////////////////////////////////////////////
-//////////////////////////////// ConvertTo ////////////////////////////////
-///////////////////////////////////////////////////////////////////////////
-
- template <typename T, typename D>
- class Convertor
- {
- public:
- Convertor(double alpha_, double beta_): alpha(alpha_), beta(beta_) {}
-
- __device__ D operator()(const T& src)
- {
- return saturate_cast<D>(alpha * src + beta);
- }
-
- private:
- double alpha, beta;
- };
-
- template<typename T, typename D>
- void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
- {
- Convertor<T, D> op(alpha, beta);
- transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
- }
-
- void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta,
- cudaStream_t stream = 0)
- {
- typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta,
- cudaStream_t stream);
-
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+// By downloading, copying, installing or using the software you agree to this license.\r
+// If you do not agree to this license, do not download, install,\r
+// copy or use the software.\r
+//\r
+//\r
+// License Agreement\r
+// For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+// * Redistribution's of source code must retain the above copyright notice,\r
+// this list of conditions and the following disclaimer.\r
+//\r
+// * Redistribution's in binary form must reproduce the above copyright notice,\r
+// this list of conditions and the following disclaimer in the documentation\r
+// and/or other materials provided with the distribution.\r
+//\r
+// * The name of the copyright holders may not be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include "internal_shared.hpp"\r
+#include "opencv2/gpu/device/saturate_cast.hpp"\r
+#include "opencv2/gpu/device/transform.hpp"\r
+\r
+using namespace cv::gpu::device;\r
+\r
+namespace cv { namespace gpu { namespace matrix_operations {\r
+\r
+ template <typename T> struct shift_and_sizeof;\r
+ template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };\r
+ template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };\r
+ template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };\r
+ template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };\r
+ template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };\r
+ template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };\r
+ template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };\r
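// (shift_and_sizeof<T>::shift is log2(sizeof(T)); the kernels below use it to turn the
//  row pitch, given in bytes, into an element index, e.g.
//  idx = y * (step >> shift_and_sizeof<T>::shift) + x.)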
+\r
+///////////////////////////////////////////////////////////////////////////\r
+////////////////////////////////// CopyTo /////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////\r
+\r
+ template<typename T>\r
+ __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)\r
+ {\r
+ size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if ((x < cols * channels ) && (y < rows))\r
+ if (mask[y * step_mask + x / channels] != 0)\r
+ {\r
+ size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;\r
+ mat_dst[idx] = mat_src[idx];\r
+ }\r
+ }\r
+ typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);\r
+\r
+ template<typename T>\r
+ void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)\r
+ {\r
+ dim3 threadsPerBlock(16,16, 1);\r
+ dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);\r
+\r
+ copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>\r
+ ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall ( cudaThreadSynchronize() ); \r
+ }\r
+\r
+ void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)\r
+ {\r
+ static CopyToFunc tab[8] =\r
+ {\r
+ copy_to_with_mask_run<unsigned char>,\r
+ copy_to_with_mask_run<signed char>,\r
+ copy_to_with_mask_run<unsigned short>,\r
+ copy_to_with_mask_run<short>,\r
+ copy_to_with_mask_run<int>,\r
+ copy_to_with_mask_run<float>,\r
+ copy_to_with_mask_run<double>,\r
+ 0\r
+ };\r
+\r
+ CopyToFunc func = tab[depth];\r
+\r
+ if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);\r
+\r
+ func(mat_src, mat_dst, mask, channels, stream);\r
+ }\r
+\r
+///////////////////////////////////////////////////////////////////////////\r
+////////////////////////////////// SetTo //////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////\r
+\r
+ __constant__ uchar scalar_8u[4];\r
+ __constant__ schar scalar_8s[4];\r
+ __constant__ ushort scalar_16u[4];\r
+ __constant__ short scalar_16s[4];\r
+ __constant__ int scalar_32s[4];\r
+ __constant__ float scalar_32f[4]; \r
+ __constant__ double scalar_64f[4];\r
+\r
+ template <typename T> __device__ T readScalar(int i);\r
+ template <> __device__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}\r
+ template <> __device__ schar readScalar<schar>(int i) {return scalar_8s[i];}\r
+ template <> __device__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}\r
+ template <> __device__ short readScalar<short>(int i) {return scalar_16s[i];}\r
+ template <> __device__ int readScalar<int>(int i) {return scalar_32s[i];}\r
+ template <> __device__ float readScalar<float>(int i) {return scalar_32f[i];}\r
+ template <> __device__ double readScalar<double>(int i) {return scalar_64f[i];}\r
+\r
+ void writeScalar(const uchar* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );\r
+ }\r
+ void writeScalar(const schar* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );\r
+ }\r
+ void writeScalar(const ushort* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );\r
+ }\r
+ void writeScalar(const short* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );\r
+ }\r
+ void writeScalar(const int* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );\r
+ }\r
+ void writeScalar(const float* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );\r
+ }\r
+ void writeScalar(const double* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );\r
+ }\r
+\r
+ template<typename T>\r
+ __global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels)\r
+ {\r
+ size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if ((x < cols * channels ) && (y < rows))\r
+ {\r
+ size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;\r
+ mat[idx] = readScalar<T>(x % channels);\r
+ }\r
+ }\r
+\r
+ template<typename T>\r
+ __global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask)\r
+ {\r
+ size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if ((x < cols * channels ) && (y < rows))\r
+ if (mask[y * step_mask + x / channels] != 0)\r
+ {\r
+ size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;\r
+ mat[idx] = readScalar<T>(x % channels);\r
+ }\r
+ }\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream)\r
+ {\r
+ writeScalar(scalar);\r
+\r
+ dim3 threadsPerBlock(32, 8, 1);\r
+ dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
+\r
+ set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall ( cudaThreadSynchronize() );\r
+ }\r
+\r
+ template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<schar >(const DevMem2D& mat, const schar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<ushort>(const DevMem2D& mat, const ushort* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<short >(const DevMem2D& mat, const short* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<int >(const DevMem2D& mat, const int* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<float >(const DevMem2D& mat, const float* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<double>(const DevMem2D& mat, const double* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);\r
+\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream)\r
+ {\r
+ writeScalar(scalar);\r
+\r
+ dim3 threadsPerBlock(32, 8, 1);\r
+ dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
+\r
+ set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall ( cudaThreadSynchronize() );\r
+ }\r
+\r
+ template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<schar >(const DevMem2D& mat, const schar* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<ushort>(const DevMem2D& mat, const ushort* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<short >(const DevMem2D& mat, const short* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<int >(const DevMem2D& mat, const int* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<float >(const DevMem2D& mat, const float* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<double>(const DevMem2D& mat, const double* scalar, int channels, cudaStream_t stream);\r
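// The per-depth scalar_* constants plus the writeScalar()/readScalar<T>() pair replace
// the old single __constant__ double[4], so every depth reads the fill value in its own
// type. A hedged sketch of how a host-side caller could feed set_to_gpu (this helper is
// illustrative only; it is not the actual GpuMat::setTo code):

template <typename T>
void setTo_sketch(const DevMem2D& mat, const cv::Scalar& s, int channels, cudaStream_t stream)
{
    T vals[4];
    for (int i = 0; i < 4; ++i)
        vals[i] = cv::saturate_cast<T>(s[i]);   // saturate once on the host

    cv::gpu::matrix_operations::set_to_gpu<T>(mat, vals, channels, stream);
}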
+\r
+///////////////////////////////////////////////////////////////////////////\r
+//////////////////////////////// ConvertTo ////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////\r
+\r
+ template <typename T, typename D>\r
+ class Convertor\r
+ {\r
+ public:\r
+ Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}\r
+\r
+ __device__ D operator()(const T& src)\r
+ {\r
+ return saturate_cast<D>(alpha * src + beta);\r
+ }\r
+\r
+ private:\r
+ double alpha, beta;\r
+ };\r
+ \r
+ template<typename T, typename D>\r
+ void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)\r
+ {\r
+ cudaSafeCall( cudaSetDoubleForDevice(&alpha) );\r
+ cudaSafeCall( cudaSetDoubleForDevice(&beta) );\r
+ Convertor<T, D> op(alpha, beta);\r
+ transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);\r
+ }\r
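// (cudaSetDoubleForDevice converts alpha/beta in place to a float-representable value
//  when the target device has no native double support, so the Convertor's arithmetic
//  stays consistent on pre-compute-1.3 hardware.)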
+\r
+ void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, \r
+ cudaStream_t stream = 0)\r
+ {\r
+ typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, \r
+ cudaStream_t stream);\r
+\r
static const caller_t tab[8][8] =\r
{\r
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,\r
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},\r
\r
{0,0,0,0,0,0,0,0}\r
- };
-
+ };\r
+\r
caller_t func = tab[sdepth][ddepth];\r
if (!func)\r
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);\r
\r
- func(src, dst, alpha, beta, stream);
- }
-}}}
+ func(src, dst, alpha, beta, stream);\r
+ }\r
+}}}\r
T* maxval_buf = (T*)buf.ptr(1);\r
\r
minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
T* maxval_buf = (T*)buf.ptr(1);\r
\r
minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
T* maxval_buf = (T*)buf.ptr(1);\r
\r
minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
T* maxval_buf = (T*)buf.ptr(1);\r
\r
minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
\r
minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, \r
minloc_buf, maxloc_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
\r
minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, \r
minloc_buf, maxloc_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
\r
minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, \r
minloc_buf, maxloc_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
\r
minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, \r
minloc_buf, maxloc_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
uint* count_buf = (uint*)buf.ptr(0);\r
\r
countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
uint count;\r
uint* count_buf = (uint*)buf.ptr(0);\r
\r
countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
uint count;\r
case 1:\r
sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
case 2:\r
sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
case 3:\r
sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
case 4:\r
sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
}\r
cudaSafeCall(cudaThreadSynchronize());\r
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
R result[4] = {0, 0, 0, 0};\r
case 1:\r
sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
case 2:\r
sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
case 3:\r
sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
case 4:\r
sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
}\r
cudaSafeCall(cudaThreadSynchronize());\r
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
R result[4] = {0, 0, 0, 0};\r
case 1:\r
sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
case 2:\r
sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
case 3:\r
sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
case 4:\r
sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
break;\r
}\r
cudaSafeCall(cudaThreadSynchronize());\r
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
break;\r
}\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
R result[4] = {0, 0, 0, 0};\r
src[0].data, src[0].step,\r
src[1].data, src[1].step,\r
dst.rows, dst.cols, dst.data, dst.step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
src[1].data, src[1].step,\r
src[2].data, src[2].step,\r
dst.rows, dst.cols, dst.data, dst.step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
src[2].data, src[2].step,\r
src[3].data, src[3].step,\r
dst.rows, dst.cols, dst.data, dst.step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
src.data, src.step, src.rows, src.cols,\r
dst[0].data, dst[0].step,\r
dst[1].data, dst[1].step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
dst[0].data, dst[0].step,\r
dst[1].data, dst[1].step,\r
dst[2].data, dst[2].step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
dst[1].data, dst[1].step,\r
dst[2].data, dst[2].step,\r
dst[3].data, dst[3].step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);\r
\r
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0) \r
cudaSafeCall( cudaThreadSynchronize() );\r
};\r
grid.y = divUp(input.rows, threads.y);\r
\r
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0) \r
cudaSafeCall( cudaThreadSynchronize() ); \r
\r
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);\r
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0) \r
cudaSafeCall( cudaThreadSynchronize() ); \r
grid.y = divUp(left.rows, threads.y);\r
\r
comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(left.rows, threads.y);\r
\r
comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(left.rows, threads.y);\r
\r
comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(left.rows, threads.y);\r
\r
comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(left.rows, threads.y);\r
\r
comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(left.rows, threads.y);\r
\r
comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(dst_rows, threads.y);\r
\r
data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
int src_idx = (dst_idx + 1) & 1;\r
\r
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);\r
+ cudaSafeCall( cudaGetLastError() );\r
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);\r
+ cudaSafeCall( cudaGetLastError() );\r
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);\r
+ cudaSafeCall( cudaGetLastError() );\r
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
for(int t = 0; t < iters; ++t)\r
{\r
one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(disp.rows, threads.y);\r
\r
output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );\r
\r
init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
else\r
get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
+ \r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );\r
\r
callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(h, threads.y);\r
\r
init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,\r
- u_cur, d_cur, l_cur, r_cur,\r
- selected_disp_pyr_new, selected_disp_pyr_cur,\r
- data_cost_selected, data_cost,\r
- h, w, nr_plane, h2, w2, nr_plane2);\r
+ u_cur, d_cur, l_cur, r_cur,\r
+ selected_disp_pyr_new, selected_disp_pyr_cur,\r
+ data_cost_selected, data_cost,\r
+ h, w, nr_plane, h2, w2, nr_plane2);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
for(int t = 0; t < iters; ++t)\r
{\r
compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(disp.rows, threads.y);\r
\r
compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,\r
- disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);\r
+ disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
}\r
}\r
+\r
+ __global__ void fasthessian_old(PtrStepf hessianBuffer)\r
+ {\r
+ // Determine the indices in the Hessian buffer\r
+ int gridDim_y = gridDim.y / c_nIntervals;\r
+ int blockIdx_y = blockIdx.y % gridDim_y;\r
+ int blockIdx_z = blockIdx.y / gridDim_y;\r
+\r
+ int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;\r
+ int hidx_y = threadIdx.y + blockIdx_y * blockDim.y;\r
+ int hidx_z = blockIdx_z;\r
+\r
+ float fscale = calcScale(hidx_z);\r
+\r
+ // Compute the lookup location of the mask center\r
+ float x = hidx_x * c_step + c_border;\r
+ float y = hidx_y * c_step + c_border;\r
+\r
+ // Scale the mask dimensions according to the scale\r
+ if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)\r
+ {\r
+ float mask_width = c_mask_width * fscale;\r
+ float mask_height = c_mask_height * fscale;\r
+\r
+ // Compute the filter responses\r
+ float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);\r
+ float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);\r
+ float Dxy = evalDxy(x, y, fscale);\r
+ \r
+ // Combine the responses and store the Laplacian sign\r
+ float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);\r
+\r
+ if (Dxx + Dyy > 0.f)\r
+ setLastBit(result);\r
+ else\r
+ clearLastBit(result);\r
+\r
+ hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
+ }\r
+ }\r
\r
dim3 calcBlockSize(int nIntervals)\r
{\r
grid.y = divUp(y_size, threads.y);\r
\r
fasthessian<<<grid, threads>>>(hessianBuffer);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+ }\r
+\r
+ void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld)\r
+ {\r
+ dim3 threads(16, 16);\r
+\r
+ dim3 grid;\r
+ grid.x = divUp(x_size, threads.x);\r
+ grid.y = divUp(y_size, threads.y) * threadsOld.z;\r
+ \r
+ fasthessian_old<<<grid, threads>>>(hessianBuffer);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
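// Note on the _old variants: the launch grid stays 2D by folding the scale-interval
// index into its y dimension (grid.y = divUp(y_size, threads.y) * threadsOld.z above),
// and fasthessian_old unpacks it again via gridDim.y / c_nIntervals.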
nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);\r
else\r
nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);\r
+ \r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
DeviceReference<unsigned int> featureCounterWrapper(featureCounter);\r
\r
fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
grid.x = nFeatures;\r
\r
find_orientation<<<grid, threads>>>(features);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
\r
if (descriptors.cols == 64)\r
{\r
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
else\r
{\r
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall( cudaThreadSynchronize() );\r
\r
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+ }\r
+ }\r
+\r
+ __device__ void calc_dx_dy_old(float sdx[25], float sdy[25], const KeyPoint_GPU* features, int tid)\r
+ { \r
+ // get the interest point parameters (x, y, scale, strength, theta)\r
+ __shared__ float ipt[5];\r
+ if (tid < 5)\r
+ {\r
+ ipt[tid] = ((float*)&features[blockIdx.x])[tid];\r
+ }\r
+ __syncthreads();\r
+\r
+ float sin_theta, cos_theta;\r
+ sincosf(ipt[SF_ANGLE], &sin_theta, &cos_theta);\r
+\r
+ // Compute sampling points\r
+ // since grids are 2D, need to compute xBlock and yBlock indices\r
+ const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4\r
+ const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)\r
+ const int xIndex = xBlock * blockDim.x + threadIdx.x;\r
+ const int yIndex = yBlock * blockDim.y + threadIdx.y;\r
+\r
+ // Compute rotated sampling points\r
+ // (clockwise rotation since we are rotating the lattice)\r
+ // (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)\r
+ const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] \r
+ + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);\r
+ const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] \r
+ + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);\r
+\r
+ // gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)\r
+ // a b c\r
+ // d f\r
+ // g h i\r
+ const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);\r
+ const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);\r
+ const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);\r
+ const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);\r
+ const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);\r
+ const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);\r
+ const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]);\r
+ const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); \r
+\r
+ // compute axis-aligned HaarX, HaarY\r
+ // (could group the additions together into multiplications)\r
+ const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)\r
+ const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx\r
+ const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy\r
+\r
+ // rotate responses (store all dxs then all dys)\r
+ // - counterclockwise rotation to rotate back to zero orientation\r
+ sdx[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx\r
+ sdy[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy\r
+ }\r
+\r
+ __device__ void reduce_sum_old(float sdata[25], int tid)\r
+ {\r
+ // first step is to reduce from 25 to 16\r
+ if (tid < 9) // use 9 threads\r
+ sdata[tid] += sdata[tid + 16];\r
+ __syncthreads();\r
+\r
+ // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)\r
+ if (tid < 16)\r
+ {\r
+ volatile float* smem = sdata;\r
+\r
+ smem[tid] += smem[tid + 8];\r
+ smem[tid] += smem[tid + 4];\r
+ smem[tid] += smem[tid + 2];\r
+ smem[tid] += smem[tid + 1];\r
+ }\r
+ }\r
+\r
+ // Spawn 16 blocks per interest point\r
+ // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location\r
+ __global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)\r
+ {\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ \r
+ float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);\r
+ \r
+ // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
+ __shared__ float sdx[25]; \r
+ __shared__ float sdy[25];\r
+\r
+ calc_dx_dy_old(sdx, sdy, features, tid);\r
+ __syncthreads();\r
+\r
+ __shared__ float sabs[25];\r
+\r
+ sabs[tid] = fabs(sdx[tid]); // |dx| array\r
+ __syncthreads();\r
+\r
+ reduce_sum_old(sdx, tid);\r
+ reduce_sum_old(sdy, tid);\r
+ reduce_sum_old(sabs, tid);\r
+\r
+ // write dx, dy, |dx|\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[0] = sdx[0];\r
+ descriptors_block[1] = sdy[0];\r
+ descriptors_block[2] = sabs[0];\r
+ }\r
+ __syncthreads();\r
+\r
+ sabs[tid] = fabs(sdy[tid]); // |dy| array\r
+ __syncthreads();\r
+ \r
+ reduce_sum_old(sabs, tid);\r
+\r
+ // write |dy|\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[3] = sabs[0];\r
+ }\r
+ }\r
+\r
+ // Spawn 16 blocks per interest point\r
+ // - computes the unnormalized 128-dimensional descriptor and writes it into d_descriptors at the correct location\r
+ __global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features)\r
+ {\r
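+ // each of the 16 blocks writes 8 floats per sub-region (dx/|dx| split by the sign of dy, then dy/|dy| split by the sign of dx) at offset 8 * blockIdx.y\r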
+ float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
+\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ \r
+ // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
+ __shared__ float sdx[25]; \r
+ __shared__ float sdy[25];\r
+ \r
+ calc_dx_dy_old(sdx, sdy, features, tid);\r
+ __syncthreads();\r
+\r
+ // sum (reduce) 5x5 area response\r
+ __shared__ float sd1[25];\r
+ __shared__ float sd2[25];\r
+ __shared__ float sdabs1[25]; \r
+ __shared__ float sdabs2[25];\r
+\r
+ if (sdy[tid] >= 0)\r
+ {\r
+ sd1[tid] = sdx[tid];\r
+ sdabs1[tid] = fabs(sdx[tid]);\r
+ sd2[tid] = 0;\r
+ sdabs2[tid] = 0;\r
+ }\r
+ else\r
+ {\r
+ sd1[tid] = 0;\r
+ sdabs1[tid] = 0;\r
+ sd2[tid] = sdx[tid];\r
+ sdabs2[tid] = fabs(sdx[tid]);\r
+ }\r
+ __syncthreads();\r
+ \r
+ reduce_sum_old(sd1, tid);\r
+ reduce_sum_old(sd2, tid);\r
+ reduce_sum_old(sdabs1, tid);\r
+ reduce_sum_old(sdabs2, tid);\r
+\r
+ // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[0] = sd1[0];\r
+ descriptors_block[1] = sdabs1[0];\r
+ descriptors_block[2] = sd2[0];\r
+ descriptors_block[3] = sdabs2[0];\r
+ }\r
+ __syncthreads();\r
+\r
+ if (sdx[tid] >= 0)\r
+ {\r
+ sd1[tid] = sdy[tid];\r
+ sdabs1[tid] = fabs(sdy[tid]);\r
+ sd2[tid] = 0;\r
+ sdabs2[tid] = 0;\r
+ }\r
+ else\r
+ {\r
+ sd1[tid] = 0;\r
+ sdabs1[tid] = 0;\r
+ sd2[tid] = sdy[tid];\r
+ sdabs2[tid] = fabs(sdy[tid]);\r
+ }\r
+ __syncthreads();\r
+ \r
+ reduce_sum_old(sd1, tid);\r
+ reduce_sum_old(sd2, tid);\r
+ reduce_sum_old(sdabs1, tid);\r
+ reduce_sum_old(sdabs2, tid);\r
+\r
+ // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[4] = sd1[0];\r
+ descriptors_block[5] = sdabs1[0];\r
+ descriptors_block[6] = sd2[0];\r
+ descriptors_block[7] = sdabs2[0];\r
+ }\r
+ }\r
+\r
+ void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)\r
+ {\r
+ // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D\r
+ \r
+ if (descriptors.cols == 64)\r
+ {\r
+ compute_descriptors64_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+\r
+ normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+ }\r
+ else\r
+ {\r
+ compute_descriptors128_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features); \r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+\r
+ normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors); \r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
}\r
void cv::gpu::Stream::enqueueUpload(const CudaMem& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }\r
void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }\r
void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }\r
-void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }\r
-void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }\r
+void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }\r
+void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }\r
void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }\r
\r
#else /* !defined (HAVE_CUDA) */\r
{ \r
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);\r
\r
- void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);\r
- void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);\r
\r
void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
}\r
size_t bwidth = src.cols * src.elemSize();\r
cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );\r
};\r
+\r
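+ // the Scalar is converted to the matrix element type on the host before the typed device kernel is launched\r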
+ template <typename T>\r
+ void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)\r
+ {\r
+ Scalar_<T> sf = s;\r
+ matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);\r
+ }\r
+\r
+ template <typename T>\r
+ void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)\r
+ {\r
+ Scalar_<T> sf = s;\r
+ matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);\r
+ }\r
}\r
\r
CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; };\r
void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }\r
void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }\r
\r
-void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val)\r
+void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)\r
{\r
- matrix_operations::set_to_without_mask(src, src.depth(), val.val, src.channels(), impl->stream);\r
+ typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);\r
+ static const set_caller_t set_callers[] =\r
+ {\r
+ kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,\r
+ kernelSet<int>, kernelSet<float>, kernelSet<double>\r
+ };\r
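+ // the table is indexed by matrix depth: CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F\r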
+ set_callers[src.depth()](src, val, impl->stream);\r
}\r
\r
-void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask)\r
+void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)\r
{\r
- matrix_operations::set_to_with_mask(src, src.depth(), val.val, mask, src.channels(), impl->stream);\r
+ typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);\r
+ static const set_caller_t set_callers[] =\r
+ {\r
+ kernelSetMask<uchar>, kernelSetMask<schar>, kernelSetMask<ushort>, kernelSetMask<short>,\r
+ kernelSetMask<int>, kernelSetMask<float>, kernelSetMask<double>\r
+ };\r
+ set_callers[src.depth()](src, val, mask, impl->stream);\r
}\r
\r
void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)\r
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
\r
template <typename T>\r
- void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+ void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
\r
template <typename T>\r
- void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+ void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
}}}\r
\r
namespace\r
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)\r
{\r
dst.create(src1.size(), src1.type());\r
- mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);\r
+ mathfunc::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
}\r
\r
template <typename T>\r
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)\r
{\r
dst.create(src1.size(), src1.type());\r
- mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);\r
+ mathfunc::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
}\r
}\r
\r
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);\r
static const func_t funcs[] = \r
{\r
- min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>, \r
+ min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>, \r
min_caller<float>, min_caller<double>\r
};\r
funcs[src1.depth()](src1, src2, dst, 0);\r
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);\r
static const func_t funcs[] = \r
{\r
- min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>, \r
+ min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>, \r
min_caller<float>, min_caller<double>\r
};\r
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));\r
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);\r
static const func_t funcs[] = \r
{\r
- min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>, \r
+ min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>, \r
min_caller<float>, min_caller<double>\r
};\r
funcs[src1.depth()](src1, src2, dst, 0);\r
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);\r
static const func_t funcs[] = \r
{\r
- min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>, \r
+ min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>, \r
min_caller<float>, min_caller<double>\r
};\r
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));\r
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);\r
static const func_t funcs[] = \r
{\r
- max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>, \r
+ max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>, \r
max_caller<float>, max_caller<double>\r
};\r
funcs[src1.depth()](src1, src2, dst, 0);\r
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);\r
static const func_t funcs[] = \r
{\r
- max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>, \r
+ max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>, \r
max_caller<float>, max_caller<double>\r
};\r
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));\r
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);\r
static const func_t funcs[] = \r
{\r
- max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>, \r
+ max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>, \r
max_caller<float>, max_caller<double>\r
};\r
funcs[src1.depth()](src1, src2, dst, 0);\r
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);\r
static const func_t funcs[] = \r
{\r
- max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>, \r
+ max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>, \r
max_caller<float>, max_caller<double>\r
};\r
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));\r
namespace cv { namespace gpu { namespace mathfunc\r
{\r
template <typename T>\r
- void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,\r
+ void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,\r
cudaStream_t stream);\r
}}}\r
\r
namespace\r
{\r
+ template <typename T>\r
void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, \r
- cudaStream_t stream = 0)\r
+ cudaStream_t stream)\r
{\r
- using namespace cv::gpu::mathfunc;\r
+ mathfunc::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);\r
+ }\r
+}\r
\r
- typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,\r
+double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type)\r
+{\r
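+ // THRESH_TRUNC on CV_32FC1 maps directly to an NPP primitive; all other depth/type combinations go through the templated device kernel\r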
+ if (src.type() == CV_32FC1 && type == THRESH_TRUNC)\r
+ {\r
+ dst.create(src.size(), src.type());\r
+\r
+ NppiSize sz;\r
+ sz.width = src.cols;\r
+ sz.height = src.rows;\r
+\r
+ nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), src.step,\r
+ dst.ptr<Npp32f>(), dst.step, sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+ }\r
+ else\r
+ {\r
+ typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, \r
cudaStream_t stream);\r
\r
static const caller_t callers[] = \r
{\r
- threshold_gpu<unsigned char>, threshold_gpu<signed char>, \r
- threshold_gpu<unsigned short>, threshold_gpu<short>, threshold_gpu<int>, threshold_gpu<float>, 0\r
+ threshold_caller<unsigned char>, threshold_caller<signed char>, \r
+ threshold_caller<unsigned short>, threshold_caller<short>, \r
+ threshold_caller<int>, threshold_caller<float>, threshold_caller<double>\r
};\r
\r
- CV_Assert(src.channels() == 1 && src.depth() < CV_64F);\r
+ CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);\r
CV_Assert(type <= THRESH_TOZERO_INV);\r
\r
dst.create(src.size(), src.type());\r
maxVal = cvRound(maxVal);\r
}\r
\r
- callers[src.depth()](src, dst, static_cast<float>(thresh), static_cast<float>(maxVal), type, stream);\r
+ callers[src.depth()](src, dst, thresh, maxVal, type, 0);\r
}\r
+\r
+ return thresh;\r
}\r
\r
-double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type)\r
+double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, const Stream& stream)\r
{\r
- if (src.type() == CV_32FC1 && type == THRESH_TRUNC)\r
+ typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, \r
+ cudaStream_t stream);\r
+\r
+ static const caller_t callers[] = \r
{\r
- dst.create(src.size(), src.type());\r
+ threshold_caller<unsigned char>, threshold_caller<signed char>, \r
+ threshold_caller<unsigned short>, threshold_caller<short>, \r
+ threshold_caller<int>, threshold_caller<float>, threshold_caller<double>\r
+ };\r
\r
- NppiSize sz;\r
- sz.width = src.cols;\r
- sz.height = src.rows;\r
+ CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);\r
+ CV_Assert(type <= THRESH_TOZERO_INV);\r
\r
- nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), src.step,\r
- dst.ptr<Npp32f>(), dst.step, sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );\r
+ dst.create(src.size(), src.type());\r
\r
- cudaSafeCall( cudaThreadSynchronize() );\r
- }\r
- else\r
+ if (src.depth() != CV_32F)\r
{\r
- threshold_caller(src, dst, thresh, maxVal, type);\r
+ thresh = cvFloor(thresh);\r
+ maxVal = cvRound(maxVal);\r
}\r
\r
- return thresh;\r
-}\r
-\r
-double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, const Stream& stream)\r
-{\r
- threshold_caller(src, dst, thresh, maxVal, type, StreamAccessor::getStream(stream));\r
+ callers[src.depth()](src, dst, thresh, maxVal, type, StreamAccessor::getStream(stream));\r
return thresh;\r
}\r
\r
\r
void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria)\r
{\r
+ CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));\r
+\r
if( src.empty() )\r
CV_Error( CV_StsBadArg, "The input image is empty" );\r
\r
\r
void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria)\r
{\r
+ CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));\r
+\r
if( src.empty() )\r
CV_Error( CV_StsBadArg, "The input image is empty" );\r
\r
{\r
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);\r
\r
- void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);\r
- void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);\r
\r
void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
}\r
}\r
};\r
\r
+ template <typename T>\r
void kernelSet(GpuMat& src, const Scalar& s)\r
{\r
- matrix_operations::set_to_without_mask(src, src.depth(), s.val, src.channels());\r
+ Scalar_<T> sf = s;\r
+ matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);\r
}\r
\r
template<int SDEPTH, int SCN> struct NppSetMaskFunc\r
}\r
};\r
\r
+ template <typename T>\r
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)\r
{\r
- matrix_operations::set_to_with_mask(src, src.depth(), s.val, mask, src.channels());\r
+ Scalar_<T> sf = s;\r
+ matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);\r
}\r
}\r
\r
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s);\r
static const set_caller_t set_callers[8][4] =\r
{\r
- {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet,kernelSet,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},\r
- {kernelSet,kernelSet,kernelSet,kernelSet},\r
- {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet,kernelSet,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},\r
- {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet,kernelSet,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},\r
- {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet,kernelSet,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},\r
- {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet,kernelSet,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},\r
- {kernelSet,kernelSet,kernelSet,kernelSet},\r
+ {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},\r
+ {kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},\r
+ {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet<ushort>,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},\r
+ {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet<short>,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},\r
+ {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},\r
+ {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},\r
+ {kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>},\r
{0,0,0,0}\r
};\r
set_callers[depth()][channels()-1](*this, s);\r
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask);\r
static const set_caller_t set_callers[8][4] =\r
{\r
- {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},\r
- {kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},\r
- {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},\r
- {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},\r
- {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},\r
- {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},\r
- {kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},\r
+ {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<uchar>,kernelSetMask<uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},\r
+ {kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>},\r
+ {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<ushort>,kernelSetMask<ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},\r
+ {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},\r
+ {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},\r
+ {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},\r
+ {kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>},\r
{0,0,0,0}\r
};\r
set_callers[depth()][channels()-1](*this, s, mask);\r
\r
void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)\r
{\r
+ CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));\r
+\r
CV_Assert(src.type() == CV_8UC4);\r
const int nrows = src.rows;\r
const int ncols = src.cols;\r
//\r
//M*/\r
\r
+#include "opencv2/gpu/device/saturate_cast.hpp"\r
+#include "opencv2/gpu/device/vecmath.hpp"\r
+\r
namespace cv \r
{ \r
namespace gpu \r
{\r
struct BrdReflect101 \r
{\r
- BrdReflect101(int len): last(len - 1) {}\r
+ explicit BrdReflect101(int len): last(len - 1) {}\r
\r
__device__ int idx_low(int i) const\r
{\r
\r
__device__ int idx(int i) const\r
{\r
- return abs(idx_high(i));\r
+ return idx_low(idx_high(i));\r
}\r
\r
bool is_range_safe(int mini, int maxi) const \r
return -last <= mini && maxi <= 2 * last;\r
}\r
\r
+ private:\r
int last;\r
};\r
\r
\r
- template <typename T>\r
+ template <typename D>\r
struct BrdRowReflect101: BrdReflect101\r
{\r
- BrdRowReflect101(int len): BrdReflect101(len) {}\r
+ explicit BrdRowReflect101(int len): BrdReflect101(len) {}\r
\r
- __device__ float at_low(int i, const T* data) const \r
+ template <typename T>\r
+ __device__ D at_low(int i, const T* data) const \r
{\r
- return data[idx_low(i)];\r
+ return saturate_cast<D>(data[idx_low(i)]);\r
}\r
\r
- __device__ float at_high(int i, const T* data) const \r
+ template <typename T>\r
+ __device__ D at_high(int i, const T* data) const \r
{\r
- return data[idx_high(i)];\r
+ return saturate_cast<D>(data[idx_high(i)]);\r
}\r
};\r
\r
\r
- template <typename T>\r
+ template <typename D>\r
struct BrdColReflect101: BrdReflect101\r
{\r
BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}\r
\r
- __device__ float at_low(int i, const T* data) const \r
+ template <typename T>\r
+ __device__ D at_low(int i, const T* data) const \r
{\r
- return data[idx_low(i) * step];\r
+ return saturate_cast<D>(data[idx_low(i) * step]);\r
}\r
\r
- __device__ float at_high(int i, const T* data) const \r
+ template <typename T>\r
+ __device__ D at_high(int i, const T* data) const \r
{\r
- return data[idx_high(i) * step];\r
+ return saturate_cast<D>(data[idx_high(i) * step]);\r
}\r
\r
+ private:\r
int step;\r
};\r
\r
\r
struct BrdReplicate\r
{\r
- BrdReplicate(int len): last(len - 1) {}\r
+ explicit BrdReplicate(int len): last(len - 1) {}\r
\r
__device__ int idx_low(int i) const\r
{\r
\r
__device__ int idx(int i) const\r
{\r
- return max(min(i, last), 0);\r
+ return idx_low(idx_high(i));\r
}\r
\r
bool is_range_safe(int mini, int maxi) const \r
return true;\r
}\r
\r
+ private:\r
int last;\r
};\r
\r
\r
- template <typename T>\r
+ template <typename D>\r
struct BrdRowReplicate: BrdReplicate\r
{\r
- BrdRowReplicate(int len): BrdReplicate(len) {}\r
+ explicit BrdRowReplicate(int len): BrdReplicate(len) {}\r
\r
- __device__ float at_low(int i, const T* data) const \r
+ template <typename T>\r
+ __device__ D at_low(int i, const T* data) const \r
{\r
- return data[idx_low(i)];\r
+ return saturate_cast<D>(data[idx_low(i)]);\r
}\r
\r
- __device__ float at_high(int i, const T* data) const \r
+ template <typename T>\r
+ __device__ D at_high(int i, const T* data) const \r
{\r
- return data[idx_high(i)];\r
+ return saturate_cast<D>(data[idx_high(i)]);\r
}\r
};\r
\r
\r
- template <typename T>\r
+ template <typename D>\r
struct BrdColReplicate: BrdReplicate\r
{\r
BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}\r
\r
- __device__ float at_low(int i, const T* data) const \r
+ template <typename T>\r
+ __device__ D at_low(int i, const T* data) const \r
{\r
- return data[idx_low(i) * step];\r
+ return saturate_cast<D>(data[idx_low(i) * step]);\r
}\r
\r
- __device__ float at_high(int i, const T* data) const \r
+ template <typename T>\r
+ __device__ D at_high(int i, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_high(i) * step]);\r
+ }\r
+\r
+ private:\r
+ int step;\r
+ };\r
+\r
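+ // constant border handling for rows: out-of-range samples return the fixed value 'val' instead of reflecting or replicating\r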
+ template <typename D>\r
+ struct BrdRowConstant\r
+ {\r
+ explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}\r
+\r
+ template <typename T>\r
+ __device__ D at_low(int i, const T* data) const \r
{\r
- return data[idx_high(i) * step];\r
+ return i >= 0 ? saturate_cast<D>(data[i]) : val;\r
+ }\r
+\r
+ template <typename T>\r
+ __device__ D at_high(int i, const T* data) const \r
+ {\r
+ return i < len ? saturate_cast<D>(data[i]) : val;\r
+ }\r
+\r
+ bool is_range_safe(int mini, int maxi) const \r
+ {\r
+ return true;\r
}\r
+\r
+ private:\r
+ int len;\r
+ D val;\r
+ };\r
+\r
+ template <typename D>\r
+ struct BrdColConstant\r
+ {\r
+ BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}\r
+\r
+ template <typename T>\r
+ __device__ D at_low(int i, const T* data) const \r
+ {\r
+ return i >= 0 ? saturate_cast<D>(data[i * step]) : val;\r
+ }\r
+\r
+ template <typename T>\r
+ __device__ D at_high(int i, const T* data) const \r
+ {\r
+ return i < len ? saturate_cast<D>(data[i * step]) : val;\r
+ }\r
+\r
+ bool is_range_safe(int mini, int maxi) const \r
+ {\r
+ return true;\r
+ }\r
+\r
+ private:\r
+ int len;\r
int step;\r
+ D val;\r
};\r
}\r
}\r
grid.y = divUp(src.rows, threads.y); \r
\r
device::transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() ); \r
grid.y = divUp(src1.rows, threads.y); \r
\r
device::transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() ); \r
grid.y = divUp(src.rows, threads.y); \r
\r
device::transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() );\r
grid.y = divUp(src1.rows, threads.y); \r
\r
device::transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaThreadSynchronize() ); \r
dim3 calcBlockSize(int nIntervals);\r
\r
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);\r
+ void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld);\r
\r
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
int x_size, int y_size, bool use_mask, const dim3& threads);\r
void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);\r
\r
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);\r
+ void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);\r
}}}\r
\r
using namespace cv::gpu::surf;\r
\r
void detectKeypoints(GpuMat& keypoints)\r
{\r
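+ // select the fast-hessian kernel variant: devices below compute capability 1.3 fall back to the older implementation\r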
+ typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);\r
+ const fasthessian_t fasthessian = \r
+ DeviceInfo().supports(COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old;\r
+\r
dim3 threads = calcBlockSize(nIntervals);\r
for(int octave = 0; octave < nOctaves; ++octave)\r
{\r
uploadConstant("cv::gpu::surf::c_border", border);\r
uploadConstant("cv::gpu::surf::c_step", step);\r
\r
- fasthessian_gpu(hessianBuffer, x_size, y_size, threads);\r
+ fasthessian(hessianBuffer, x_size, y_size, threads);\r
\r
// Reset the candidate count.\r
maxCounter = 0;\r
\r
maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));\r
\r
- fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,\r
- featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);\r
+ if (maxCounter > 0)\r
+ {\r
+ fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,\r
+ featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);\r
\r
- featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));\r
+ featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));\r
+ }\r
}\r
\r
if (featureCounter > 0)\r
\r
void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)\r
{\r
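+ // likewise, devices below compute capability 1.3 use the older descriptor kernels\r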
+ typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors, \r
+ const KeyPoint_GPU* features, int nFeatures);\r
+\r
+ const compute_descriptors_t compute_descriptors = \r
+ DeviceInfo().supports(COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;\r
+\r
if (keypoints.cols > 0)\r
{\r
descriptors.create(keypoints.cols, descriptorSize, CV_32F);\r
- compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);\r
+ compute_descriptors(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);\r
}\r
}\r
\r
\r
void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train )\r
{\r
+ bool atomics_ok = TargetArchs::builtWith(ATOMICS) && DeviceInfo().supports(ATOMICS);\r
+ if (!atomics_ok)\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "\nAtomics support in both the compiled code and the device is required for radiusMatch (CC >= 1.1)");\r
+ ts->set_failed_test_info(CvTS::FAIL_GENERIC);\r
+ return;\r
+ }\r
+\r
dmatcher.clear();\r
// test const version of match()\r
{\r
\r
void CV_GpuBruteForceMatcherTest::run(int)\r
{\r
- emptyDataTest();\r
-\r
- dataTest(50);\r
- dataTest(64);\r
- dataTest(100);\r
- dataTest(128);\r
- dataTest(200);\r
- dataTest(256);\r
- dataTest(300);\r
+ try\r
+ {\r
+ emptyDataTest();\r
+\r
+ dataTest(50);\r
+ dataTest(64);\r
+ dataTest(100);\r
+ dataTest(128);\r
+ dataTest(200);\r
+ dataTest(256);\r
+ dataTest(300);\r
+ }\r
+ catch(cv::Exception& e)\r
+ {\r
+ if (!check_and_treat_gpu_exception(e, ts))\r
+ throw; \r
+ return;\r
+ }\r
}\r
\r
CV_GpuBruteForceMatcherTest CV_GpuBruteForceMatcher_test;\r
return;\r
}\r
\r
- if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)\r
+ if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f)\r
{\r
ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");\r
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );\r
\r
void CV_GPU_SURFTest::run( int /*start_from*/ )\r
{\r
- SURF_GPU fdetector;\r
+ try\r
+ {\r
+ SURF_GPU fdetector;\r
\r
- emptyDataTest(fdetector);\r
- regressionTest(fdetector);\r
+ emptyDataTest(fdetector);\r
+ regressionTest(fdetector);\r
+ }\r
+ catch(cv::Exception& e)\r
+ {\r
+ if (!check_and_treat_gpu_exception(e, ts))\r
+ throw; \r
+ return;\r
+ }\r
}\r
\r
CV_GPU_SURFTest CV_GPU_SURF_test;\r
CvTS test_system("gpu");
-const char* blacklist[] =
-{
- "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
- 0
-};
+//const char* blacklist[] =
+//{
+// "GPU-NVidia",
+// 0
+//};
int main( int argc, char** argv )
{
- return test_system.run( argc, argv, blacklist );
+ return test_system.run( argc, argv );
}
/* End of file. */
#include <iostream>\r
#include <string>\r
\r
+using namespace cv;\r
+using namespace cv::gpu;\r
+\r
\r
struct CV_GpuMeanShiftTest : public CvTest\r
{\r
\r
void run(int)\r
{\r
+ bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);\r
+ if (!cc12_ok)\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");\r
+ ts->set_failed_test_info(CvTS::FAIL_GENERIC);\r
+ return;\r
+ }\r
+\r
int spatialRad = 30;\r
int colorRad = 30;\r
\r
\r
void run(int)\r
{\r
+ bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);\r
+ if (!cc12_ok)\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");\r
+ ts->set_failed_test_info(CvTS::FAIL_GENERIC);\r
+ return;\r
+ }\r
+\r
int spatialRad = 30;\r
int colorRad = 30;\r
\r
{\r
try \r
{\r
+ bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);\r
+ if (!cc12_ok)\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");\r
+ ts->set_failed_test_info(CvTS::FAIL_GENERIC);\r
+ return;\r
+ }\r
+\r
Mat img_rgb = imread(string(ts->get_data_path()) + "meanshift/cones.png");\r
if (img_rgb.empty())\r
{\r
Mat cpumatdst;
GpuMat gpumatdst;
- cpumatsrc.convertTo(cpumatdst, dst_type);
- gpumatsrc.convertTo(gpumatdst, dst_type);
+ cpumatsrc.convertTo(cpumatdst, dst_type, 0.5, 3.0);
+ gpumatsrc.convertTo(gpumatdst, dst_type, 0.5, 3.0);
double r = norm(cpumatdst, gpumatdst, NORM_INF);
if (r > 1)
{
ts->printf(CvTS::LOG,
- "\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %d\n",
+ "\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %f\n",
types_str[i], c, types_str[j], r);
passed = false;
}