\r
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
}\r
- } \r
-\r
- void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size)\r
+ }\r
+ \r
+ dim3 calcBlockSize(int nIntervals) // Builds a dim3 block shape with z = nIntervals and x * y * z kept within a 512-thread budget.
{
- dim3 threads;
- threads.x = 16;
- threads.y = 8;
+ int threadsPerBlock = 512; // total thread budget per block; divided down step by step below
+ 
+ dim3 threads; 
threads.z = nIntervals;
+ threadsPerBlock /= nIntervals; // budget left for the x * y plane (integer division; nIntervals must be non-zero)
+ if (threadsPerBlock >= 48) // 48 == 16 * 3, so picking x = 16 still leaves y >= 3
+ threads.x = 16;
+ else
+ threads.x = 8; // narrower block so y keeps more of the remaining budget
+ threadsPerBlock /= threads.x;
+ threads.y = threadsPerBlock; // NOTE(review): nonmaxonly_gpu passes threads.y - 2 to divUp, so y presumably must stay >= 3; y falls to 2 once nIntervals reaches 22
+ 
+ return threads;
+ }
\r
+ void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads) // Launches the fasthessian kernel with the caller-supplied block shape (grid x/y from divUp of x_size/y_size), then blocks on cudaThreadSynchronize.
+ {
dim3 grid;
grid.x = divUp(x_size, threads.x);
grid.y = divUp(y_size, threads.y);
- grid.z = 1;
-
+ 
fasthessian<<<grid, threads>>>(hessianBuffer);

cudaSafeCall( cudaThreadSynchronize() );
}
\r
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
- int nIntervals, int x_size, int y_size, bool use_mask)\r
+ int x_size, int y_size, bool use_mask, const dim3& threads)\r
{\r
- dim3 threads;\r
- threads.x = 16;\r
- threads.y = 8;\r
- threads.z = nIntervals;\r
-\r
dim3 grid;\r
grid.x = divUp(x_size, threads.x - 2);\r
grid.y = divUp(y_size, threads.y - 2);\r
- grid.z = 1;\r
\r
const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);\r
\r
\r
dim3 grid;\r
grid.x = maxCounter;\r
- grid.y = 1; \r
- grid.z = 1;\r
\r
DeviceReference<unsigned int> featureCounterWrapper(featureCounter);\r
\r
// - SURF says to only use a circle, but the branching logic would slow it down\r
// - Gaussian weighting should reduce the effects of the outer points anyway\r
if (tid2 < 169)\r
+\r
{\r
dx -= texLookups[threadIdx.x ][threadIdx.y ];\r
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];\r
\r
dim3 grid;\r
grid.x = nFeatures;\r
- grid.y = 1;\r
- grid.z = 1;\r
\r
find_orientation<<<grid, threads>>>(features);\r
cudaSafeCall( cudaThreadSynchronize() );\r
#else /* !defined (HAVE_CUDA) */\r
\r
namespace cv { namespace gpu { namespace surf\r
-{ \r
- void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);\r
+{\r
+ dim3 calcBlockSize(int nIntervals);\r
+ \r
+ void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);\r
\r
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
- int nIntervals, int x_size, int y_size, bool use_mask);\r
+ int x_size, int y_size, bool use_mask, const dim3& threads);\r
\r
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, \r
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);\r
{\r
CV_Assert(!img.empty() && img.type() == CV_8UC1);\r
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));\r
- CV_Assert(nOctaves > 0 && nIntervals > 2);\r
+ CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);\r
CV_Assert(DeviceInfo().has(ATOMICS));\r
\r
max_features = static_cast<int>(img.size().area() * featuresRatio);\r
\r
void detectKeypoints(GpuMat& keypoints)\r
{\r
+ dim3 threads = calcBlockSize(nIntervals);\r
for(int octave = 0; octave < nOctaves; ++octave)\r
{\r
int step = initialStep * (1 << octave);\r
uploadConstant("cv::gpu::surf::c_border", border);\r
uploadConstant("cv::gpu::surf::c_step", step);\r
\r
- fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size);\r
+ fasthessian_gpu(hessianBuffer, x_size, y_size, threads);\r
\r
// Reset the candidate count.\r
maxCounter = 0;\r
\r
- nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask); \r
+ nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads); \r
\r
maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));\r
\r
GpuMat descriptors1GPU, descriptors2GPU;
surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
+
+ cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
+ cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
// matching descriptors
BruteForceMatcher_GPU< L2<float> > matcher;
// drawing the results
Mat img_matches;
drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
+
+ namedWindow("matches", 0);
imshow("matches", img_matches);
waitKey(0);
assert(minDist >= 0);\r
if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))\r
{\r
+ ts->printf(CvTS::LOG, "Bad keypoints accuracy.\n");\r
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );\r
return;\r
}\r
\r
if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)\r
{\r
+ ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");\r
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );\r
return;\r
}\r