#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
-#include "utility.hpp"
+#include "reduce.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
// Sum-reduce each thread's partial accumulator (the enclosing functor's
// mySum member) across THREAD_DIM threads, using smem as scratch
// (presumably needs >= THREAD_DIM ints — confirm against reduce.hpp).
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
    reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
// Sum-reduce each thread's partial accumulator (mySum) across THREAD_DIM
// threads, using smem as scratch space.
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
    reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
// Sum-reduce each thread's partial accumulator (mySum) across THREAD_DIM
// threads, using smem as scratch space.
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
    reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
// Sum-reduce each thread's partial accumulator (mySum) across THREAD_DIM
// threads, using smem as scratch space.
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
    reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
#if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx)
{
+ #if __CUDA_ARCH__ >= 300
+ (void) s_distance;
+ (void) s_trainIdx;
+
+ float d1, d2;
+ int i1, i2;
+
+ #pragma unroll
+ for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
+ {
+ d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
+ d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
+ i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
+ i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
+
+ if (bestDistance1 < d1)
+ {
+ if (d1 < bestDistance2)
+ {
+ bestDistance2 = d1;
+ bestTrainIdx2 = i1;
+ }
+ }
+ else
+ {
+ bestDistance2 = bestDistance1;
+ bestTrainIdx2 = bestTrainIdx1;
+
+ bestDistance1 = d1;
+ bestTrainIdx1 = i1;
+
+ if (d2 < bestDistance2)
+ {
+ bestDistance2 = d2;
+ bestTrainIdx2 = i2;
+ }
+ }
+ }
+ #else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2;
+ #endif
}
template <int BLOCK_SIZE>
int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
+ #if __CUDA_ARCH__ >= 300
+ (void) s_distance;
+ (void) s_trainIdx;
+ (void) s_imgIdx;
+
+ float d1, d2;
+ int i1, i2;
+ int j1, j2;
+
+ #pragma unroll
+ for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
+ {
+ d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
+ d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
+ i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
+ i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
+ j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
+ j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
+
+ if (bestDistance1 < d1)
+ {
+ if (d1 < bestDistance2)
+ {
+ bestDistance2 = d1;
+ bestTrainIdx2 = i1;
+ bestImgIdx2 = j1;
+ }
+ }
+ else
+ {
+ bestDistance2 = bestDistance1;
+ bestTrainIdx2 = bestTrainIdx1;
+ bestImgIdx2 = bestImgIdx1;
+
+ bestDistance1 = d1;
+ bestTrainIdx1 = i1;
+ bestImgIdx1 = j1;
+
+ if (d2 < bestDistance2)
+ {
+ bestDistance2 = d2;
+ bestTrainIdx2 = i2;
+ bestImgIdx2 = j2;
+ }
+ }
+ }
+ #else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2;
+ #endif
}
///////////////////////////////////////////////////////////////////////////////
s_trainIdx[threadIdx.x] = bestIdx;
__syncthreads();
- reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());
+ reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
if (threadIdx.x == 0)
{
}}} // namespace cv { namespace gpu { namespace device {
#endif /* CUDA_DISABLER */
#if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
- s_distance[threadIdx.x] = bestDistance;
- s_trainIdx[threadIdx.x] = bestTrainIdx;
-
- __syncthreads();
-
- reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
+ reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
}
template <int BLOCK_SIZE>
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE;
- s_distance[threadIdx.x] = bestDistance;
- s_trainIdx[threadIdx.x] = bestTrainIdx;
- s_imgIdx [threadIdx.x] = bestImgIdx;
-
- __syncthreads();
-
- reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
+ reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
}
///////////////////////////////////////////////////////////////////////////////
}}} // namespace cv { namespace gpu { namespace device {
#endif /* CUDA_DISABLER */
#if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
- #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
-
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
-
- #endif
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
- #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
-
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
-
- #endif
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */