\r
///////////////////////////////////////////////////////////////////////////////\r
// Reduce Sum\r
- \r
- template <int BLOCK_DIM_X> __device__ void reduceSum(float* sdiff_row, float& mySum);\r
\r
- template <> __device__ void reduceSum<16>(float* sdiff_row, float& mySum)\r
+ template <int BLOCK_DIM_X> struct SumReductor; \r
+ template <> struct SumReductor<16>\r
{\r
- volatile float* smem = sdiff_row;\r
-\r
- smem[threadIdx.x] = mySum;\r
- \r
- if (threadIdx.x < 8) \r
+ template <typename T> static __device__ void reduce(T* sdiff_row, T& mySum)\r
{\r
- smem[threadIdx.x] = mySum += smem[threadIdx.x + 8]; \r
- smem[threadIdx.x] = mySum += smem[threadIdx.x + 4]; \r
- smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];\r
- smem[threadIdx.x] = mySum += smem[threadIdx.x + 1]; \r
+ volatile T* smem = sdiff_row;\r
+\r
+ smem[threadIdx.x] = mySum;\r
+ \r
+ if (threadIdx.x < 8) \r
+ {\r
+ smem[threadIdx.x] = mySum += smem[threadIdx.x + 8]; \r
+ smem[threadIdx.x] = mySum += smem[threadIdx.x + 4]; \r
+ smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];\r
+ smem[threadIdx.x] = mySum += smem[threadIdx.x + 1]; \r
+ }\r
}\r
- }\r
+ };\r
\r
///////////////////////////////////////////////////////////////////////////////\r
// Distance\r
\r
- class L1Dist\r
+ template <typename T> class L1Dist\r
{\r
public:\r
+ typedef int ResultType;\r
+ typedef int ValueType;\r
+\r
+ __device__ L1Dist() : mySum(0) {}\r
+\r
+ __device__ void reduceIter(int val1, int val2)\r
+ {\r
+ mySum = __sad(val1, val2, mySum);\r
+ }\r
+\r
+ template <int BLOCK_DIM_X> __device__ void reduceAll(int* sdiff_row)\r
+ {\r
+ SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);\r
+ }\r
+\r
+ __device__ operator int() const\r
+ {\r
+ return mySum;\r
+ }\r
+\r
+ private:\r
+ int mySum;\r
+ };\r
+ template <> class L1Dist<float>\r
+ {\r
+ public:\r
+ typedef float ResultType;\r
+ typedef float ValueType;\r
+\r
__device__ L1Dist() : mySum(0.0f) {}\r
\r
__device__ void reduceIter(float val1, float val2)\r
mySum += fabs(val1 - val2);\r
}\r
\r
- template <int BLOCK_DIM_X>\r
- __device__ void reduceAll(float* sdiff_row)\r
+ template <int BLOCK_DIM_X> __device__ void reduceAll(float* sdiff_row)\r
{\r
- reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);\r
+ SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);\r
}\r
\r
__device__ operator float() const\r
class L2Dist\r
{\r
public:\r
+ typedef float ResultType;\r
+ typedef float ValueType;\r
+\r
__device__ L2Dist() : mySum(0.0f) {}\r
\r
__device__ void reduceIter(float val1, float val2)\r
mySum += reg * reg;\r
}\r
\r
- template <int BLOCK_DIM_X>\r
- __device__ void reduceAll(float* sdiff_row)\r
+ template <int BLOCK_DIM_X> __device__ void reduceAll(float* sdiff_row)\r
{\r
- reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);\r
+ SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);\r
}\r
\r
__device__ operator float() const\r
private:\r
float mySum;\r
};\r
+\r
+ class HammingDist\r
+ {\r
+ public:\r
+ typedef int ResultType;\r
+ typedef int ValueType;\r
+\r
+ __device__ HammingDist() : mySum(0) {}\r
+\r
+ __device__ void reduceIter(int val1, int val2)\r
+ {\r
+ mySum += __popc(val1 ^ val2);\r
+ }\r
+\r
+ template <int BLOCK_DIM_X> __device__ void reduceAll(int* sdiff_row)\r
+ {\r
+ SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);\r
+ }\r
+\r
+ __device__ operator int() const\r
+ {\r
+ return mySum;\r
+ }\r
+\r
+ private:\r
+ int mySum;\r
+ };\r
\r
///////////////////////////////////////////////////////////////////////////////\r
// reduceDescDiff\r
\r
template <int BLOCK_DIM_X, typename Dist, typename T> \r
- __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, \r
- float* sdiff_row)\r
+ __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row)\r
{\r
for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X)\r
dist.reduceIter(queryDescs[i], trainDescs[i]);\r
///////////////////////////////////////////////////////////////////////////////\r
// loadDescsVals\r
\r
- template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T> \r
- __device__ void loadDescsVals(const T* descs, int desc_len, float* queryVals, float* smem)\r
+ template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T, typename U> \r
+ __device__ void loadDescsVals(const T* descs, int desc_len, U* queryVals, U* smem)\r
{\r
const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
if (tid < desc_len)\r
{\r
- smem[tid] = (float)descs[tid];\r
+ smem[tid] = descs[tid];\r
}\r
__syncthreads();\r
\r
template <int N> struct UnrollDescDiff\r
{\r
template <typename Dist, typename T>\r
- static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len, \r
- Dist& dist, int ind)\r
+ static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, int ind)\r
{\r
if (ind < desc_len)\r
{\r
}\r
\r
template <typename Dist, typename T>\r
- static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)\r
+ static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist)\r
{\r
dist.reduceIter(*queryVals, *trainDescs);\r
\r
template <> struct UnrollDescDiff<0>\r
{\r
template <typename Dist, typename T>\r
- static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len, \r
+ static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, \r
Dist& dist, int ind)\r
{\r
}\r
\r
template <typename Dist, typename T>\r
- static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)\r
+ static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist)\r
{\r
}\r
};\r
struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>\r
{\r
template <typename Dist, typename T>\r
- static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)\r
+ static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)\r
{\r
- UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(queryVals, trainDescs, desc_len, \r
- dist, threadIdx.x);\r
+ UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(queryVals, trainDescs, desc_len, dist, threadIdx.x);\r
}\r
};\r
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> \r
struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>\r
{\r
template <typename Dist, typename T>\r
- static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)\r
+ static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)\r
{\r
- UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(queryVals, \r
- trainDescs + threadIdx.x, dist);\r
+ UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(queryVals, trainDescs + threadIdx.x, dist);\r
}\r
};\r
\r
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>\r
- __device__ void reduceDescDiffCached(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist, \r
- float* sdiff_row)\r
+ __device__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row)\r
{ \r
- DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(queryVals, \r
- trainDescs, desc_len, dist);\r
+ DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(queryVals, trainDescs, desc_len, dist);\r
\r
dist.reduceAll<BLOCK_DIM_X>(sdiff_row);\r
}\r
///////////////////////////////////////////////////////////////////////////////\r
// warpReduceMinIdxIdx\r
\r
- template <int BLOCK_DIM_Y> \r
- __device__ void warpReduceMinIdxIdx(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, \r
- volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx);\r
-\r
- template <> \r
- __device__ void warpReduceMinIdxIdx<16>(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, \r
- volatile float* smin, volatile int* strainIdx, volatile int* simgIdx)\r
+ template <int BLOCK_DIM_Y> struct MinIdxIdxWarpReductor;\r
+ template <> struct MinIdxIdxWarpReductor<16>\r
{\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-\r
- if (tid < 8)\r
+ template <typename T> \r
+ static __device__ void reduce(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, volatile T* smin, volatile int* strainIdx, volatile int* simgIdx)\r
{\r
- myMin = smin[tid];\r
- myBestTrainIdx = strainIdx[tid];\r
- myBestImgIdx = simgIdx[tid];\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- float reg = smin[tid + 8];\r
- if (reg < myMin)\r
+ if (tid < 8)\r
{\r
- smin[tid] = myMin = reg;\r
- strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8];\r
- simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8];\r
- }\r
+ myMin = smin[tid];\r
+ myBestTrainIdx = strainIdx[tid];\r
+ myBestImgIdx = simgIdx[tid];\r
\r
- reg = smin[tid + 4];\r
- if (reg < myMin)\r
- {\r
- smin[tid] = myMin = reg;\r
- strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4];\r
- simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4];\r
- }\r
- \r
- reg = smin[tid + 2];\r
- if (reg < myMin)\r
- {\r
- smin[tid] = myMin = reg;\r
- strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2];\r
- simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2];\r
- }\r
- \r
- reg = smin[tid + 1];\r
- if (reg < myMin)\r
- {\r
- smin[tid] = myMin = reg;\r
- strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1];\r
- simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1];\r
+                T reg = smin[tid + 8];
+ if (reg < myMin)\r
+ {\r
+ smin[tid] = myMin = reg;\r
+ strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8];\r
+ simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8];\r
+ }\r
+\r
+ reg = smin[tid + 4];\r
+ if (reg < myMin)\r
+ {\r
+ smin[tid] = myMin = reg;\r
+ strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4];\r
+ simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4];\r
+ }\r
+ \r
+ reg = smin[tid + 2];\r
+ if (reg < myMin)\r
+ {\r
+ smin[tid] = myMin = reg;\r
+ strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2];\r
+ simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2];\r
+ }\r
+ \r
+ reg = smin[tid + 1];\r
+ if (reg < myMin)\r
+ {\r
+ smin[tid] = myMin = reg;\r
+ strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1];\r
+ simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1];\r
+ }\r
}\r
}\r
- }\r
+ };\r
\r
///////////////////////////////////////////////////////////////////////////////\r
// findBestMatch\r
\r
- template <int BLOCK_DIM_Y>\r
- __device__ void findBestMatch(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, \r
- float* smin, int* strainIdx, int* simgIdx)\r
+ template <int BLOCK_DIM_Y, typename T>\r
+ __device__ void findBestMatch(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, T* smin, int* strainIdx, int* simgIdx)\r
{\r
if (threadIdx.x == 0)\r
{\r
}\r
__syncthreads();\r
\r
- warpReduceMinIdxIdx<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);\r
+ MinIdxIdxWarpReductor<BLOCK_DIM_Y>::reduce(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);\r
}\r
\r
///////////////////////////////////////////////////////////////////////////////\r
class ReduceDescCalculatorSimple\r
{\r
public:\r
- __device__ void prepare(const T* queryDescs_, int, float*)\r
+ __device__ void prepare(const T* queryDescs_, int, void*)\r
{\r
queryDescs = queryDescs_;\r
}\r
\r
template <typename Dist>\r
- __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const\r
+ __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const\r
{\r
reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);\r
}\r
const T* queryDescs;\r
};\r
\r
- template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T>\r
+ template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T, typename U>\r
class ReduceDescCalculatorCached\r
{\r
public:\r
- __device__ void prepare(const T* queryDescs, int desc_len, float* smem)\r
+ __device__ void prepare(const T* queryDescs, int desc_len, U* smem)\r
{\r
loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);\r
}\r
\r
template <typename Dist>\r
- __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const\r
+ __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const\r
{\r
- reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs, \r
- desc_len, dist, sdiff_row);\r
+ reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs, desc_len, dist, sdiff_row);\r
}\r
\r
private:\r
- float queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];\r
+ U queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];\r
};\r
\r
///////////////////////////////////////////////////////////////////////////////\r
template <typename Dist, typename ReduceDescCalculator, typename T, typename Mask>\r
__device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& trainDescs_, \r
const Mask& m, const ReduceDescCalculator& reduceDescCalc,\r
- float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row)\r
+ typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row)\r
{\r
for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y)\r
{\r
\r
template <typename Dist, typename ReduceDescCalculator, typename Mask>\r
__device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, \r
- float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const\r
+ typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const\r
{\r
- matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc, \r
- myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);\r
+ matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);\r
}\r
\r
__device__ int desc_len() const\r
\r
template <typename Dist, typename ReduceDescCalculator, typename Mask>\r
__device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, \r
- float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const\r
+ typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const\r
{\r
for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)\r
{\r
DevMem2D_<T> trainDescs = trainCollection[imgIdx];\r
m.nextMask();\r
- matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, \r
- myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);\r
+ matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);\r
}\r
}\r
\r
///////////////////////////////////////////////////////////////////////////////\r
// Match kernel\r
\r
- template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T, \r
- typename Train, typename Mask>\r
- __global__ void match(const PtrStep_<T> queryDescs_, const Train train, const Mask mask, \r
- int* trainIdx, int* imgIdx, float* distance)\r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T, typename Train, typename Mask>\r
+ __global__ void match(const PtrStep_<T> queryDescs_, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance)\r
{\r
- __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y]; \r
+ __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y]; \r
\r
const int queryIdx = blockIdx.x;\r
\r
int myBestTrainIdx = -1;\r
int myBestImgIdx = -1;\r
- float myMin = numeric_limits_gpu<float>::max();\r
+ typename Dist::ResultType myMin = numeric_limits_gpu<typename Dist::ResultType>::max();\r
\r
{\r
- float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;\r
+ typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;\r
\r
Mask m = mask;\r
\r
ReduceDescCalculator reduceDescCalc;\r
\r
- reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), smem);\r
+ reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), (typename Dist::ValueType*)smem);\r
\r
train.template loop<Dist>(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);\r
}\r
__syncthreads();\r
\r
- float* smin = smem;\r
+ typename Dist::ResultType* smin = smem;\r
int* strainIdx = (int*)(smin + BLOCK_DIM_Y);\r
int* simgIdx = strainIdx + BLOCK_DIM_Y;\r
\r
- findBestMatch<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, \r
- smin, strainIdx, simgIdx);\r
+ findBestMatch<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);\r
\r
if (threadIdx.x == 0 && threadIdx.y == 0)\r
{\r
///////////////////////////////////////////////////////////////////////////////\r
// Match kernel callers\r
\r
- template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, \r
- typename Train, typename Mask>\r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>\r
void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
{\r
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
\r
match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>\r
- <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
- imgIdx.data, distance.data);\r
+ <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
- template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, \r
- typename Dist, typename T, typename Train, typename Mask>\r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>\r
void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
{\r
dim3 grid(queryDescs.rows, 1, 1);\r
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
\r
- match<BLOCK_DIM_X, BLOCK_DIM_Y, \r
- ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T>, \r
- Dist, T>\r
- <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
- imgIdx.data, distance.data);\r
+ match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T, typename Dist::ValueType>, Dist, T>\r
+ <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
if (mask.data)\r
{\r
SingleMask m(mask);\r
- matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);\r
+ matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);\r
}\r
else\r
{\r
- matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
+ matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
}\r
}\r
\r
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
\r
template <typename T>\r
+ void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ bool cc_12)\r
+ {\r
+ SingleTrain<T> train((DevMem2D_<T>)trainDescs);\r
+ if (mask.data)\r
+ {\r
+ SingleMask m(mask);\r
+ matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
+ }\r
+ }\r
+\r
+ template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchSingleHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+\r
+ template <typename T>\r
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
const DevMem2Df& distance, bool cc_12)\r
if (maskCollection.data)\r
{\r
MaskCollection mask(maskCollection.data);\r
- matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);\r
+ matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);\r
}\r
else\r
{\r
- matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
+ matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
}\r
}\r
\r
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+\r
+ template <typename T>\r
+ void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+ const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
+ const DevMem2Df& distance, bool cc_12)\r
+ {\r
+ TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);\r
+ if (maskCollection.data)\r
+ {\r
+ MaskCollection mask(maskCollection.data);\r
+ matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);\r
+ }\r
+ }\r
+\r
+ template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
+ template void matchCollectionHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);\r
\r
///////////////////////////////////////////////////////////////////////////////////\r
//////////////////////////////////// Knn Match ////////////////////////////////////\r
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
__global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance)\r
{\r
- __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+ __shared__ typename Dist::ResultType sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
\r
- float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
+ typename Dist::ResultType* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
\r
const int queryIdx = blockIdx.x;\r
const T* queryDescs = queryDescs_.ptr(queryIdx);\r
{\r
const T* trainDescs = trainDescs_.ptr(trainIdx);\r
\r
- float myDist = numeric_limits_gpu<float>::max();\r
+ typename Dist::ResultType myDist = numeric_limits_gpu<typename Dist::ResultType>::max();\r
\r
if (mask(queryIdx, trainIdx))\r
{\r
///////////////////////////////////////////////////////////////////////////////\r
// warpReduceMinIdx\r
\r
- template <int BLOCK_SIZE> \r
- __device__ void warpReduceMinIdx(volatile float* sdist, volatile int* strainIdx, float& myMin, int tid)\r
+ template <int BLOCK_SIZE, typename T> \r
+ __device__ void warpReduceMinIdx(volatile T* sdist, volatile int* strainIdx, T& myMin, int tid)\r
{\r
if (tid < 32)\r
{\r
if (BLOCK_SIZE >= 64) \r
{ \r
- float reg = sdist[tid + 32];\r
+ T reg = sdist[tid + 32];\r
\r
if (reg < myMin)\r
{\r
}\r
if (BLOCK_SIZE >= 32) \r
{ \r
- float reg = sdist[tid + 16];\r
+ T reg = sdist[tid + 16];\r
\r
if (reg < myMin)\r
{\r
}\r
if (BLOCK_SIZE >= 16) \r
{ \r
- float reg = sdist[tid + 8];\r
+ T reg = sdist[tid + 8];\r
\r
if (reg < myMin)\r
{\r
}\r
if (BLOCK_SIZE >= 8) \r
{ \r
- float reg = sdist[tid + 4];\r
+ T reg = sdist[tid + 4];\r
\r
if (reg < myMin)\r
{\r
}\r
if (BLOCK_SIZE >= 4) \r
{ \r
- float reg = sdist[tid + 2];\r
+ T reg = sdist[tid + 2];\r
\r
if (reg < myMin)\r
{\r
}\r
if (BLOCK_SIZE >= 2) \r
{ \r
- float reg = sdist[tid + 1];\r
+ T reg = sdist[tid + 1];\r
\r
if (reg < myMin)\r
{\r
}\r
}\r
\r
- template <int BLOCK_SIZE> \r
- __device__ void reduceMinIdx(const float* dist, int n, float* sdist, int* strainIdx)\r
+ template <int BLOCK_SIZE, typename T> \r
+ __device__ void reduceMinIdx(const T* dist, int n, T* sdist, int* strainIdx)\r
{\r
const int tid = threadIdx.x;\r
\r
- float myMin = numeric_limits_gpu<float>::max();\r
+ T myMin = numeric_limits_gpu<T>::max();\r
int myMinIdx = -1;\r
\r
for (int i = tid; i < n; i += BLOCK_SIZE)\r
{\r
- float reg = dist[i];\r
+ T reg = dist[i];\r
if (reg < myMin)\r
{\r
myMin = reg;\r
\r
if (BLOCK_SIZE >= 512 && tid < 256) \r
{\r
- float reg = sdist[tid + 256];\r
+ T reg = sdist[tid + 256];\r
\r
if (reg < myMin)\r
{\r
}\r
if (BLOCK_SIZE >= 256 && tid < 128) \r
{\r
- float reg = sdist[tid + 128];\r
+ T reg = sdist[tid + 128];\r
\r
if (reg < myMin)\r
{\r
}\r
if (BLOCK_SIZE >= 128 && tid < 64) \r
{\r
- float reg = sdist[tid + 64];\r
+ T reg = sdist[tid + 64];\r
\r
if (reg < myMin)\r
{\r
// knn match caller\r
\r
template <typename Dist, typename T, typename Mask>\r
- void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, \r
- const Mask& mask, const DevMem2Df& allDist)\r
+ void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist)\r
{\r
calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist);\r
}\r
\r
- void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- const DevMem2Df& allDist)\r
+ void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)\r
{\r
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);\r
}\r
{\r
if (mask.data)\r
{\r
- calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
- SingleMask(mask), allDist);\r
+ calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, SingleMask(mask), allDist);\r
}\r
else\r
{\r
- calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
- WithOutMask(), allDist);\r
+ calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, WithOutMask(), allDist);\r
}\r
\r
findKnnMatchDispatcher(knn, trainIdx, distance, allDist);\r
template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
\r
+ template <typename T>\r
+ void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)\r
+ {\r
+ if (mask.data)\r
+ {\r
+ calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+ SingleMask(mask), allDist);\r
+ }\r
+ else\r
+ {\r
+ calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+ WithOutMask(), allDist);\r
+ }\r
+\r
+ findKnnMatchDispatcher(knn, trainIdx, distance, allDist);\r
+ }\r
+\r
+ template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+\r
///////////////////////////////////////////////////////////////////////////////////\r
/////////////////////////////////// Radius Match //////////////////////////////////\r
///////////////////////////////////////////////////////////////////////////////////\r
{\r
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
\r
- __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+ __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y];\r
\r
- float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;\r
+ typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;\r
\r
const int queryIdx = blockIdx.x;\r
const T* queryDescs = queryDescs_.ptr(queryIdx);\r
{\r
if (mask.data)\r
{\r
- radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+ radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);\r
}\r
else\r
{\r
- radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+ radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
maxDistance, WithOutMask(), trainIdx, nMatches, distance);\r
}\r
}\r
template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+\r
+ template <typename T>\r
+ void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)\r
+ {\r
+ if (mask.data)\r
+ {\r
+ radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+ maxDistance, SingleMask(mask), trainIdx, nMatches, distance);\r
+ }\r
+ else\r
+ {\r
+ radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+ maxDistance, WithOutMask(), trainIdx, nMatches, distance);\r
+ }\r
+ }\r
+\r
+ template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
}}}\r