return T();
#endif
}
+ // Warp shuffle for unsigned int: routed through the signed-int __shfl
+ // intrinsic; the cast round trip preserves the bit pattern exactly.
+ __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
+ {
+ #if __CUDA_ARCH__ >= 300
+ const int bits = (int) val;
+ return (unsigned int) __shfl(bits, srcLane, width);
+ #else
+ return 0;
+ #endif
+ }
// Warp shuffle for double: split the 64-bit value into its low/high 32-bit
// halves, shuffle each half independently, then reassemble. Same scheme as
// the shfl_up(double) overload in this header.
// Fixes the previous body, which returned T() — invalid in a non-template
// function and discarding the shuffled value in any case.
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
    int lo = __double2loint(val);
    int hi = __double2hiint(val);

    lo = __shfl(lo, srcLane, width);
    hi = __shfl(hi, srcLane, width);

    return __hiloint2double(hi, lo);
#else
    return 0.0;
#endif
}
+ // Warp shuffle-down for unsigned int, implemented on top of the
+ // signed-int __shfl_down primitive via a value-preserving cast round trip.
+ __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
+ {
+ #if __CUDA_ARCH__ >= 300
+ const int shifted = __shfl_down((int) val, delta, width);
+ return (unsigned int) shifted;
+ #else
+ return 0;
+ #endif
+ }
// Warp shuffle-down for double: shuffle the two 32-bit halves separately and
// recombine, consistent with the other double overloads in this header.
// Fixes the previous body, which returned 0.0 even on SM30+ and never
// performed the shuffle.
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
    int lo = __double2loint(val);
    int hi = __double2hiint(val);

    lo = __shfl_down(lo, delta, width);
    hi = __shfl_down(hi, delta, width);

    return __hiloint2double(hi, lo);
#else
    return 0.0;
#endif
}
+
+ // Generic warp shuffle-up: forwards directly to the __shfl_up intrinsic,
+ // so T must be a type the intrinsic accepts (int/float on SM30).
+ // Returns a default-constructed T on devices without shuffle support.
+ template <typename T>
+ __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
+ {
+ #if __CUDA_ARCH__ >= 300
+ return __shfl_up(val, delta, width);
+ #else
+ return T();
+ #endif
+ }
+ // Warp shuffle-up for unsigned int; reuses the signed-int intrinsic and
+ // casts back, which preserves the bit pattern exactly.
+ __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
+ {
+ #if __CUDA_ARCH__ >= 300
+ const int raw = __shfl_up((int) val, delta, width);
+ return (unsigned int) raw;
+ #else
+ return 0;
+ #endif
+ }
+ // Warp shuffle-up for double: split into 32-bit halves, shuffle each half
+ // independently, then rebuild the double from the shuffled words.
+ __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
+ {
+ #if __CUDA_ARCH__ >= 300
+ int loWord = __double2loint(val);
+ int hiWord = __double2hiint(val);
+
+ loWord = __shfl_up(loWord, delta, width);
+ hiWord = __shfl_up(hiWord, delta, width);
+
+ return __hiloint2double(hiWord, loWord);
+ #else
+ return 0.0;
+ #endif
+ }
}}}
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
#include "NPP_staging/NPP_staging.hpp"
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
+#include "opencv2/gpu/device/warp.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"
//==============================================================================
//assuming size <= WARP_SIZE and size is power of 2
// Inclusive warp scan (prefix sum) of idata across one warp.
__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
+#if __CUDA_ARCH__ >= 300
+ const unsigned int laneId = cv::gpu::device::Warp::laneId();
+
+ // scan via shuffle intrinsics: each step pulls the partial sum from
+ // the lane i positions below (SM30+ only)
+ #pragma unroll
+ for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
+ {
+ const Ncv32u n = cv::gpu::device::shfl_up(idata, i);
+ if (laneId >= i)
+ idata += n;
+ }
+
+ return idata;
+#else
// Pre-SM30 fallback: warp-synchronous shared-memory scan over a buffer with
// a zero apron in its lower K_WARP_SIZE slots. Restored the missing
// s_Data[pos] = idata store and the offset-1/2/4/8 accumulation steps so it
// matches the complete template version of warpScanInclusive.
Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
s_Data[pos] = 0;
pos += K_WARP_SIZE;
s_Data[pos] = idata;

s_Data[pos] += s_Data[pos - 1];
s_Data[pos] += s_Data[pos - 2];
s_Data[pos] += s_Data[pos - 4];
s_Data[pos] += s_Data[pos - 8];
s_Data[pos] += s_Data[pos - 16];

return s_Data[pos];
+#endif
}
__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
return NCV_SUCCESS;
}
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
#include <vector>
#include <cuda_runtime.h>
#include "NPP_staging.hpp"
+#include "opencv2/gpu/device/warp.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
// Inclusive warp scan (prefix sum) of idata across one warp.
// T must support operator+= (used with Ncv32u here; a separate Ncv64u
// overload exists in this file).
// NOTE(review): assumes K_WARP_SIZE is the warp size and a power of two —
// confirm against the definition used by callers.
template <class T>
inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
{
+#if __CUDA_ARCH__ >= 300
+ const unsigned int laneId = cv::gpu::device::Warp::laneId();
+
+ // scan via shuffle intrinsics: each step adds the partial sum held
+ // by the lane i positions below (SM30+ only, no shared memory traffic)
+ #pragma unroll
+ for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
+ {
+ const T n = cv::gpu::device::shfl_up(idata, i);
+ if (laneId >= i)
+ idata += n;
+ }
+
+ return idata;
+#else
+ // Pre-SM30 fallback: warp-synchronous shared-memory scan (no barriers;
+ // s_Data is volatile). The lower K_WARP_SIZE slots are zeroed as an
+ // apron so the fixed-offset reads below never pick up live data.
+ // NOTE(review): expects 2 * K_WARP_SIZE slots per warp in s_Data —
+ // confirm against callers.
+ Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
+ s_Data[pos] = 0;
+ pos += K_WARP_SIZE;
+ s_Data[pos] = idata;
+
+ s_Data[pos] += s_Data[pos - 1];
+ s_Data[pos] += s_Data[pos - 2];
+ s_Data[pos] += s_Data[pos - 4];
+ s_Data[pos] += s_Data[pos - 8];
+ s_Data[pos] += s_Data[pos - 16];
+
+ return s_Data[pos];
+#endif
+}
+inline __device__ Ncv64u warpScanInclusive(Ncv64u idata, volatile Ncv64u *s_Data)
+{
Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
s_Data[pos] = 0;
pos += K_WARP_SIZE;
return status;
}
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */