--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
+#define __OPENCV_GPU_REDUCE_DETAIL_HPP__
+
+#include <thrust/tuple.h>
+#include "../warp.hpp"
+#include "../warp_shuffle.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+ namespace reduce_detail
+ {
+        // GetType<P>: recovers the underlying element type from a pointer or
+        // reference type, e.g. GetType<volatile int*>::type == int.  Used to
+        // declare a register temporary matching a tuple element's storage type.
+        template <typename T> struct GetType;
+        template <typename T> struct GetType<T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<volatile T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<T&>
+        {
+            typedef T type;
+        };
+
+        // Compile-time loop over the elements of a thrust::tuple: performs the
+        // per-element operation on element I, then recurses with I + 1.  The
+        // For<N, N> specialization below terminates the recursion.
+        template <unsigned int I, unsigned int N>
+        struct For
+        {
+            // Store tuple element val[I] into its shared-memory array smem[I] at slot tid.
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val);
+
+                For<I + 1, N>::loadToSmem(smem, val, tid);
+            }
+            // Read tuple element val[I] back from shared-memory array smem[I] at slot tid.
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(val) = thrust::get<I>(smem)[tid];
+
+                For<I + 1, N>::loadFromSmem(smem, val, tid);
+            }
+
+            // Combine the partner value at tid + delta into val[I] using op[I],
+            // writing the result both to the register copy and back to shared memory.
+            template <class PointerTuple, class ValTuple, class OpTuple>
+            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
+
+                For<I + 1, N>::merge(smem, val, tid, delta, op);
+            }
+            // Same combine step, but exchanging values via warp shuffle instead of
+            // shared memory (requires SM30+ at the call site).
+            template <class ValTuple, class OpTuple>
+            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
+                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
+
+                For<I + 1, N>::mergeShfl(val, delta, width, op);
+            }
+        };
+        // Recursion terminator: all tuple elements [0, N) have been processed,
+        // so every operation is a no-op.
+        template <unsigned int N>
+        struct For<N, N>
+        {
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
+            {
+            }
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
+            {
+            }
+
+            template <class PointerTuple, class ValTuple, class OpTuple>
+            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
+            {
+            }
+            template <class ValTuple, class OpTuple>
+            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
+            {
+            }
+        };
+
+        // Scalar case: write one value into shared memory at slot tid.
+        template <typename T>
+        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
+        {
+            smem[tid] = val;
+        }
+        // Scalar case: read one value from shared memory at slot tid.
+        template <typename T>
+        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
+        {
+            val = smem[tid];
+        }
+        // Tuple case: store every element of the value tuple into the matching
+        // shared-memory array of the pointer tuple (element-wise, via For<>).
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                   const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                   unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
+        }
+        // Tuple case: load every element of the value tuple back from shared memory.
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                     const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                     unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
+        }
+
+        // Scalar merge step of the tree reduction: combine the partner value at
+        // tid + delta into val with op, updating both register and shared memory.
+        template <typename T, class Op>
+        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
+        {
+            T reg = smem[tid + delta];
+            smem[tid] = val = op(val, reg);
+        }
+        // Scalar merge step using a warp shuffle (no shared memory; SM30+ at call site).
+        template <typename T, class Op>
+        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
+        {
+            T reg = shfl_down(val, delta, width);
+            val = op(val, reg);
+        }
+        // Tuple merge step: applies the element-wise merge over all tuple members.
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                              unsigned int tid,
+                                              unsigned int delta,
+                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
+        }
+        // Tuple shuffle-merge step: element-wise mergeShfl over all tuple members.
+        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                  unsigned int delta,
+                                                  unsigned int width,
+                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
+        }
+
+        // General shared-memory tree reduction over N threads (N need not be a
+        // power of two at this level; the guards below only activate matching stages).
+        // smem must provide at least N slots; on return thread 0's slot holds the result.
+        template <unsigned int N> struct Generic
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                loadToSmem(smem, val, tid);
+                // Barrier only when more than one warp participates.
+                if (N >= 32)
+                    __syncthreads();
+
+                // Classic halving tree: each stage folds the upper half of the
+                // active range into the lower half, with a block barrier between
+                // stages while multiple warps are involved.
+                if (N >= 2048)
+                {
+                    if (tid < 1024)
+                        merge(smem, val, tid, 1024, op);
+
+                    __syncthreads();
+                }
+                if (N >= 1024)
+                {
+                    if (tid < 512)
+                        merge(smem, val, tid, 512, op);
+
+                    __syncthreads();
+                }
+                if (N >= 512)
+                {
+                    if (tid < 256)
+                        merge(smem, val, tid, 256, op);
+
+                    __syncthreads();
+                }
+                if (N >= 256)
+                {
+                    if (tid < 128)
+                        merge(smem, val, tid, 128, op);
+
+                    __syncthreads();
+                }
+                if (N >= 128)
+                {
+                    if (tid < 64)
+                        merge(smem, val, tid, 64, op);
+
+                    __syncthreads();
+                }
+                if (N >= 64)
+                {
+                    if (tid < 32)
+                        merge(smem, val, tid, 32, op);
+                }
+
+                // Final 32 elements: reduced by the first warp with no explicit
+                // barrier.  NOTE(review): this relies on pre-Volta implicit warp
+                // synchrony plus volatile shared-memory accesses (the public
+                // device::reduce wrapper passes volatile T*); on Volta+ hardware
+                // a __syncwarp() between steps would be required — verify target arch.
+                if (tid < 16)
+                {
+                    merge(smem, val, tid, 16, op);
+                    merge(smem, val, tid, 8, op);
+                    merge(smem, val, tid, 4, op);
+                    merge(smem, val, tid, 2, op);
+                    merge(smem, val, tid, 1, op);
+                }
+            }
+        };
+
+        // Reduction for power-of-two N <= 32: all participants are in one warp,
+        // so no __syncthreads() is needed between steps.
+        template <unsigned int N> struct WarpOptimized
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+            // Fixed: the macro is __CUDA_ARCH__ (trailing underscores); the
+            // misspelled __CUDA_ARCH is undefined, evaluates to 0 in #if, and
+            // silently disabled the shuffle path on all architectures.
+            #if __CUDA_ARCH__ >= 300
+                // SM30+: exchange partials with warp shuffles; shared memory unused.
+                (void) smem;
+                (void) tid;
+
+            #pragma unroll
+                for (unsigned int i = N / 2; i >= 1; i /= 2)
+                    mergeShfl(val, i, N, op);
+            #else
+                // Pre-SM30 fallback: shared-memory halving reduction within the warp.
+                loadToSmem(smem, val, tid);
+
+                if (tid < N / 2)
+                {
+            #pragma unroll
+                    for (unsigned int i = N / 2; i >= 1; i /= 2)
+                        merge(smem, val, tid, i, op);
+                }
+            #endif
+            }
+        };
+
+        // Reduction for power-of-two N in (32, 1024]: each warp reduces its own
+        // 32 values first, warp leaders publish per-warp partials to smem[0..M),
+        // and the first warp then reduces those M partials.
+        template <unsigned int N> struct GenericOptimized32
+        {
+            enum { M = N / 32 };  // number of warps, i.e. number of per-warp partials
+
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                const unsigned int laneId = Warp::laneId();
+
+            // Fixed: __CUDA_ARCH__ was misspelled __CUDA_ARCH (undefined, so the
+            // shuffle path was never selected on any architecture).
+            #if __CUDA_ARCH__ >= 300
+                // SM30+: intra-warp reduction via shuffles, then lane 0 publishes
+                // the warp's result to shared-memory slot tid / 32.
+            #pragma unroll
+                for (int i = 16; i >= 1; i /= 2)
+                    mergeShfl(val, i, warpSize, op);
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32);
+            #else
+                loadToSmem(smem, val, tid);
+
+                // Pre-SM30: lower half of each warp folds the upper half in smem.
+                if (laneId < 16)
+                {
+            #pragma unroll
+                    for (int i = 16; i >= 1; i /= 2)
+                        merge(smem, val, tid, i, op);
+                }
+
+                __syncthreads();
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32);
+            #endif
+
+                __syncthreads();
+
+                loadFromSmem(smem, val, tid);
+
+                // Final stage: first warp reduces the M per-warp partials.
+                if (tid < 32)
+                {
+                #if __CUDA_ARCH__ >= 300
+                #pragma unroll
+                    for (int i = M / 2; i >= 1; i /= 2)
+                        mergeShfl(val, i, M, op);
+                #else
+                #pragma unroll
+                    for (int i = M / 2; i >= 1; i /= 2)
+                        merge(smem, val, tid, i, op);
+                #endif
+                }
+            }
+        };
+
+        // Compile-time type selection (pre-C++11 stand-in for std::conditional).
+        template <bool val, class T1, class T2> struct StaticIf;
+        template <class T1, class T2> struct StaticIf<true, T1, T2>
+        {
+            typedef T1 type;
+        };
+        template <class T1, class T2> struct StaticIf<false, T1, T2>
+        {
+            typedef T2 type;
+        };
+
+        // True when N is a non-zero power of two (standard N & (N - 1) trick).
+        template <unsigned int N> struct IsPowerOf2
+        {
+            enum { value = ((N != 0) && !(N & (N - 1))) };
+        };
+
+        // Selects the reduction implementation for a given thread count N:
+        //   power of two, N <= 32   -> WarpOptimized     (single warp, no barriers)
+        //   power of two, N <= 1024 -> GenericOptimized32 (per-warp then cross-warp)
+        //   anything else           -> Generic           (shared-memory tree)
+        template <unsigned int N> struct Dispatcher
+        {
+            typedef typename StaticIf<
+                (N <= 32) && IsPowerOf2<N>::value,
+                WarpOptimized<N>,
+                typename StaticIf<
+                    (N <= 1024) && IsPowerOf2<N>::value,
+                    GenericOptimized32<N>,
+                    Generic<N>
+                >::type
+            >::type reductor;
+        };
+ }
+}}}
+
+#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
+#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
+
+#include <thrust/tuple.h>
+#include "../warp.hpp"
+#include "../warp_shuffle.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+ namespace reduce_key_val_detail
+ {
+        // GetType<P>: recovers the underlying element type from a pointer or
+        // reference type (e.g. GetType<volatile float*>::type == float), used to
+        // declare register temporaries for tuple elements.
+        template <typename T> struct GetType;
+        template <typename T> struct GetType<T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<volatile T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<T&>
+        {
+            typedef T type;
+        };
+
+        // Compile-time loop over tuple elements [I, N) for the key/value
+        // reduction: applies the per-element operation to element I, then
+        // recurses with I + 1.  Terminated by the For<N, N> specialization.
+        template <unsigned int I, unsigned int N>
+        struct For
+        {
+            // Store tuple element data[I] into shared-memory array smem[I] at slot tid.
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
+            {
+                thrust::get<I>(smem)[tid] = thrust::get<I>(data);
+
+                For<I + 1, N>::loadToSmem(smem, data, tid);
+            }
+            // Read tuple element data[I] back from shared-memory array smem[I] at slot tid.
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
+            {
+                thrust::get<I>(data) = thrust::get<I>(smem)[tid];
+
+                For<I + 1, N>::loadFromSmem(smem, data, tid);
+            }
+
+            // Pull each value element from the lane delta steps down the warp.
+            template <class ReferenceTuple>
+            static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
+            {
+                thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
+
+                For<I + 1, N>::copyShfl(val, delta, width);
+            }
+            // Copy each value element from smem slot tid + delta into slot tid and the register copy.
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
+            {
+                thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
+
+                For<I + 1, N>::copy(svals, val, tid, delta);
+            }
+
+            // Shuffle-based merge: fetch the partner key; if cmp says it wins,
+            // take the partner's key and value.  NOTE(review): the inner
+            // shfl_down on the value executes under a divergent branch — this
+            // assumes the source lane still provides its value (pre-Volta SIMT
+            // behavior); verify on newer architectures.
+            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
+            {
+                typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
+
+                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
+                {
+                    thrust::get<I>(key) = reg;
+                    thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
+                }
+
+                For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
+            }
+            // Shared-memory merge: if the key at tid + delta wins under cmp,
+            // adopt that key and the associated value, updating smem and registers.
+            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
+                                         const ValPointerTuple& svals, const ValReferenceTuple& val,
+                                         const CmpTuple& cmp,
+                                         unsigned int tid, unsigned int delta)
+            {
+                typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];
+
+                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
+                {
+                    thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
+                    thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
+                }
+
+                For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
+            }
+        };
+        // Recursion terminator: all tuple elements processed; every operation is a no-op.
+        template <unsigned int N>
+        struct For<N, N>
+        {
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
+            {
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
+            {
+            }
+
+            template <class ReferenceTuple>
+            static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
+            {
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
+            {
+            }
+
+            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
+            {
+            }
+            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
+                                         const ValPointerTuple&, const ValReferenceTuple&,
+                                         const CmpTuple&,
+                                         unsigned int, unsigned int)
+            {
+            }
+        };
+
+ //////////////////////////////////////////////////////
+ // loadToSmem
+
+        // Scalar case: write one value into shared memory at slot tid.
+        template <typename T>
+        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
+        {
+            smem[tid] = data;
+        }
+        // Scalar case: read one value from shared memory at slot tid.
+        template <typename T>
+        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
+        {
+            data = smem[tid];
+        }
+        // Tuple case: element-wise store of the data tuple into the smem pointer tuple.
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
+                                                   const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
+                                                   unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
+        }
+        // Tuple case: element-wise load of the data tuple from the smem pointer tuple.
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
+                                                     const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
+                                                     unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
+        }
+
+ //////////////////////////////////////////////////////
+ // copyVals
+
+        // Pull the value from the lane delta steps down the warp (scalar case).
+        template <typename V>
+        __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
+        {
+            val = shfl_down(val, delta, width);
+        }
+        // Copy the value from smem slot tid + delta into slot tid and the register copy.
+        template <typename V>
+        __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
+        {
+            svals[tid] = val = svals[tid + delta];
+        }
+        // Tuple variants: element-wise application of the scalar operations above.
+        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                     unsigned int delta,
+                                                     int width)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                 unsigned int tid, unsigned int delta)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
+        }
+
+ //////////////////////////////////////////////////////
+ // merge
+
+        // Merge step of the key/value reduction: fetch the partner key (via
+        // shuffle or shared memory); if cmp(partner, mine) is true, adopt the
+        // partner's key and its associated value(s).
+        template <typename K, typename V, class Cmp>
+        __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
+        {
+            K reg = shfl_down(key, delta, width);
+
+            if (cmp(reg, key))
+            {
+                key = reg;
+                copyValsShfl(val, delta, width);
+            }
+        }
+        // Shared-memory variant: partner pair lives at slot tid + delta.
+        template <typename K, typename V, class Cmp>
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
+        {
+            K reg = skeys[tid + delta];
+
+            if (cmp(reg, key))
+            {
+                skeys[tid] = key = reg;
+                copyVals(svals, val, tid, delta);
+            }
+        }
+        // Scalar key with a tuple of values (shuffle exchange).
+        template <typename K,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp>
+        __device__ __forceinline__ void mergeShfl(K& key,
+                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                  const Cmp& cmp,
+                                                  unsigned int delta, int width)
+        {
+            K reg = shfl_down(key, delta, width);
+
+            if (cmp(reg, key))
+            {
+                key = reg;
+                copyValsShfl(val, delta, width);
+            }
+        }
+        // Scalar key with a tuple of values (shared-memory exchange).
+        template <typename K,
+                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp>
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key,
+                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                              const Cmp& cmp, unsigned int tid, unsigned int delta)
+        {
+            K reg = skeys[tid + delta];
+
+            if (cmp(reg, key))
+            {
+                skeys[tid] = key = reg;
+                copyVals(svals, val, tid, delta);
+            }
+        }
+        // Tuple of keys with a tuple of values: element-wise merge via For<>.
+        template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
+                                                  unsigned int delta, int width)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
+        }
+        template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
+                  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+        __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
+                                              const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                              const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
+                                              unsigned int tid, unsigned int delta)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
+        }
+
+ //////////////////////////////////////////////////////
+ // Generic
+
+        // General shared-memory tree reduction over N (key, value) pairs: moves
+        // the pair whose key wins under cmp toward slot 0; on return thread 0's
+        // slots hold the winning pair.  skeys/svals must provide N slots each.
+        template <unsigned int N> struct Generic
+        {
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                loadToSmem(skeys, key, tid);
+                // Fixed: was loadValsToSmem(), which is not declared anywhere in
+                // this namespace and failed to compile when instantiated.
+                loadToSmem(svals, val, tid);
+                if (N >= 32)
+                    __syncthreads();
+
+                // Halving tree: fold the upper half of the active range into the
+                // lower half, with block barriers while multiple warps participate.
+                if (N >= 2048)
+                {
+                    if (tid < 1024)
+                        merge(skeys, key, svals, val, cmp, tid, 1024);
+
+                    __syncthreads();
+                }
+                if (N >= 1024)
+                {
+                    if (tid < 512)
+                        merge(skeys, key, svals, val, cmp, tid, 512);
+
+                    __syncthreads();
+                }
+                if (N >= 512)
+                {
+                    if (tid < 256)
+                        merge(skeys, key, svals, val, cmp, tid, 256);
+
+                    __syncthreads();
+                }
+                if (N >= 256)
+                {
+                    if (tid < 128)
+                        merge(skeys, key, svals, val, cmp, tid, 128);
+
+                    __syncthreads();
+                }
+                if (N >= 128)
+                {
+                    if (tid < 64)
+                        merge(skeys, key, svals, val, cmp, tid, 64);
+
+                    __syncthreads();
+                }
+                if (N >= 64)
+                {
+                    if (tid < 32)
+                        merge(skeys, key, svals, val, cmp, tid, 32);
+                }
+
+                // Final 32 pairs reduced by the first warp without explicit
+                // barriers (relies on pre-Volta implicit warp synchrony plus
+                // volatile shared-memory accesses from the public wrapper).
+                if (tid < 16)
+                {
+                    merge(skeys, key, svals, val, cmp, tid, 16);
+                    merge(skeys, key, svals, val, cmp, tid, 8);
+                    merge(skeys, key, svals, val, cmp, tid, 4);
+                    merge(skeys, key, svals, val, cmp, tid, 2);
+                    merge(skeys, key, svals, val, cmp, tid, 1);
+                }
+            }
+        };
+
+        // Key/value reduction for power-of-two N <= 32: single warp, no barriers.
+        template <unsigned int N> struct WarpOptimized
+        {
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+            // Fixed: __CUDA_ARCH__ was misspelled __CUDA_ARCH (undefined -> 0 in
+            // #if, so the shuffle path was never compiled in on any architecture).
+            #if __CUDA_ARCH__ >= 300
+                // SM30+: exchange pairs via warp shuffles; shared memory unused.
+                (void) skeys;
+                (void) svals;
+                (void) tid;
+
+            #pragma unroll
+                for (unsigned int i = N / 2; i >= 1; i /= 2)
+                    mergeShfl(key, val, cmp, i, N);   // fixed: was the undefined identifier 'cml'
+            #else
+                // Pre-SM30 fallback: shared-memory halving reduction within the warp.
+                loadToSmem(skeys, key, tid);
+                loadToSmem(svals, val, tid);
+
+                if (tid < N / 2)
+                {
+            #pragma unroll
+                    for (unsigned int i = N / 2; i >= 1; i /= 2)
+                        merge(skeys, key, svals, val, cmp, tid, i);
+                }
+            #endif
+            }
+        };
+
+        // Key/value reduction for power-of-two N in (32, 1024]: each warp
+        // reduces its own 32 pairs, warp leaders publish per-warp winners to
+        // smem[0..M), and the first warp reduces those M partial winners.
+        template <unsigned int N> struct GenericOptimized32
+        {
+            enum { M = N / 32 };  // number of warps, i.e. number of per-warp winners
+
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                const unsigned int laneId = Warp::laneId();
+
+            // Fixed: __CUDA_ARCH__ was misspelled __CUDA_ARCH (undefined, so the
+            // shuffle path was never selected on any architecture).
+            #if __CUDA_ARCH__ >= 300
+                // SM30+: intra-warp reduction via shuffles, then lane 0 publishes
+                // the warp's winning pair to shared-memory slot tid / 32.
+            #pragma unroll
+                for (unsigned int i = 16; i >= 1; i /= 2)
+                    mergeShfl(key, val, cmp, i, warpSize);   // fixed: was the undefined identifier 'cml'
+
+                if (laneId == 0)
+                {
+                    loadToSmem(skeys, key, tid / 32);
+                    loadToSmem(svals, val, tid / 32);
+                }
+            #else
+                loadToSmem(skeys, key, tid);
+                loadToSmem(svals, val, tid);
+
+                // Pre-SM30: lower half of each warp folds the upper half in smem.
+                if (laneId < 16)
+                {
+            #pragma unroll
+                    for (int i = 16; i >= 1; i /= 2)
+                        merge(skeys, key, svals, val, cmp, tid, i);
+                }
+
+                __syncthreads();
+
+                if (laneId == 0)
+                {
+                    loadToSmem(skeys, key, tid / 32);
+                    loadToSmem(svals, val, tid / 32);
+                }
+            #endif
+
+                __syncthreads();
+
+                loadFromSmem(skeys, key, tid);
+
+                // Final stage: first warp reduces the M per-warp winners.
+                if (tid < 32)
+                {
+                #if __CUDA_ARCH__ >= 300
+                    loadFromSmem(svals, val, tid);
+
+                #pragma unroll
+                    for (unsigned int i = M / 2; i >= 1; i /= 2)
+                        mergeShfl(key, val, cmp, i, M);   // fixed: was the undefined identifier 'cml'
+                #else
+                #pragma unroll
+                    for (unsigned int i = M / 2; i >= 1; i /= 2)
+                        merge(skeys, key, svals, val, cmp, tid, i);
+                #endif
+                }
+            }
+        };
+
+        // Compile-time type selection (pre-C++11 stand-in for std::conditional).
+        template <bool val, class T1, class T2> struct StaticIf;
+        template <class T1, class T2> struct StaticIf<true, T1, T2>
+        {
+            typedef T1 type;
+        };
+        template <class T1, class T2> struct StaticIf<false, T1, T2>
+        {
+            typedef T2 type;
+        };
+
+        // True when N is a non-zero power of two (standard N & (N - 1) trick).
+        template <unsigned int N> struct IsPowerOf2
+        {
+            enum { value = ((N != 0) && !(N & (N - 1))) };
+        };
+
+        // Selects the key/value reduction implementation for thread count N:
+        //   power of two, N <= 32   -> WarpOptimized     (single warp, no barriers)
+        //   power of two, N <= 1024 -> GenericOptimized32 (per-warp then cross-warp)
+        //   anything else           -> Generic           (shared-memory tree)
+        template <unsigned int N> struct Dispatcher
+        {
+            typedef typename StaticIf<
+                (N <= 32) && IsPowerOf2<N>::value,
+                WarpOptimized<N>,
+                typename StaticIf<
+                    (N <= 1024) && IsPowerOf2<N>::value,
+                    GenericOptimized32<N>,
+                    Generic<N>
+                >::type
+            >::type reductor;
+        };
+ }
+}}}
+
+#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_REDUCE_HPP__
+#define __OPENCV_GPU_REDUCE_HPP__
+
+#include <thrust/tuple.h>
+#include "detail/reduce.hpp"
+#include "detail/reduce_key_val.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    // Block-wide reduction of one value per thread.  N is the number of
+    // participating threads; smem must provide at least N slots.  On return,
+    // every thread's val holds its partial; the final result is in the slots /
+    // registers owned by thread 0 (see the selected reductor for details).
+    template <int N, typename T, class Op>
+    __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
+    {
+        reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
+    }
+    // Simultaneous reduction of up to 10 values per thread: parallel tuples of
+    // shared-memory pointers, value references, and binary operators.
+    template <int N,
+              typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+              typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+              class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+    __device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                           const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                           unsigned int tid,
+                                           const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+    {
+        reduce_detail::Dispatcher<N>::reductor::template reduce<
+                const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&,
+                const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&,
+                const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
+    }
+
+ // Key/value reduction: reduces (key, val) pairs across N threads using
+ // scratch buffers `skeys`/`svals` (presumably shared memory -- named s*).
+ // `cmp` is the comparison functor; presumably the surviving pair is the
+ // one cmp prefers (argmin/argmax style) -- see detail/reduce_key_val.hpp
+ // for the actual selection logic.
+ template <unsigned int N, typename K, typename V, class Cmp>
+ __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
+ {
+ reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
+ }
+ // Key/value reduction with a single key but a tuple of values: each
+ // thread contributes one key plus several values (scratch buffers and
+ // per-thread values packed in thrust::tuples). Comparison is done on the
+ // key alone via `cmp`; forwards to the detail dispatcher.
+ template <unsigned int N,
+ typename K,
+ typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+ typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+ class Cmp>
+ __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
+ const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+ const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+ unsigned int tid, const Cmp& cmp)
+ {
+ reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&,
+ const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
+ const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
+ const Cmp&>(skeys, key, svals, val, tid, cmp);
+ }
+ // Fully tuple-based key/value reduction: tuples of keys, values, their
+ // scratch buffers, and one comparator per key slot are all forwarded to
+ // the detail dispatcher. This is the most general overload; the 10-slot
+ // arity again mirrors thrust::tuple's maximum.
+ template <unsigned int N,
+ typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
+ typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+ typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+ typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+ class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+ __device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
+ const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+ const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+ const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+ unsigned int tid,
+ const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
+ {
+ reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
+ const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&,
+ const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&,
+ const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
+ const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
+ const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
+ >(skeys, key, svals, val, tid, cmp);
+ }
+
+ // smem_tuple
+
+ // smem_tuple(p0, ..., pN): convenience builders that wrap 1..10 raw
+ // pointers (presumably shared-memory scratch buffers -- the "smem" name)
+ // into a thrust::tuple of volatile-qualified pointers, matching the
+ // tuple form the reduce()/reduceKeyVal() overloads above expect.
+ // Each overload is a plain cast-and-make_tuple; no other logic.
+ template <typename T0>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*>
+ smem_tuple(T0* t0)
+ {
+ return thrust::make_tuple((volatile T0*) t0);
+ }
+
+ template <typename T0, typename T1>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*, volatile T1*>
+ smem_tuple(T0* t0, T1* t1)
+ {
+ return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
+ }
+
+ template <typename T0, typename T1, typename T2>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
+ smem_tuple(T0* t0, T1* t1, T2* t2)
+ {
+ return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
+ }
+
+ template <typename T0, typename T1, typename T2, typename T3>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
+ smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
+ {
+ return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
+ }
+
+ template <typename T0, typename T1, typename T2, typename T3, typename T4>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
+ smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
+ {
+ return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
+ }
+
+ template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
+ smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
+ {
+ return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
+ }
+
+ template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
+ smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
+ {
+ return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
+ }
+
+ template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
+ smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
+ {
+ return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
+ }
+
+ template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
+ smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
+ {
+ return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
+ }
+
+ template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
+ __device__ __forceinline__
+ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
+ smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
+ {
+ return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
+ }
+}}}
+
+#endif // __OPENCV_GPU_REDUCE_HPP__
///////////////////////////////////////////////////////////////////////////////
// Reduction
- template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+ template <int n, typename T, typename Op> __device__ __forceinline__ void reduce_old(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
- reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+ reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
}
__device__ __forceinline__ operator int() const
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
- reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+ reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
}
__device__ __forceinline__ operator float() const
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
- reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+ reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
}
__device__ __forceinline__ operator float() const
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
- reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+ reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
}
__device__ __forceinline__ operator int() const
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
+#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
+
+namespace cv { namespace gpu { namespace device
+{
+ // Warp shuffle: reads `val` from lane `srcLane` within a segment of
+ // `width` lanes, via the shuffle intrinsic available from compute
+ // capability 3.0 (Kepler). On pre-SM30 devices (no shuffle hardware)
+ // this degrades to returning a default-constructed T.
+ // NOTE(review): this is the legacy mask-less __shfl, which later CUDA
+ // toolkits removed for SM70+ in favour of __shfl_sync -- fine for the
+ // toolkit era this header targets, verify before upgrading toolchains.
+ template <typename T>
+ __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
+ {
+ #if __CUDA_ARCH__ >= 300
+ return __shfl(val, srcLane, width);
+ #else
+ return T();
+ #endif
+ }
+ // double overload of shfl(): splits the 64-bit value into its low and
+ // high 32-bit halves, shuffles each half separately, then reassembles
+ // the double -- presumably because __shfl of this era has no native
+ // 64-bit overload. Returns 0.0 on pre-SM30 devices.
+ __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
+ {
+ #if __CUDA_ARCH__ >= 300
+ int lo = __double2loint(val);
+ int hi = __double2hiint(val);
+
+ lo = __shfl(lo, srcLane, width);
+ hi = __shfl(hi, srcLane, width);
+
+ return __hiloint2double(hi, lo);
+ #else
+ return 0.0;
+ #endif
+ }
+
+ // Warp shuffle-down: reads `val` from the lane `delta` positions higher
+ // in the calling thread's `width`-lane segment (SM30+ intrinsic). On
+ // pre-SM30 devices returns a default-constructed T instead.
+ // NOTE(review): legacy mask-less __shfl_down; __shfl_down_sync replaces
+ // it on SM70+ toolchains.
+ template <typename T>
+ __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
+ {
+ #if __CUDA_ARCH__ >= 300
+ return __shfl_down(val, delta, width);
+ #else
+ return T();
+ #endif
+ }
+ // double overload of shfl_down(): same hi/lo 32-bit split-and-recombine
+ // scheme as the double shfl() above, applied to __shfl_down.
+ // Returns 0.0 on pre-SM30 devices.
+ __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
+ {
+ #if __CUDA_ARCH__ >= 300
+ int lo = __double2loint(val);
+ int hi = __double2hiint(val);
+
+ lo = __shfl_down(lo, delta, width);
+ hi = __shfl_down(hi, delta, width);
+
+ return __hiloint2double(hi, lo);
+ #else
+ return 0.0;
+ #endif
+ }
+}}}
+
+#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
c += Ix * Iy;
}
- reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
- reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
- reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
+ reduce_old<32>(srow, a, threadIdx.x, plus<volatile int>());
+ reduce_old<32>(srow, b, threadIdx.x, plus<volatile int>());
+ reduce_old<32>(srow, c, threadIdx.x, plus<volatile int>());
if (threadIdx.x == 0)
{
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);
- reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
+ reduce_old<32>(srow, m_10, threadIdx.x, plus<volatile int>());
for (int v = 1; v <= half_k; ++v)
{
m_sum += u * (val_plus + val_minus);
}
- reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
- reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
+ reduce_old<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
+ reduce_old<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
m_10 += m_sum;
m_01 += v * v_sum;
}
}}}
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
sumy += s_Y[threadIdx.x + 96];
}
- device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
- device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
+ device::reduce_old<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
+ device::reduce_old<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)