From: Xiaomeng Yang
Date: Thu, 14 Feb 2019 01:47:49 +0000 (-0800)
Subject: Separate reduce functions from math (#16929)
X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~1303
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3a34f443c534d09c62b8a4b84b24c0e1536765ce;p=platform%2Fupstream%2Fpytorch.git

Separate reduce functions from math (#16929)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16929

Separate CPU reduce functions from math

i-am-not-moving-c2-to-c10

Reviewed By: houseroad

Differential Revision: D13999469

fbshipit-source-id: bd628b15a6e3c1f04cc62aefffb0110690e1c0d1
---

diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc
index 0483ebb..2abc25a 100644
--- a/caffe2/contrib/aten/aten_op.cc
+++ b/caffe2/contrib/aten/aten_op.cc
@@ -14,7 +14,7 @@ OPERATOR_SCHEMA(ATen);
 namespace math {
 template <>
 void Set<at::Half, CPUContext>(
-    const size_t /*N*/,
+    const int /*N*/,
     const at::Half h,
     at::Half* v,
     CPUContext* c) {
diff --git a/caffe2/operators/elementwise_add_op.h b/caffe2/operators/elementwise_add_op.h
index e09de2f..ded3d42 100644
--- a/caffe2/operators/elementwise_add_op.h
+++ b/caffe2/operators/elementwise_add_op.h
@@ -47,15 +47,14 @@ struct AddFunctor {
     const std::vector<int> C_dims =
         elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
             A_dims, B_dims);
-    std::vector<int> A_axes;
-    std::vector<int> B_axes;
-    elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
-        A_dims, B_dims, &A_axes, &B_axes);
+    std::vector<int> A_back_dims;
+    std::vector<int> B_back_dims;
+    elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
+        A_dims, B_dims, &A_back_dims, &B_back_dims);
     math::ReduceSum<TGrad, Context>(
         C_dims.size(),
         C_dims.data(),
-        A_axes.size(),
-        A_axes.data(),
+        A_back_dims.data(),
         TGrad(1),
         dC,
         dA,
@@ -63,8 +62,7 @@ struct AddFunctor {
     math::ReduceSum<TGrad, Context>(
         C_dims.size(),
         C_dims.data(),
-        B_axes.size(),
-        B_axes.data(),
+        B_back_dims.data(),
         TGrad(1),
         dC,
         dB,
diff --git a/caffe2/operators/elementwise_ops_utils.cc b/caffe2/operators/elementwise_ops_utils.cc
index 13a1204..5bb6c76 100644
--- a/caffe2/operators/elementwise_ops_utils.cc
+++ b/caffe2/operators/elementwise_ops_utils.cc
@@ -108,5 +108,17 @@ void ComputeBinaryBroadcastBackwardAxes(
   std::reverse(B_axes->begin(), B_axes->end());
 }
 
+void ComputeBinaryBroadcastBackwardDims(
+    const std::vector<int>& A_dims,
+    const std::vector<int>& B_dims,
+    std::vector<int>* A_back_dims,
+    std::vector<int>* B_back_dims) {
+  const int ndim = std::max(A_dims.size(), B_dims.size());
+  A_back_dims->assign(ndim, 1);
+  B_back_dims->assign(ndim, 1);
+  std::copy(A_dims.crbegin(), A_dims.crend(), A_back_dims->rbegin());
+  std::copy(B_dims.crbegin(), B_dims.crend(), B_back_dims->rbegin());
+}
+
 } // namespace elementwise_ops_utils
 } // namespace caffe2
diff --git a/caffe2/operators/elementwise_ops_utils.h b/caffe2/operators/elementwise_ops_utils.h
index 0a90622..93ef400 100644
--- a/caffe2/operators/elementwise_ops_utils.h
+++ b/caffe2/operators/elementwise_ops_utils.h
@@ -23,6 +23,12 @@ CAFFE2_API void ComputeBinaryBroadcastBackwardAxes(
     std::vector<int>* A_axes,
     std::vector<int>* B_axes);
 
+CAFFE2_API void ComputeBinaryBroadcastBackwardDims(
+    const std::vector<int>& A_dims,
+    const std::vector<int>& B_dims,
+    std::vector<int>* A_back_dims,
+    std::vector<int>* B_back_dims);
+
 } // namespace elementwise_ops_utils
 } // namespace caffe2
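A note on the pattern introduced above: ComputeBinaryBroadcastBackwardAxes produced a list of axes to reduce over, while the new ComputeBinaryBroadcastBackwardDims produces full-rank shape vectors in which every reduced dimension is 1, matching the new math::Reduce* signatures. A minimal standalone sketch of the same right-aligned broadcast rule (plain C++, independent of the Caffe2 headers; the helper name BackwardDims is mine):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Same rule as ComputeBinaryBroadcastBackwardDims: pad each shape on the
    // left with 1s up to the broadcast rank; a 1 marks a dimension whose
    // gradient must be reduce-summed.
    static std::vector<int> BackwardDims(const std::vector<int>& dims, int ndim) {
      std::vector<int> back_dims(ndim, 1);
      std::copy(dims.crbegin(), dims.crend(), back_dims.rbegin());
      return back_dims;
    }

    int main() {
      const std::vector<int> A_dims = {2, 3, 4};
      const std::vector<int> B_dims = {3, 1};
      const int ndim = std::max(A_dims.size(), B_dims.size());
      // A: {2, 3, 4} -> nothing reduced; B: {1, 3, 1} -> reduce dims 0 and 2.
      for (int d : BackwardDims(A_dims, ndim)) std::printf("%d ", d);  // 2 3 4
      std::printf("\n");
      for (int d : BackwardDims(B_dims, ndim)) std::printf("%d ", d);  // 1 3 1
      std::printf("\n");
      return 0;
    }

Passing a dims vector instead of an axes list lets the reduce kernels compare X_dims and Y_dims positionally, with no separate axis bookkeeping.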
diff --git a/caffe2/operators/elementwise_sub_op.h b/caffe2/operators/elementwise_sub_op.h
index 7d07a2b..cbd8962 100644
--- a/caffe2/operators/elementwise_sub_op.h
+++ b/caffe2/operators/elementwise_sub_op.h
@@ -47,15 +47,14 @@ struct SubFunctor {
     const std::vector<int> C_dims =
         elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
             A_dims, B_dims);
-    std::vector<int> A_axes;
-    std::vector<int> B_axes;
-    elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
-        A_dims, B_dims, &A_axes, &B_axes);
+    std::vector<int> A_back_dims;
+    std::vector<int> B_back_dims;
+    elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
+        A_dims, B_dims, &A_back_dims, &B_back_dims);
     math::ReduceSum<TGrad, Context>(
         C_dims.size(),
         C_dims.data(),
-        A_axes.size(),
-        A_axes.data(),
+        A_back_dims.data(),
         TGrad(1),
         dC,
         dA,
@@ -63,8 +62,7 @@ struct SubFunctor {
     math::ReduceSum<TGrad, Context>(
         C_dims.size(),
         C_dims.data(),
-        B_axes.size(),
-        B_axes.data(),
+        B_back_dims.data(),
         TGrad(-1),
         dC,
         dB,
diff --git a/caffe2/operators/expand_op.h b/caffe2/operators/expand_op.h
index 5daffa5..8aabee0 100644
--- a/caffe2/operators/expand_op.h
+++ b/caffe2/operators/expand_op.h
@@ -94,11 +94,14 @@ class ExpandGradientOp final : public Operator<Context> {
         axes.push_back(i);
       }
     }
+    std::vector<int> X_dims = dY_dims;
+    for (const int axis : axes) {
+      X_dims[axis] = 1;
+    }
     math::ReduceSum<T, Context>(
         dY_dims.size(),
         dY_dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.data(),
         T(1),
         dY.template data<T>(),
         dX->template mutable_data<T>(),
diff --git a/caffe2/operators/pool_op.cc b/caffe2/operators/pool_op.cc
index 314e7be..a5bc914 100644
--- a/caffe2/operators/pool_op.cc
+++ b/caffe2/operators/pool_op.cc
@@ -572,10 +572,10 @@ bool AveragePoolFunctor<CPUContext>::
     const float* X,
     float* Y,
     CPUContext* context) const {
-  const std::array<int, 2> dims = {N * C, HxW};
-  const int axis = 1;
+  const std::array<int, 2> X_dims = {N * C, HxW};
+  const std::array<int, 2> Y_dims = {N * C, 1};
   math::ReduceMean<float, CPUContext>(
-      2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
 
@@ -720,10 +720,10 @@ bool MaxPoolFunctor<CPUContext>::
     const float* X,
     float* Y,
     CPUContext* context) const {
-  const std::array<int, 2> dims = {N * C, HxW};
-  const int axis = 1;
+  const std::array<int, 2> X_dims = {N * C, HxW};
+  const std::array<int, 2> Y_dims = {N * C, 1};
   math::ReduceMax<float, CPUContext>(
-      2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
 
diff --git a/caffe2/operators/pool_op.cu b/caffe2/operators/pool_op.cu
index b9f6050..6fd1370 100644
--- a/caffe2/operators/pool_op.cu
+++ b/caffe2/operators/pool_op.cu
@@ -698,10 +698,10 @@ bool AveragePoolFunctor<CUDAContext>::
     const float* X,
     float* Y,
     CUDAContext* context) const {
-  const std::array<int, 2> dims = {N * C, HxW};
-  const int axis = 1;
+  const std::array<int, 2> X_dims = {N * C, HxW};
+  const std::array<int, 2> Y_dims = {N * C, 1};
   math::ReduceMean<float, CUDAContext>(
-      2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
 
@@ -1756,10 +1756,10 @@ bool MaxPoolFunctor<CUDAContext>::
     const float* X,
     float* Y,
     CUDAContext* context) const {
-  const std::array<int, 2> dims = {N * C, HxW};
-  const int axis = 1;
+  const std::array<int, 2> X_dims = {N * C, HxW};
+  const std::array<int, 2> Y_dims = {N * C, 1};
   math::ReduceMax<float, CUDAContext>(
-      2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
 
@@ -1773,10 +1773,10 @@ bool MaxPoolFunctor<CUDAContext>::
     const float* X,
     float* Y,
     CUDAContext* context) const {
-  const std::array<int, 3> dims = {N, HxW, C};
-  const int axis = 1;
+  const std::array<int, 3> X_dims = {N, HxW, C};
+  const std::array<int, 3> Y_dims = {N, 1, C};
   math::ReduceMax<float, CUDAContext>(
-      3, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      3, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
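The call-site pattern above repeats across the whole diff: instead of passing (num_axes, axes), callers now pass a full Y_dims array obtained by setting each reduced dimension of X_dims to 1. A dependency-free sketch of that conversion (the helper name AxesToYDims is mine, not part of the diff):

    #include <cstdio>
    #include <vector>

    // Translate the old (dims, axes) convention into the new (X_dims, Y_dims)
    // convention used by math::Reduce* after this change.
    static std::vector<int> AxesToYDims(
        const std::vector<int>& X_dims, const std::vector<int>& axes) {
      std::vector<int> Y_dims = X_dims;
      for (const int axis : axes) {
        Y_dims[axis] = 1;  // a 1 marks a dimension to be reduced
      }
      return Y_dims;
    }

    int main() {
      // Global average pooling on an NCHW tensor flattened to {N * C, HxW}:
      // the old code passed axis = 1; the new code passes Y_dims = {N * C, 1}.
      const std::vector<int> X_dims = {8 * 16, 49};  // N*C = 128, HxW = 49
      const std::vector<int> Y_dims = AxesToYDims(X_dims, {1});
      std::printf("Y_dims = {%d, %d}\n", Y_dims[0], Y_dims[1]);  // {128, 1}
      return 0;
    }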
diff --git a/caffe2/operators/reduce_ops.h b/caffe2/operators/reduce_ops.h
index ab03513..f9515ae 100644
--- a/caffe2/operators/reduce_ops.h
+++ b/caffe2/operators/reduce_ops.h
@@ -29,13 +29,13 @@ class ReduceOp final : public Operator<Context> {
   template <typename T>
   bool DoRunWithType() {
     const auto& X = Input(0);
     const int ndim = X.dim();
+    const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
     if (axes_.empty()) {
       axes_.resize(ndim);
       std::iota(axes_.begin(), axes_.end(), 0);
     } else {
-      for (auto& axis: axes_) {
+      for (auto& axis : axes_) {
         axis = X.canonical_axis_index(axis);
       }
       std::sort(axes_.begin(), axes_.end());
@@ -45,24 +45,29 @@ class ReduceOp final : public Operator<Context> {
           ndim,
           "Axes ids must be smaller than the dimensions of input.");
     }
-    const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
-    std::vector<std::int64_t> Y_dims;
-    Y_dims.reserve(ndim);
+    std::vector<std::int64_t> output_dims;
+    output_dims.reserve(ndim);
     std::size_t cur_axis = 0;
     for (int i = 0; i < ndim; ++i) {
       if (cur_axis < axes_.size() && i == axes_[cur_axis]) {
         if (keep_dims_) {
-          Y_dims.push_back(1);
+          output_dims.push_back(1);
         }
         ++cur_axis;
       } else {
-        Y_dims.push_back(X_dims[i]);
+        output_dims.push_back(X_dims[i]);
       }
     }
-    auto* Y = Output(0, Y_dims, at::dtype<T>());
+    auto* Y = Output(0, output_dims, at::dtype<T>());
+
+    std::vector<int> Y_dims = X_dims;
+    for (const int axis : axes_) {
+      Y_dims[axis] = 1;
+    }
+
     return reducer_.template Forward<T>(
         X_dims,
-        axes_,
+        Y_dims,
         X.template data<T>(),
         Y->template mutable_data<T>(),
         &context_);
@@ -71,7 +76,7 @@ class ReduceOp final : public Operator<Context> {
  private:
   std::vector<int> axes_;
   const int keep_dims_;
-  Reducer reducer_{};
+  const Reducer reducer_{};
 };
 
 template <class Context>
@@ -98,7 +103,7 @@ class ReduceGradientOp final : public Operator<Context> {
       axes_.resize(ndim);
       std::iota(axes_.begin(), axes_.end(), 0);
     } else {
-      for (auto& axis: axes_) {
+      for (auto& axis : axes_) {
         axis = X.canonical_axis_index(axis);
       }
       std::sort(axes_.begin(), axes_.end());
@@ -126,23 +131,22 @@ class ReduceGradientOp final : public Operator<Context> {
 
  private:
   std::vector<int> axes_;
-  Reducer reducer_{};
+  const Reducer reducer_{};
 };
 
 template <class Context>
 struct MinReducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceMin<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
@@ -165,16 +169,15 @@ template <class Context>
 struct MaxReducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceMax<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
@@ -197,16 +200,15 @@ template <class Context>
 struct SumReducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceSum<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
@@ -240,16 +242,15 @@ template <class Context>
 struct MeanReducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceMean<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
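Worth calling out in the ReduceOp change above: output_dims (the allocated output shape, which honors keepdims) is now distinct from Y_dims (the full-rank shape handed to the kernels, with reduced dimensions set to 1). A small self-contained illustration of the two shapes:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      // Mirrors ReduceOp::DoRunWithType above: reduce a {2, 3, 4} tensor
      // over axes {0, 2} with keepdims = false.
      const std::vector<int> X_dims = {2, 3, 4};
      const std::vector<int> axes = {0, 2};
      const bool keep_dims = false;

      // output_dims: the shape the operator actually allocates ({3} here,
      // or {1, 3, 1} with keepdims = true).
      std::vector<int> output_dims;
      std::size_t cur_axis = 0;
      for (int i = 0; i < static_cast<int>(X_dims.size()); ++i) {
        if (cur_axis < axes.size() && i == axes[cur_axis]) {
          if (keep_dims) output_dims.push_back(1);
          ++cur_axis;
        } else {
          output_dims.push_back(X_dims[i]);
        }
      }

      // Y_dims: what the math::Reduce* kernels see -- always full rank,
      // with reduced dimensions set to 1 ({1, 3, 1} here).
      std::vector<int> Y_dims = X_dims;
      for (const int axis : axes) Y_dims[axis] = 1;

      std::printf("output_dims size = %zu, Y_dims = {%d, %d, %d}\n",
                  output_dims.size(), Y_dims[0], Y_dims[1], Y_dims[2]);
      return 0;
    }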
@@ -287,16 +288,15 @@ template <class Context>
 struct L1Reducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceL1<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
@@ -319,16 +319,15 @@ template <class Context>
 struct L2Reducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceL2<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h
index 2ea9601..a3dfdc1 100644
--- a/caffe2/utils/math.h
+++ b/caffe2/utils/math.h
@@ -31,34 +31,34 @@ class CAFFE2_API DefaultEngine {};
 
 namespace math {
 
-#define C10_DECLARE_COMPARE_OP(Comp) \
-  template <typename T, class Context> \
-  void Rowwise##Comp( \
-      const int rows, \
-      const int cols, \
-      const T* A, \
-      const T* B, \
-      bool* C, \
-      Context* context); \
- \
-  template <typename T, class Context> \
-  void Colwise##Comp( \
-      const int rows, \
-      const int cols, \
-      const T* A, \
-      const T* B, \
-      bool* C, \
-      Context* context); \
- \
-  template <typename T, class Context> \
-  void Comp( \
-      const int A_ndim, \
-      const int* A_dims, \
-      const int B_ndim, \
-      const int* B_dims, \
-      const T* A, \
-      const T* B, \
-      bool* C, \
+#define C10_DECLARE_COMPARE_OP(Comp)      \
+  template <typename T, class Context>    \
+  void Rowwise##Comp(                     \
+      const int rows,                     \
+      const int cols,                     \
+      const T* A,                         \
+      const T* B,                         \
+      bool* C,                            \
+      Context* context);                  \
+                                          \
+  template <typename T, class Context>    \
+  void Colwise##Comp(                     \
+      const int rows,                     \
+      const int cols,                     \
+      const T* A,                         \
+      const T* B,                         \
+      bool* C,                            \
+      Context* context);                  \
+                                          \
+  template <typename T, class Context>    \
+  void Comp(                              \
+      const int A_ndim,                   \
+      const int* A_dims,                  \
+      const int B_ndim,                   \
+      const int* B_dims,                  \
+      const T* A,                         \
+      const T* B,                         \
+      bool* C,                            \
       Context* context);
 
 C10_DECLARE_COMPARE_OP(EQ)
@@ -115,80 +115,6 @@ C10_DECLARE_BINARY_OP(BitwiseXor)
 
 #undef C10_DECLARE_BINARY_OP
 
-template <typename T, class Context>
-CAFFE2_API void
-ReduceMin(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void
-ReduceMax(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceMin(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceMax(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceSum(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceMean(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceL1(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceL2(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
 // Broadcasts X with X_dims to Y with Y_dims.
 template <typename T, class Context>
 CAFFE2_API void Broadcast(
@@ -338,9 +264,6 @@ CAFFE2_API void Gemv(
     TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
 
 template <typename T, class Context>
-CAFFE2_API void Set(const size_t N, const T alpha, T* X, Context* context);
-
-template <typename T, class Context>
 CAFFE2_API void
 RandUniform(const size_t n, const T a, const T b, T* r, Context* context);
 
@@ -409,25 +332,6 @@ CAFFE2_API void Select(
     T* y,
     Context* context);
 
-template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Scale(
-    const int N,
-    const TAlpha alpha,
-    const TData* x,
-    TData* y,
-    Context* context);
-
-// Different from the Scale function above, if alpha is passed in
-// as a pointer, we will assume that it lives on the Context device,
-// for example on GPU.
-template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Scale(
-    const int N,
-    const TAlpha* alpha,
-    const TData* x,
-    TData* y,
-    Context* context);
-
 template <typename T, class Context>
 CAFFE2_API void
 Axpy(const int N, const float alpha, const T* x, T* y, Context* context);
 
diff --git a/caffe2/utils/math/elementwise.cc b/caffe2/utils/math/elementwise.cc
index f392829..08d723c 100644
--- a/caffe2/utils/math/elementwise.cc
+++ b/caffe2/utils/math/elementwise.cc
@@ -3,6 +3,10 @@
 #include <algorithm>
 #include <functional>
 
+#ifdef CAFFE2_USE_ACCELERATE
+#include <Accelerate/Accelerate.h>
+#endif // CAFFE2_USE_ACCELERATE
+
 #ifdef CAFFE2_USE_MKL
 #include <mkl.h>
 #endif // CAFFE2_USE_MKL
@@ -73,25 +77,25 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, Erf, vsErf)
 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Erf, vdErf)
 #undef DELEGATE_SIMPLE_UNARY_FUNCTION
 
-#define DELEGATE_SINCOS_FUNCTION(T, MKLFunc) \
+#define DELEGATE_SINCOS(T, MKLFunc) \
   template <> \
   C10_EXPORT void SinCos<T, CPUContext>( \
       const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \
     MKLFunc(N, X, S, C); \
   }
-DELEGATE_SINCOS_FUNCTION(float, vsSinCos)
-DELEGATE_SINCOS_FUNCTION(double, vdSinCos)
-#undef DELEGATE_SINCOS_FUNCTION
+DELEGATE_SINCOS(float, vsSinCos)
+DELEGATE_SINCOS(double, vdSinCos)
+#undef DELEGATE_SINCOS
 
-#define DELEGATE_POWX_FUNCTION(T, MKLFunc) \
+#define DELEGATE_POWX(T, MKLFunc) \
   template <> \
   C10_EXPORT void Powx<T, CPUContext>( \
       const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \
     MKLFunc(N, A, b, Y); \
   }
-DELEGATE_POWX_FUNCTION(float, vsPowx)
-DELEGATE_POWX_FUNCTION(double, vdPowx)
-#undef DELEGATE_POWX_FUNCTION
+DELEGATE_POWX(float, vsPowx)
+DELEGATE_POWX(double, vdPowx)
+#undef DELEGATE_POWX
 
 #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, MKLFunc) \
   template <> \
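As a reading aid: expanding one of the renamed delegation macros shows what these DELEGATE_* helpers generate. Hand-expanding DELEGATE_SINCOS(float, vsSinCos) gives, modulo whitespace, a sketch like the following (assuming the surrounding caffe2::math namespace and MKL headers):

    // Hand-expanded form of DELEGATE_SINCOS(float, vsSinCos): a template
    // specialization that forwards to the MKL vector-math routine.
    template <>
    C10_EXPORT void SinCos<float, CPUContext>(
        const int N, const float* X, float* S, float* C, CPUContext* /* context */) {
      vsSinCos(N, X, S, C);  // computes S[i] = sin(X[i]), C[i] = cos(X[i])
    }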
@@ -228,6 +232,155 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Div, /)
 
 #endif // CAFFE2_USE_MKL
 
+////////////////////////////////////////////////////////////////////////////////
+// BLAS alternatives.
+// Depending on whether we have specified an external BLAS library or not, we
+// will delegate the Caffe math functions that are BLAS-related to either the
+// CBLAS call or the Eigen implementation.
+////////////////////////////////////////////////////////////////////////////////
+#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
+
+#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    if (X == Y) { \
+      EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha); \
+    } else { \
+      EigenVectorArrayMap<TData>(Y, N) = \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
+    } \
+  } \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha* alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    if (X == Y) { \
+      EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha); \
+    } else { \
+      EigenVectorArrayMap<TData>(Y, N) = \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
+    } \
+  }
+CAFFE2_SPECIALIZED_SCALE(float, float)
+CAFFE2_SPECIALIZED_SCALE(double, double)
+CAFFE2_SPECIALIZED_SCALE(float, double)
+#undef CAFFE2_SPECIALIZED_SCALE
+
+#else // CAFFE2_USE_EIGEN_FOR_BLAS
+
+#ifdef CAFFE2_USE_MKL
+
+#define DELEGATE_SCALE(TAlpha, TData, MKLFunc1, MKLFunc2) \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    if (Y == X) { \
+      MKLFunc1(N, static_cast<TData>(alpha), Y, 1); \
+    } else { \
+      MKLFunc2(N, static_cast<TData>(alpha), X, 1, TData(0), Y, 1); \
+    } \
+  } \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha* alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    if (Y == X) { \
+      MKLFunc1(N, static_cast<TData>(*alpha), Y, 1); \
+    } else { \
+      MKLFunc2(N, static_cast<TData>(*alpha), X, 1, TData(0), Y, 1); \
+    } \
+  }
+DELEGATE_SCALE(float, float, cblas_sscal, cblas_saxpby)
+DELEGATE_SCALE(double, double, cblas_dscal, cblas_daxpby)
+DELEGATE_SCALE(float, double, cblas_dscal, cblas_daxpby)
+#undef DELEGATE_SCALE
+
+#else // CAFFE2_USE_MKL
+
+#define DELEGATE_SCALE(TAlpha, TData, BLASFunc) \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    if (Y == X) { \
+      BLASFunc(N, static_cast<TData>(alpha), Y, 1); \
+    } else { \
+      EigenVectorArrayMap<TData>(Y, N) = \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
+    } \
+  } \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha* alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    if (Y == X) { \
+      BLASFunc(N, static_cast<TData>(*alpha), Y, 1); \
+    } else { \
+      EigenVectorArrayMap<TData>(Y, N) = \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
+    } \
+  }
+DELEGATE_SCALE(float, float, cblas_sscal)
+DELEGATE_SCALE(double, double, cblas_dscal)
+DELEGATE_SCALE(float, double, cblas_dscal)
+#undef DELEGATE_SCALE
+
+#endif // CAFFE2_USE_MKL
+
+#endif // CAFFE2_USE_EIGEN_FOR_BLAS
+
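The contract shared by all three Scale backends above is the same: Y = alpha * X, with a dedicated in-place path when X and Y alias. A plain-C++ rendering of that contract (a reference sketch, not the Caffe2 implementation):

    #include <cstdio>
    #include <vector>

    // Reference semantics of Scale: Y = alpha * X. The Eigen/MKL/BLAS
    // variants in the diff pick different kernels for exactly the
    // in-place vs. out-of-place distinction below.
    template <typename TAlpha, typename TData>
    void ScaleRef(int N, TAlpha alpha, const TData* X, TData* Y) {
      if (X == Y) {
        for (int i = 0; i < N; ++i) {
          Y[i] *= static_cast<TData>(alpha);  // in-place: scal-style
        }
      } else {
        for (int i = 0; i < N; ++i) {
          Y[i] = X[i] * static_cast<TData>(alpha);  // copy-scale: axpby-style
        }
      }
    }

    int main() {
      std::vector<double> x = {1.0, 2.0, 3.0};
      std::vector<double> y(3);
      ScaleRef(3, 2.0f, x.data(), y.data());  // mixed TAlpha/TData, like (float, double)
      ScaleRef(3, 10.0, y.data(), y.data());  // in-place path
      std::printf("%g %g %g\n", y[0], y[1], y[2]);  // 20 40 60
      return 0;
    }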
+////////////////////////////////////////////////////////////////////////////////
+// Common math functions being used in Caffe that do not have a BLAS or MKL
+// equivalent. For all these functions, we will simply implement them either via
+// Eigen or via custom code.
+////////////////////////////////////////////////////////////////////////////////
+
+#define CAFFE2_SPECIALIZED_SET(T) \
+  template <> \
+  C10_EXPORT void Set<T, CPUContext>( \
+      const int N, const T alpha, T* Y, CPUContext* /* context */) { \
+    if (N == 0) { \
+      return; \
+    } \
+    if (alpha == T(0)) { \
+      std::memset(Y, 0, N * sizeof(T)); \
+    } else { \
+      EigenVectorArrayMap<T>(Y, N).setConstant(alpha); \
+    } \
+  }
+CAFFE2_SPECIALIZED_SET(float)
+CAFFE2_SPECIALIZED_SET(double)
+CAFFE2_SPECIALIZED_SET(int)
+CAFFE2_SPECIALIZED_SET(std::int8_t)
+CAFFE2_SPECIALIZED_SET(std::int16_t)
+CAFFE2_SPECIALIZED_SET(std::int64_t)
+CAFFE2_SPECIALIZED_SET(bool)
+CAFFE2_SPECIALIZED_SET(char)
+CAFFE2_SPECIALIZED_SET(std::uint8_t)
+CAFFE2_SPECIALIZED_SET(std::uint16_t)
+#undef CAFFE2_SPECIALIZED_SET
+
 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \
   template <> \
   C10_EXPORT void Func( \
@@ -262,6 +415,39 @@ CAFFE2_SPECIALIZED_NEG(float)
 CAFFE2_SPECIALIZED_NEG(double)
 #undef CAFFE2_SPECIALIZED_NEG
 
+#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    if (X == Y) { \
+      EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha); \
+    } else { \
+      EigenVectorArrayMap<TData>(Y, N) = \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
+    } \
+  } \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha* alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    if (X == Y) { \
+      EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha); \
+    } else { \
+      EigenVectorArrayMap<TData>(Y, N) = \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
+    } \
+  }
+CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
+CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
+#undef CAFFE2_SPECIALIZED_SCALE
+
 #define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \
   template <> \
   C10_EXPORT void Func( \
@@ -286,8 +472,12 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Div, /)
     EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N).EigenFunc( \
         ConstEigenVectorArrayMap<T>(B, N)); \
   }
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Min, min)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Min, min)
 DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Min, min)
 DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Min, min)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Max, max)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Max, max)
 DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Max, max)
 DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Max, max)
 #undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION
diff --git a/caffe2/utils/math/elementwise.h b/caffe2/utils/math/elementwise.h
index b11d22b..32ef6be 100644
--- a/caffe2/utils/math/elementwise.h
+++ b/caffe2/utils/math/elementwise.h
@@ -57,6 +57,19 @@ template <typename T, class Context>
 CAFFE2_API void Erf(int N, const T* X, T* Y, Context* context);
 
 template <typename T, class Context>
+CAFFE2_API void Set(int N, T alpha, T* X, Context* context);
+
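The Set specialization above has one notable fast path: an all-zero fill goes through memset, which is valid because integral zero and IEEE-754 +0.0 are all-zero bytes. A reference rendering (a sketch of the semantics, not the Caffe2 code itself):

    #include <cstdio>
    #include <cstring>

    // Reference semantics of Set: fill Y with alpha, using memset when
    // alpha == 0 (all-zero object representation for the types listed).
    template <typename T>
    void SetRef(int N, T alpha, T* Y) {
      if (N == 0) return;
      if (alpha == T(0)) {
        std::memset(Y, 0, N * sizeof(T));  // fast path
      } else {
        for (int i = 0; i < N; ++i) Y[i] = alpha;  // generic fill
      }
    }

    int main() {
      float buf[4];
      SetRef(4, 0.0f, buf);  // memset path
      SetRef(4, 2.5f, buf);  // generic path
      std::printf("%g %g\n", buf[0], buf[3]);  // 2.5 2.5
      return 0;
    }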
+template <typename TAlpha, typename TData, class Context>
+CAFFE2_API void
+Scale(int N, TAlpha alpha, const TData* X, TData* Y, Context* context);
+
+// Different from the Scale function above, if alpha is passed in as a pointer,
+// we will assume that it lives on the Context device, for example on GPU.
+template <typename TAlpha, typename TData, class Context>
+CAFFE2_API void
+Scale(int N, const TAlpha* alpha, const TData* X, TData* Y, Context* context);
+
+template <typename T, class Context>
 CAFFE2_API void Add(int N, const T* A, const T* B, T* C, Context* context);
 
 template <typename T, class Context>
 CAFFE2_API void Sub(int N, const T* A, const T* B, T* C, Context* context);
diff --git a/caffe2/utils/math/reduce.cc b/caffe2/utils/math/reduce.cc
index 4bcbb1c..bf13634 100644
--- a/caffe2/utils/math/reduce.cc
+++ b/caffe2/utils/math/reduce.cc
@@ -6,8 +6,17 @@
 #include <numeric>
 #include <vector>
 
+#ifdef CAFFE2_USE_ACCELERATE
+#include <Accelerate/Accelerate.h>
+#endif // CAFFE2_USE_ACCELERATE
+
+#ifdef CAFFE2_USE_MKL
+#include <mkl.h>
+#endif // CAFFE2_USE_MKL
+
 #include "caffe2/core/context.h"
 #include "caffe2/utils/eigen_utils.h"
+#include "caffe2/utils/math/elementwise.h"
 #include "caffe2/utils/math/utils.h"
 
 namespace caffe2 {
@@ -15,9 +24,385 @@ namespace math {
 
 namespace {
 
+#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenFunc) \
+  template <typename T> \
+  void Rowwise##Func( \
+      const int rows, \
+      const int cols, \
+      const T alpha, \
+      const T* X, \
+      T* Y, \
+      CPUContext* /* context */) { \
+    EigenVectorMap<T>(Y, rows) = \
+        ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenFunc() * alpha; \
+  }
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
+#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
+
+#ifndef CAFFE2_USE_EIGEN_FOR_BLAS
+
+#define DELEGATE_ROWWISE_REDUCE_FUNCTION(T, Func, BLASFunc) \
+  template <> \
+  void Rowwise##Func<T>( \
+      const int rows, \
+      const int cols, \
+      const T alpha, \
+      const T* X, \
+      T* Y, \
+      CPUContext* /* context */) { \
+    for (int i = 0; i < rows; ++i) { \
+      Y[i] = BLASFunc(cols, X + i * cols, 1) * alpha; \
+    } \
+  }
+DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL1, cblas_sasum)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL1, cblas_dasum)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL2, cblas_snrm2)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL2, cblas_dnrm2)
+#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
+
+#endif // CAFFE2_USE_EIGEN_FOR_BLAS
+
+#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, MathFunc) \
+  template <typename T> \
+  void Colwise##Func( \
+      const int rows, \
+      const int cols, \
+      const T alpha, \
+      const T* X, \
+      T* Y, \
+      CPUContext* context) { \
+    std::memcpy(Y, X, sizeof(T) * cols); \
+    for (int i = 1; i < rows; ++i) { \
+      MathFunc<T, CPUContext>(cols, Y, X + i * cols, Y, context); \
+    } \
+    Scale<T, T, CPUContext>(cols, alpha, Y, Y, context); \
+  }
+DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, Min)
+DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, Max)
+DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, Add)
+#undef DELEGATE_COLWISE_REDUCE_FUNCTION
+
+template <typename T>
+void ColwiseReduceMean(
+    const int rows,
+    const int cols,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ColwiseReduceSum(rows, cols, alpha / static_cast<T>(rows), X, Y, context);
+}
+
+template <typename T>
+void ColwiseReduceL1(
+    const int rows,
+    const int cols,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ConstEigenArrayMap<T> X_arr(X, cols, rows);
+  EigenVectorArrayMap<T> Y_arr(Y, cols);
+  Y_arr = X_arr.col(0).abs();
+  for (int i = 1; i < rows; ++i) {
+    Y_arr += X_arr.col(i).abs();
+  }
+  Scale<T, T, CPUContext>(cols, alpha, Y, Y, context);
+}
+
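A layout note that explains the Eigen calls above: caffe2 buffers are row-major, so mapping X as a col-major (cols x rows) Eigen matrix makes each logical row of the tensor one Eigen column; hence RowwiseReduce* uses .colwise(). A dependency-free reference loop for the rowwise case:

    #include <cstdio>

    // Reference for RowwiseReduceSum: reduce each contiguous row of a
    // row-major (rows x cols) buffer. No Eigen needed to see the semantics.
    static void RowwiseSumRef(int rows, int cols, float alpha,
                              const float* X, float* Y) {
      for (int i = 0; i < rows; ++i) {
        float acc = 0.0f;
        for (int j = 0; j < cols; ++j) {
          acc += X[i * cols + j];  // contiguous scan of one row
        }
        Y[i] = acc * alpha;
      }
    }

    int main() {
      const float X[2 * 3] = {1, 2, 3, 4, 5, 6};  // 2 x 3, row-major
      float Y[2];
      RowwiseSumRef(2, 3, 1.0f, X, Y);
      std::printf("%g %g\n", Y[0], Y[1]);  // 6 15
      return 0;
    }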
+template <typename T>
+void ColwiseReduceL2(
+    const int rows,
+    const int cols,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* /* context */) {
+  ConstEigenArrayMap<T> X_arr(X, cols, rows);
+  EigenVectorArrayMap<T> Y_arr(Y, cols);
+  Y_arr = X_arr.col(0).square();
+  for (int i = 1; i < rows; ++i) {
+    Y_arr += X_arr.col(i).square();
+  }
+  Y_arr = Y_arr.sqrt() * alpha;
+}
+
+template <typename T>
+void BothEndsReduceMin(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> Y_arr(Y, N);
+  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().minCoeff();
+  for (int i = 1; i < M; ++i) {
+    ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
+    for (int j = 0; j < N; ++j) {
+      Y[j] = std::min(Y[j], X_arr.col(j).minCoeff());
+    }
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceMax(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> Y_arr(Y, N);
+  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().maxCoeff();
+  for (int i = 1; i < M; ++i) {
+    ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
+    for (int j = 0; j < N; ++j) {
+      Y[j] = std::max(Y[j], X_arr.col(j).maxCoeff());
+    }
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceSum(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> Y_arr(Y, N);
+  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().sum();
+  for (int i = 1; i < M; ++i) {
+    Y_arr += ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().sum();
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceMean(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> Y_arr(Y, N);
+  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().mean();
+  for (int i = 1; i < M; ++i) {
+    Y_arr += ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().mean();
+  }
+  Scale<T, T, CPUContext>(N, alpha / static_cast<T>(M), Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceL1(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorMap<T> Y_vec(Y, N);
+  Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().template lpNorm<1>();
+  for (int i = 1; i < M; ++i) {
+    Y_vec += ConstEigenMatrixMap<T>(X + i * N * K, K, N)
+                 .colwise()
+                 .template lpNorm<1>();
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceL2(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* /* context */) {
+  EigenVectorMap<T> Y_vec(Y, N);
+  Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().squaredNorm();
+  for (int i = 1; i < M; ++i) {
+    Y_vec +=
+        ConstEigenMatrixMap<T>(X + i * N * K, K, N).colwise().squaredNorm();
+  }
+  Y_vec = Y_vec.cwiseSqrt() * alpha;
+}
+
+template <typename T, class Reducer>
+void ReduceTensorImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const Reducer& reducer,
+    const T init,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  const int X_size =
+      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Set<T, CPUContext>(Y_size, init, Y, context);
+  std::vector<int> index(ndim, 0);
+  for (int X_index = 0; X_index < X_size; ++X_index) {
+    const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
+    Y[Y_index] = reducer(Y[Y_index], X[X_index]);
+    utils::IncreaseIndexInDims(ndim, X_dims, index.data());
+  }
+}
+
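ReduceTensorImpl is the generic fallback: it walks X linearly while an odometer-style index vector tracks the coordinate, and the Y_dims entries equal to 1 collapse the reduced coordinates. A runnable sketch with hypothetical re-implementations of the two utils:: helpers (names mirror caffe2/utils/math/utils.h, but the bodies here are mine):

    #include <cstdio>
    #include <vector>

    // Hypothetical reference for utils::GetIndexFromDims: a dim of 1
    // collapses that coordinate to 0, which lets the same index vector
    // address both X (full dims) and Y (reduced dims).
    static int GetIndexFromDims(int ndim, const int* dims, const int* index) {
      int idx = 0;
      for (int i = 0; i < ndim; ++i) {
        idx = idx * dims[i] + (dims[i] == 1 ? 0 : index[i]);
      }
      return idx;
    }

    // Hypothetical reference for utils::IncreaseIndexInDims: odometer tick.
    static void IncreaseIndexInDims(int ndim, const int* dims, int* index) {
      for (int i = ndim - 1; i >= 0; --i) {
        if (++index[i] < dims[i]) return;
        index[i] = 0;
      }
    }

    int main() {
      // ReduceSum of a {2, 3} tensor to {1, 3}: every X element accumulates
      // into Y at the same column, with the row coordinate collapsed.
      const int X_dims[2] = {2, 3}, Y_dims[2] = {1, 3};
      const float X[6] = {1, 2, 3, 4, 5, 6};
      float Y[3] = {0, 0, 0};
      std::vector<int> index(2, 0);
      for (int x = 0; x < 6; ++x) {
        Y[GetIndexFromDims(2, Y_dims, index.data())] += X[x];
        IncreaseIndexInDims(2, X_dims, index.data());
      }
      std::printf("%g %g %g\n", Y[0], Y[1], Y[2]);  // 5 7 9
      return 0;
    }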
+template <typename T>
+void ReduceMinImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(
+      ndim,
+      X_dims,
+      Y_dims,
+      [](const T a, const T b) { return std::min(a, b); },
+      std::numeric_limits<T>::max(),
+      X,
+      Y,
+      context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
+}
+
+template <typename T>
+void ReduceMaxImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(
+      ndim,
+      X_dims,
+      Y_dims,
+      [](const T a, const T b) { return std::max(a, b); },
+      std::numeric_limits<T>::lowest(),
+      X,
+      Y,
+      context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
+}
+
+template <typename T>
+void ReduceSumImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
+}
+
+template <typename T>
+void ReduceMeanImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
+  const int X_size =
+      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(
+      Y_size,
+      alpha * static_cast<T>(Y_size) / static_cast<T>(X_size),
+      Y,
+      Y,
+      context);
+}
+
+template <typename T>
+void ReduceL1Impl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(
+      ndim,
+      X_dims,
+      Y_dims,
+      [](const T a, const T b) { return a + std::abs(b); },
+      T(0),
+      X,
+      Y,
+      context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
+}
+
+template <typename T>
+void ReduceL2Impl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(
+      ndim,
+      X_dims,
+      Y_dims,
+      [](const T a, const T b) { return a + b * b; },
+      T(0),
+      X,
+      Y,
+      context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  EigenVectorArrayMap<T> Y_arr(Y, Y_size);
+  Y_arr = Y_arr.sqrt() * alpha;
+}
+
 template <typename T>
-C10_EXPORT void
-RowwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
+void RowwiseMoments(
+    const int rows,
+    const int cols,
+    const T* X,
+    T* mean,
+    T* var) {
   ConstEigenArrayMap<T> X_arr(X, cols, rows);
   EigenVectorArrayMap<T> mean_arr(mean, rows);
   EigenVectorArrayMap<T> var_arr(var, rows);
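The Moments kernels in the hunks around here all use the same one-pass identity: accumulate sum(x) and sum(x^2), then var = E[x^2] - mean^2. A tiny worked check of that identity:

    #include <cstdio>

    // One-pass moments, exactly as ColwiseMoments/BothEndsMoments compute them.
    int main() {
      const float x[4] = {1, 2, 3, 4};
      float s = 0, s2 = 0;
      for (float v : x) { s += v; s2 += v * v; }       // s = 10, s2 = 30
      const float mean = s / 4;                        // 2.5
      const float var = s2 / 4 - mean * mean;          // 7.5 - 6.25 = 1.25
      std::printf("mean=%g var=%g\n", mean, var);
      return 0;
    }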
@@ -26,15 +411,19 @@ RowwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
 }
 
 template <typename T>
-C10_EXPORT void
-ColwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
-  std::memset(mean, 0, sizeof(T) * cols);
-  std::memset(var, 0, sizeof(T) * cols);
+void ColwiseMoments(
+    const int rows,
+    const int cols,
+    const T* X,
+    T* mean,
+    T* var) {
   ConstEigenArrayMap<T> X_arr(X, cols, rows);
   EigenVectorArrayMap<T> mean_arr(mean, cols);
   EigenVectorArrayMap<T> var_arr(var, cols);
   // Eigen rowwise reduction is about 10 times slower than this for-loop.
-  for (int i = 0; i < rows; ++i) {
+  mean_arr = X_arr.col(0);
+  var_arr = X_arr.col(0).square();
+  for (int i = 1; i < rows; ++i) {
     mean_arr += X_arr.col(i);
     var_arr += X_arr.col(i).square();
   }
@@ -44,32 +433,30 @@ ColwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
 
 template <typename T>
-C10_EXPORT void BothEndsMoments(
-    const int pre,
-    const int mid,
-    const int nxt,
+void BothEndsMoments(
+    const int M,
+    const int N,
+    const int K,
     const T* X,
     T* mean,
     T* var) {
-  std::memset(mean, 0, sizeof(T) * mid);
-  std::memset(var, 0, sizeof(T) * mid);
-  EigenVectorArrayMap<T> mean_arr(mean, mid);
-  EigenVectorArrayMap<T> var_arr(var, mid);
-  ConstEigenArrayMap<T> X_arr(X, nxt, pre * mid);
-  for (int i = 0; i < pre; ++i) {
-    for (int j = 0; j < mid; ++j) {
-      const int c = i * mid + j;
-      mean_arr(j) += X_arr.col(c).sum();
-      var_arr(j) += X_arr.col(c).square().sum();
-    }
+  EigenVectorArrayMap<T> mean_arr(mean, N);
+  EigenVectorArrayMap<T> var_arr(var, N);
+  ConstEigenArrayMap<T> X0_arr(X, K, N);
+  mean_arr = X0_arr.colwise().sum();
+  var_arr = X0_arr.square().colwise().sum();
+  for (int i = 1; i < M; ++i) {
+    ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
+    mean_arr += X_arr.colwise().sum();
+    var_arr += X_arr.square().colwise().sum();
   }
-  const T scale = T(1) / static_cast<T>(pre * nxt);
+  const T scale = T(1) / static_cast<T>(M * K);
   mean_arr *= scale;
   var_arr = var_arr * scale - mean_arr.square();
 }
 
 template <typename T>
-C10_EXPORT void MomentsImpl(
+void MomentsImpl(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
@@ -126,6 +513,128 @@ C10_EXPORT void MomentsImpl(
 
 } // namespace
 
+#define DELEGATE_GLOBAL_REDUCE_FUNCTION(T, Func, EigenFunc) \
+  template <> \
+  C10_EXPORT void Func<T, CPUContext>( \
+      const int N, \
+      const T* X, \
+      T* Y, \
+      Tensor* /* scratch_ptr */, \
+      CPUContext* /* context */) { \
+    *Y = ConstEigenVectorArrayMap<T>(X, N).EigenFunc(); \
+  }
+DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMin, minCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMin, minCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMin, minCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMax, maxCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMax, maxCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMax, maxCoeff)
+#undef DELEGATE_GLOBAL_REDUCE_FUNCTION
+
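On the "both ends" case used above: X has shape {M, N, K}, dims 0 and 2 are reduced, and only the middle dim N survives. A reference loop that matches BothEndsReduceSum:

    #include <cstdio>

    // Reduce dims 0 and 2 of a {M, N, K} row-major buffer; Y has N entries.
    int main() {
      const int M = 2, N = 3, K = 2;
      const float X[2 * 3 * 2] = {1, 2,  3, 4,  5, 6,
                                  7, 8,  9, 10, 11, 12};
      float Y[3] = {0, 0, 0};
      for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
          for (int k = 0; k < K; ++k)
            Y[j] += X[(i * N + j) * K + k];
      std::printf("%g %g %g\n", Y[0], Y[1], Y[2]);  // 18 26 34
      return 0;
    }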
+#define DELEGATE_REDUCE_FUNCTION(T, Func, kInit, kIsNorm) \
+  template <> \
+  C10_EXPORT void Func<T, CPUContext>( \
+      const int ndim, \
+      const int* X_dims, \
+      const int* Y_dims, \
+      const T alpha, \
+      const T* X, \
+      T* Y, \
+      CPUContext* context) { \
+    const int X_size = \
+        std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>()); \
+    const int Y_size = \
+        std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>()); \
+    if (X_size == 0) { \
+      Set<T, CPUContext>(Y_size, alpha * kInit, Y, context); \
+      return; \
+    } \
+    if (alpha == T(0)) { \
+      std::memset(Y, 0, sizeof(T) * Y_size); \
+      return; \
+    } \
+    if (std::equal(X_dims, X_dims + ndim, Y_dims)) { \
+      if (kIsNorm) { \
+        EigenVectorArrayMap<T>(Y, Y_size) = \
+            ConstEigenVectorArrayMap<T>(X, X_size).abs() * alpha; \
+      } else { \
+        Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
+      } \
+      return; \
+    } \
+    int rows; \
+    int cols; \
+    if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { \
+      Rowwise##Func(rows, cols, alpha, X, Y, context); \
+      return; \
+    } \
+    if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { \
+      Colwise##Func(rows, cols, alpha, X, Y, context); \
+      return; \
+    } \
+    int M; \
+    int N; \
+    int K; \
+    if (utils::IsBothEndsReduce(ndim, X_dims, Y_dims, &M, &N, &K)) { \
+      BothEnds##Func(M, N, K, alpha, X, Y, context); \
+      return; \
+    } \
+    Func##Impl(ndim, X_dims, Y_dims, alpha, X, Y, context); \
+  }
+DELEGATE_REDUCE_FUNCTION(
+    float,
+    ReduceMin,
+    std::numeric_limits<float>::max(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    double,
+    ReduceMin,
+    std::numeric_limits<double>::max(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    std::int32_t,
+    ReduceMin,
+    std::numeric_limits<std::int32_t>::max(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    std::int64_t,
+    ReduceMin,
+    std::numeric_limits<std::int64_t>::max(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    float,
+    ReduceMax,
+    std::numeric_limits<float>::lowest(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    double,
+    ReduceMax,
+    std::numeric_limits<double>::lowest(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    std::int32_t,
+    ReduceMax,
+    std::numeric_limits<std::int32_t>::lowest(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    std::int64_t,
+    ReduceMax,
+    std::numeric_limits<std::int64_t>::lowest(),
+    false)
+DELEGATE_REDUCE_FUNCTION(float, ReduceSum, 0.0f, false)
+DELEGATE_REDUCE_FUNCTION(double, ReduceSum, 0.0, false)
+DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceSum, 0, false)
+DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceSum, 0LL, false)
+DELEGATE_REDUCE_FUNCTION(float, ReduceMean, 0.0f, false)
+DELEGATE_REDUCE_FUNCTION(double, ReduceMean, 0.0, false)
+DELEGATE_REDUCE_FUNCTION(float, ReduceL1, 0.0f, true)
+DELEGATE_REDUCE_FUNCTION(double, ReduceL1, 0.0, true)
+DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceL1, 0, true)
+DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceL1, 0LL, true)
+DELEGATE_REDUCE_FUNCTION(float, ReduceL2, 0.0f, true)
+DELEGATE_REDUCE_FUNCTION(double, ReduceL2, 0.0, true)
+#undef DELEGATE_REDUCE_FUNCTION
+
 #define CAFFE2_SPECIALIZED_MOMENTS(T) \
   template <> \
   C10_EXPORT void Moments<T, CPUContext>( \
diff --git a/caffe2/utils/math/reduce.h b/caffe2/utils/math/reduce.h
index fce3c95..7f8b835 100644
--- a/caffe2/utils/math/reduce.h
+++ b/caffe2/utils/math/reduce.h
@@ -5,8 +5,90 @@
 #include "caffe2/core/types.h"
 
 namespace caffe2 {
+
+class Tensor;
+
 namespace math {
 
+template <typename T, class Context>
+CAFFE2_API void
+ReduceMin(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
+
+template <typename T, class Context>
+CAFFE2_API void
+ReduceMax(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
+
+// In all of the reduce functions, X_dims and Y_dims should have ndim elements.
+// Each dimension of Y_dims must match the corresponding dimension of X_dims or
+// must be equal to 1. The dimensions equal to 1 indicate the dimensions of X to
+// be reduced.
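A quick reference for the dispatch in DELEGATE_REDUCE_FUNCTION above (illustrative, matching the utils::Is*Reduce checks by shape pattern as I read them; note the checks also coalesce adjacent dims, so e.g. {2, 3, 4} -> {2, 1, 1} is rowwise with rows = 2, cols = 12):

    // Which fast path the reduce dispatch takes, by shape pattern:
    //
    //   X_dims       Y_dims       path
    //   {M, N}       {M, 1}       RowwiseReduce   (contiguous inner axis)
    //   {M, N}       {1, N}       ColwiseReduce
    //   {M, N, K}    {1, N, 1}    BothEndsReduce
    //   {M, N, K}    {M, 1, K}    generic ReduceTensorImpl (middle axis)
    //   X_dims == Y_dims          no reduction: Scale (or Abs for L1/L2)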
+
+// Y = alpha * ReduceMin(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceMin(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceMax(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceMax(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceSum(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceSum(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceMean(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceMean(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceL1(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceL1(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceL2(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceL2(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
 // Computes mean and variance over axes.
 template <typename T, class Context>
 CAFFE2_API void Moments(
@@ -19,6 +101,7 @@ CAFFE2_API void Moments(
     Context* context);
 
 } // namespace math
+
 } // namespace caffe2
 
 #endif // CAFFE2_UTILS_MATH_REDUCE_H_
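A hedged usage sketch of the relocated API (shape handling only; it assumes the caffe2 headers and a live CPUContext, so it is illustrative rather than buildable standalone):

    #include "caffe2/core/context.h"
    #include "caffe2/utils/math/reduce.h"

    // Reduce a {2, 3, 4} blob over its last axis: Y_dims marks dim 2 with a 1,
    // so Y holds 2 * 3 = 6 values, each the L2 norm of a length-4 slice.
    void Example(const float* X, float* Y, caffe2::CPUContext* context) {
      const int X_dims[3] = {2, 3, 4};
      const int Y_dims[3] = {2, 3, 1};
      caffe2::math::ReduceL2<float, caffe2::CPUContext>(
          3, X_dims, Y_dims, 1.0f, X, Y, context);  // Y = alpha * ||slice||_2
    }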
diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc
index 3bbc1ac..895b561 100644
--- a/caffe2/utils/math_cpu.cc
+++ b/caffe2/utils/math_cpu.cc
@@ -375,40 +375,6 @@ C10_EXPORT void Gemv(
   cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
-#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData, prefix) \
-  template <> \
-  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
-      const int n, \
-      const TAlpha alpha, \
-      const TData* x, \
-      TData* y, \
-      CPUContext*) { \
-    if (y != x) { \
-      cblas_##prefix##copy(n, x, 1, y, 1); \
-    } \
-    if (alpha != TAlpha(1)) { \
-      cblas_##prefix##scal(n, static_cast<TData>(alpha), y, 1); \
-    } \
-  } \
-  template <> \
-  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
-      const int n, \
-      const TAlpha* alpha, \
-      const TData* x, \
-      TData* y, \
-      CPUContext*) { \
-    if (y != x) { \
-      cblas_##prefix##copy(n, x, 1, y, 1); \
-    } \
-    if (*alpha != TAlpha(1)) { \
-      cblas_##prefix##scal(n, static_cast<TData>(*alpha), y, 1); \
-    } \
-  }
-CAFFE2_SPECIALIZED_SCALE(float, float, s)
-CAFFE2_SPECIALIZED_SCALE(double, double, d)
-CAFFE2_SPECIALIZED_SCALE(float, double, d)
-#undef CAFFE2_SPECIALIZED_SCALE
-
 #define CAFFE2_SPECIALIZED_DOT(T, prefix) \
   template <> \
   C10_EXPORT void Dot<T, CPUContext>( \
@@ -486,36 +452,6 @@ CAFFE2_SPECIALIZED_AXPBY(float, s)
 
 #endif // CAFFE2_USE_EIGEN_FOR_BLAS
 
-#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
-  template <> \
-  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
-      const int n, \
-      const TAlpha alpha, \
-      const TData* x, \
-      TData* y, \
-      CPUContext* /* context */) { \
-    EigenVectorMap<TData>(y, n) = \
-        ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(alpha); \
-  } \
-  template <> \
-  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
-      const int n, \
-      const TAlpha* alpha, \
-      const TData* x, \
-      TData* y, \
-      CPUContext* /* context */) { \
-    EigenVectorMap<TData>(y, n) = \
-        ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(*alpha); \
-  }
-#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
-CAFFE2_SPECIALIZED_SCALE(float, float)
-CAFFE2_SPECIALIZED_SCALE(double, double)
-CAFFE2_SPECIALIZED_SCALE(float, double)
-#endif // CAFFE2_USE_EIGEN_FOR_BLAS
-CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
-CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
-#undef CAFFE2_SPECIALIZED_SCALE
-
 template <>
 C10_EXPORT void GemmBatched<float, CPUContext>(
     const CBLAS_TRANSPOSE trans_A,
@@ -628,563 +564,6 @@ C10_EXPORT void GemmStridedBatched(
 // Eigen or via custom code.
 ////////////////////////////////////////////////////////////////////////////////
 
-#define CAFFE2_SPECIALIZED_SET(T) \
-  template <> \
-  C10_EXPORT void Set<T, CPUContext>( \
-      const size_t N, const T alpha, T* Y, CPUContext*) { \
-    if (N == 0) { \
-      return; \
-    } \
-    if (alpha == (T)0) { \
-      if (Y != nullptr) { \
-        std::memset(Y, 0, N * sizeof(T)); \
-      } \
-    } else { \
-      EigenVectorMap<T>(Y, N).setConstant(alpha); \
-    } \
-  }
-
-CAFFE2_SPECIALIZED_SET(float);
-CAFFE2_SPECIALIZED_SET(double);
-CAFFE2_SPECIALIZED_SET(int8_t);
-CAFFE2_SPECIALIZED_SET(int16_t);
-CAFFE2_SPECIALIZED_SET(int);
-CAFFE2_SPECIALIZED_SET(int64_t);
-CAFFE2_SPECIALIZED_SET(bool);
-CAFFE2_SPECIALIZED_SET(char);
-CAFFE2_SPECIALIZED_SET(uint8_t);
-CAFFE2_SPECIALIZED_SET(uint16_t);
-#undef CAFFE2_SPECIALIZED_SET
-
-#define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
-  template <> \
-  C10_EXPORT void ReduceMin<T, CPUContext>( \
-      const int N, \
-      const T* x, \
-      T* y, \
-      Tensor* /*scratch_ptr*/, \
-      CPUContext* /*context*/) { \
-    *y = ConstEigenVectorArrayMap<T>(x, N).minCoeff(); \
-  }
-CAFFE2_SPECIALIZED_REDUCEMIN(float)
-#undef CAFFE2_SPECIALIZED_REDUCEMIN
-
-#define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
-  template <> \
-  C10_EXPORT void ReduceMax<T, CPUContext>( \
-      const int N, \
-      const T* x, \
-      T* y, \
-      Tensor* /*scratch_ptr*/, \
-      CPUContext* /*context*/) { \
-    *y = ConstEigenVectorArrayMap<T>(x, N).maxCoeff(); \
-  }
-CAFFE2_SPECIALIZED_REDUCEMAX(float)
-CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
-CAFFE2_SPECIALIZED_REDUCEMAX(int64_t)
-
-#undef CAFFE2_SPECIALIZED_REDUCEMAX
-
-namespace {
-
-template <typename T>
-struct MinFunctor {
-  inline T operator()(const T a, const T b) const {
-    return std::min(a, b);
-  }
-};
-
-template <typename T>
-struct MaxFunctor {
-  inline T operator()(const T a, const T b) const {
-    return std::max(a, b);
-  }
-};
-
-template <typename T>
-struct L1NormFunctor {
-  inline T operator()(const T a, const T b) const {
-    return a + std::abs(b);
-  }
-};
-
-template <typename T>
-struct SquaredL2NormFunctor {
-  inline T operator()(const T a, const T b) const {
-    return a + b * b;
-  }
-};
-
-#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenOp) \
-  template <typename T> \
-  C10_EXPORT void Rowwise##Func( \
-      const int rows, const int cols, const T alpha, const T* X, T* Y) { \
-    EigenVectorMap<T>(Y, rows) = \
-        ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenOp() * alpha; \
-  }
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
-#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
-
-#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, EigenOp) \
-  template <typename T> \
-  C10_EXPORT void Colwise##Func( \
-      const int rows, const int cols, const T alpha, const T* X, T* Y) { \
-    EigenVectorMap<T>(Y, cols) = \
-        ConstEigenMatrixMap<T>(X, cols, rows).rowwise().EigenOp() * alpha; \
-  }
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, sum)
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMean, mean)
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL2, norm)
-#undef DELEGATE_COLWISE_REDUCE_FUNCTION
-
-template <typename T>
-C10_EXPORT void BothEndsReduceMin(
-    const int pre,
-    const int mid,
-    const int nxt,
-    const T alpha,
-    const T* X,
-    T* Y) {
-  EigenVectorArrayMap<T> Y_arr(Y, mid);
-  Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().minCoeff();
-  const T* X_ptr = X + mid * nxt;
-  // It seems there is some bug in eigen array::min so it cannot be implemented
-  // as ReduceSum below.
-  for (int i = 1; i < pre; ++i) {
-    for (int j = 0; j < mid; ++j) {
-      Y[j] = std::min(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).minCoeff());
-      X_ptr += nxt;
-    }
-  }
-  if (alpha != T(1)) {
-    Y_arr *= alpha;
-  }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceMax(
-    const int pre,
-    const int mid,
-    const int nxt,
-    const T alpha,
-    const T* X,
-    T* Y) {
-  EigenVectorArrayMap<T> Y_arr(Y, mid);
-  Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().maxCoeff();
-  const T* X_ptr = X + mid * nxt;
-  for (int i = 1; i < pre; ++i) {
-    for (int j = 0; j < mid; ++j) {
-      Y[j] = std::max(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).maxCoeff());
-      X_ptr += nxt;
-    }
-  }
-  if (alpha != T(1)) {
-    Y_arr *= alpha;
-  }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceSum(
-    const int pre,
-    const int mid,
-    const int nxt,
-    const T alpha,
-    const T* X,
-    T* Y) {
-  EigenVectorArrayMap<T> Y_arr(Y, mid);
-  Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().sum();
-  const int stride = mid * nxt;
-  const T* X_ptr = X + stride;
-  for (int i = 1; i < pre; ++i) {
-    Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().sum();
-    X_ptr += stride;
-  }
-  if (alpha != T(1)) {
-    Y_arr *= alpha;
-  }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceMean(
-    const int pre,
-    const int mid,
-    const int nxt,
-    const T alpha,
-    const T* X,
-    T* Y) {
-  EigenVectorArrayMap<T> Y_arr(Y, mid);
-  Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().mean();
-  const int stride = mid * nxt;
-  const T* X_ptr = X + stride;
-  for (int i = 1; i < pre; ++i) {
-    Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().mean();
-    X_ptr += stride;
-  }
-  if (alpha / static_cast<T>(pre) != 1) {
-    Y_arr *= alpha / static_cast<T>(pre);
-  }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceL1(
-    const int pre,
-    const int mid,
-    const int nxt,
-    const T alpha,
-    const T* X,
-    T* Y) {
-  EigenVectorArrayMap<T> Y_arr(Y, mid);
-  Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid)
-              .colwise()
-              .template lpNorm<1>()
-              .array();
-  const int stride = mid * nxt;
-  const T* X_ptr = X + stride;
-  for (int i = 1; i < pre; ++i) {
-    Y_arr += ConstEigenMatrixMap<T>(X_ptr, nxt, mid)
-                 .colwise()
-                 .template lpNorm<1>()
-                 .array();
-    X_ptr += stride;
-  }
-  if (alpha != T(1)) {
-    Y_arr *= alpha;
-  }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceL2(
-    const int pre,
-    const int mid,
-    const int nxt,
-    const T alpha,
-    const T* X,
-    T* Y) {
-  EigenVectorArrayMap<T> Y_arr(Y, mid);
-  Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid).colwise().squaredNorm().array();
-  const int stride = mid * nxt;
-  const T* X_ptr = X + stride;
-  for (int i = 1; i < pre; ++i) {
-    Y_arr +=
-        ConstEigenMatrixMap<T>(X_ptr, nxt, mid).colwise().squaredNorm().array();
-    X_ptr += stride;
-  }
-  Y_arr = Y_arr.sqrt() * alpha;
-}
-
-template <typename T, class Reducer>
-C10_EXPORT void ReduceTensor(
-    const int ndim,
-    const int* X_dims,
-    const int* Y_dims,
-    const Reducer& reducer,
-    const T init,
-    const T alpha,
-    const T* X,
-    T* Y,
-    CPUContext* context) {
-  const int X_size =
-      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
-  const int Y_size =
-      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
-  Set<T, CPUContext>(Y_size, init, Y, context);
-  std::vector<int> index(ndim, 0);
-  for (int X_index = 0; X_index < X_size; ++X_index) {
-    const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
-    Y[Y_index] = reducer(Y[Y_index], X[X_index]);
-    utils::IncreaseIndexInDims(ndim, X_dims, index.data());
-  }
-  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
-}
-
-} // namespace
-
-#define DELEGATE_REDUCE_FUNCTION(T, Func, reducer, init, is_norm) \
-  template <> \
-  C10_EXPORT void Func<T, CPUContext>( \
-      const int num_dims, \
-      const int* dims, \
-      const int num_axes, \
-      const int* axes, \
-      const T alpha, \
-      const T* X, \
-      T* Y, \
-      CPUContext* context) { \
-    CAFFE_ENFORCE_LE(num_axes, num_dims); \
-    std::vector<int> Y_dims_vector(dims, dims + num_dims); \
-    for (int i = 0; i < num_axes; ++i) { \
-      Y_dims_vector[axes[i]] = 1; \
-    } \
-    const int* X_dims = dims; \
-    const int* Y_dims = Y_dims_vector.data(); \
-    const int X_size = \
-        std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
-    const int Y_size = \
-        std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
-    if (X_size == 0) { \
-      Set<T, CPUContext>(Y_size, alpha * init, Y, context); \
-      return; \
-    } \
-    if (alpha == T(0)) { \
-      Set<T, CPUContext>(Y_size, 0, Y, context); \
-      return; \
-    } \
-    if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
-      if (is_norm) { \
-        Abs<T, CPUContext>(X_size, X, Y, context); \
-        Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
-      } else { \
-        Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
-      } \
-      return; \
-    } \
-    int rows; \
-    int cols; \
-    if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
-      Rowwise##Func(rows, cols, alpha, X, Y); \
-      return; \
-    } \
-    if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
-      Colwise##Func(rows, cols, alpha, X, Y); \
-      return; \
-    } \
-    int pre; \
-    int mid; \
-    int nxt; \
-    if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
-      BothEnds##Func(pre, mid, nxt, alpha, X, Y); \
-      return; \
-    } \
-    ReduceTensor( \
-        num_dims, X_dims, Y_dims, reducer, init, alpha, X, Y, context); \
-  }
-
-DELEGATE_REDUCE_FUNCTION(
-    float,
-    ReduceMin,
-    MinFunctor<float>(),
-    std::numeric_limits<float>::max(),
-    false)
-DELEGATE_REDUCE_FUNCTION(
-    double,
-    ReduceMin,
-    MinFunctor<double>(),
-    std::numeric_limits<double>::max(),
-    false)
-DELEGATE_REDUCE_FUNCTION(
-    std::int32_t,
-    ReduceMin,
-    MinFunctor<std::int32_t>(),
-    std::numeric_limits<std::int32_t>::max(),
-    false)
-DELEGATE_REDUCE_FUNCTION(
-    std::int64_t,
-    ReduceMin,
-    MinFunctor<std::int64_t>(),
-    std::numeric_limits<std::int64_t>::max(),
-    false)
-
-DELEGATE_REDUCE_FUNCTION(
-    float,
-    ReduceMax,
-    MaxFunctor<float>(),
-    std::numeric_limits<float>::lowest(),
-    false)
-DELEGATE_REDUCE_FUNCTION(
-    double,
-    ReduceMax,
-    MaxFunctor<double>(),
-    std::numeric_limits<double>::lowest(),
-    false)
-DELEGATE_REDUCE_FUNCTION(
-    std::int32_t,
-    ReduceMax,
-    MaxFunctor<std::int32_t>(),
-    std::numeric_limits<std::int32_t>::lowest(),
-    false)
-DELEGATE_REDUCE_FUNCTION(
-    std::int64_t,
-    ReduceMax,
-    MaxFunctor<std::int64_t>(),
-    std::numeric_limits<std::int64_t>::lowest(),
-    false)
-
-DELEGATE_REDUCE_FUNCTION(float, ReduceSum, std::plus<float>(), 0.0f, false)
-DELEGATE_REDUCE_FUNCTION(double, ReduceSum, std::plus<double>(), 0.0, false)
-DELEGATE_REDUCE_FUNCTION(
-    std::int32_t,
-    ReduceSum,
-    std::plus<std::int32_t>(),
-    0,
-    false)
-DELEGATE_REDUCE_FUNCTION(
-    std::int64_t,
-    ReduceSum,
-    std::plus<std::int64_t>(),
-    std::int64_t(0),
-    false)
-
-DELEGATE_REDUCE_FUNCTION(float, ReduceL1, L1NormFunctor<float>(), 0.0f, true)
-DELEGATE_REDUCE_FUNCTION(double, ReduceL1, L1NormFunctor<double>(), 0.0, true)
-DELEGATE_REDUCE_FUNCTION(
-    std::int32_t,
-    ReduceL1,
-    L1NormFunctor<std::int32_t>(),
-    0,
-    true)
-DELEGATE_REDUCE_FUNCTION(
-    std::int64_t,
-    ReduceL1,
-    L1NormFunctor<std::int64_t>(),
-    std::int64_t(0),
-    true)
-
-#undef DELEGATE_REDUCE_FUNCTION
-
-#define CAFFE2_SPECIALIZED_REDUCE_MEAN(T) \
-  template <> \
-  C10_EXPORT void ReduceMean<T, CPUContext>( \
-      const int num_dims, \
-      const int* dims, \
-      const int num_axes, \
-      const int* axes, \
-      const T alpha, \
-      const T* X, \
-      T* Y, \
-      CPUContext* context) { \
-    CAFFE_ENFORCE_LE(num_axes, num_dims); \
-    std::vector<int> Y_dims_vector(dims, dims + num_dims); \
-    for (int i = 0; i < num_axes; ++i) { \
-      Y_dims_vector[axes[i]] = 1; \
-    } \
-    const int* X_dims = dims; \
-    const int* Y_dims = Y_dims_vector.data(); \
-    const int X_size = \
-        std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
-    const int Y_size = \
-        std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
-    if (X_size == 0) { \
-      Set<T, CPUContext>(Y_size, 0, Y, context); \
-      return; \
-    } \
-    if (alpha == T(0)) { \
-      Set<T, CPUContext>(Y_size, 0, Y, context); \
-      return; \
-    } \
-    if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
-      Scale<T, T, CPUContext>(X_size, alpha, X, Y, context); \
-      return; \
-    } \
-    int rows; \
-    int cols; \
-    if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
-      RowwiseReduceMean(rows, cols, alpha, X, Y); \
-      return; \
-    } \
-    if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
-      ColwiseReduceMean(rows, cols, alpha, X, Y); \
-      return; \
-    } \
-    int pre; \
-    int mid; \
-    int nxt; \
-    if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
-      BothEndsReduceMean(pre, mid, nxt, alpha, X, Y); \
-      return; \
-    } \
-    const int scale = X_size / Y_size; \
-    ReduceTensor( \
-        num_dims, \
-        X_dims, \
-        Y_dims, \
-        std::plus<T>(), \
-        T(0), \
-        alpha / static_cast<T>(scale), \
-        X, \
-        Y, \
-        context); \
-  }
-CAFFE2_SPECIALIZED_REDUCE_MEAN(float)
-CAFFE2_SPECIALIZED_REDUCE_MEAN(double)
-#undef CAFFE2_SPECIALIZED_REDUCE_MEAN
-
-#define CAFFE2_SPECIALIZED_REDUCE_L2(T) \
-  template <> \
-  C10_EXPORT void ReduceL2<T, CPUContext>( \
-      const int num_dims, \
-      const int* dims, \
-      const int num_axes, \
-      const int* axes, \
-      const T alpha, \
-      const T* X, \
-      T* Y, \
-      CPUContext* context) { \
-    CAFFE_ENFORCE_LE(num_axes, num_dims); \
-    std::vector<int> Y_dims_vector(dims, dims + num_dims); \
-    for (int i = 0; i < num_axes; ++i) { \
-      Y_dims_vector[axes[i]] = 1; \
-    } \
-    const int* X_dims = dims; \
-    const int* Y_dims = Y_dims_vector.data(); \
-    const int X_size = \
-        std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
-    const int Y_size = \
-        std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
-    if (X_size == 0) { \
-      Set<T, CPUContext>(Y_size, 0, Y, context); \
-      return; \
-    } \
-    if (alpha == T(0)) { \
-      Set<T, CPUContext>(Y_size, 0, Y, context); \
-      return; \
-    } \
-    if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
-      Abs<T, CPUContext>(X_size, X, Y, context); \
-      Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
-      return; \
-    } \
-    int rows; \
-    int cols; \
-    if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
-      RowwiseReduceL2(rows, cols, alpha, X, Y); \
-      return; \
-    } \
-    if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
-      ColwiseReduceL2(rows, cols, alpha, X, Y); \
-      return; \
-    } \
-    int pre; \
-    int mid; \
-    int nxt; \
-    if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
-      BothEndsReduceL2(pre, mid, nxt, alpha, X, Y); \
-      return; \
-    } \
-    ReduceTensor( \
-        num_dims, \
-        X_dims, \
-        Y_dims, \
-        SquaredL2NormFunctor<T>(), \
-        T(0), \
-        T(1), \
-        X, \
-        Y, \
-        context); \
-    Sqrt<T, CPUContext>(Y_size, Y, Y, context); \
-    Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
-  }
-CAFFE2_SPECIALIZED_REDUCE_L2(float)
-CAFFE2_SPECIALIZED_REDUCE_L2(double)
-#undef CAFFE2_SPECIALIZED_REDUCE_L2
-
 namespace {
 
 template <typename T>
diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index f8ccda7..7314199 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -1482,7 +1482,7 @@ __global__ void SetKernel(const int N, const T alpha, T* Y) {
 #define CAFFE2_SPECIALIZED_CUDA_SET(T) \
   template <> \
   CAFFE2_CUDA_API void Set<T, CUDAContext>( \
-      const size_t N, const T alpha, T* Y, CUDAContext* context) { \
+      const int N, const T alpha, T* Y, CUDAContext* context) { \
     if (N == 0) { \
       return; \
     } \
@@ -1510,7 +1510,7 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t);
 
 template <>
 CAFFE2_CUDA_EXPORT void Set<at::Half, CUDAContext>(
-    const size_t N,
+    const int N,
     const at::Half alpha,
     at::Half* Y,
     CUDAContext* context) {
@@ -3356,27 +3356,19 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDAImpl(
 
 template <typename T, class Reducer>
 CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
     const Reducer& reducer,
     const T init,
     const T alpha,
     const T* X,
     T* Y,
     CUDAContext* context) {
-  CAFFE_ENFORCE_LE(num_axes, num_dims);
-  std::vector<int> Y_dims_vector(dims, dims + num_dims);
-  for (int i = 0; i < num_axes; ++i) {
-    Y_dims_vector[axes[i]] = 1;
-  }
-  const int* X_dims = dims;
-  const int* Y_dims = Y_dims_vector.data();
   const int X_size =
-      std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>());
+      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
   const int Y_size =
-      std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>());
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
   if (X_size == 0) {
     Set<T, CUDAContext>(Y_size, alpha * init, Y, context);
     return;
   }
@@ -3385,13 +3377,13 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
     Set<T, CUDAContext>(Y_size, T(0), Y, context);
     return;
   }
-  if (std::equal(X_dims, X_dims + num_dims, Y_dims)) {
+  if (std::equal(X_dims, X_dims + ndim, Y_dims)) {
     Scale<T, T, CUDAContext>(X_size, alpha, X, Y, context);
     return;
   }
   int rows;
   int cols;
-  if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
+  if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
     RowwiseReduceKernel<T>
         <<<std::min(rows, CAFFE_MAXIMUM_NUM_BLOCKS),
            CAFFE_CUDA_NUM_THREADS,
            0,
            context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
     return;
   }
-  if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
+  if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
     ColwiseReduceKernel<T>
         <<<std::min(cols, CAFFE_MAXIMUM_NUM_BLOCKS),
            CAFFE_CUDA_NUM_THREADS,
            0,
            context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
     return;
   }
-  std::vector<int> transpose_axes(num_dims);
-  utils::ComputeTransposeAxesForReduceOp(
-      num_dims, num_axes, axes, transpose_axes.data());
+  std::vector<int> axes(ndim);
+  utils::ComputeTransposeAxesForReduceOp(ndim, Y_dims, axes.data());
   const int outer_size = Y_size;
   const int inner_size = X_size / Y_size;
   DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
-      num_dims,
+      ndim,
       ReduceTensorCUDAImpl,
       T,
       Reducer,
       outer_size,
       inner_size,
-      dims,
-      transpose_axes.data(),
+      X_dims,
+      axes.data(),
       reducer,
       init,
       alpha,
@@ -3434,19 +3425,17 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
 #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \
   template <> \
   CAFFE2_CUDA_EXPORT void ReduceMin<T, CUDAContext>( \
-      const int num_dims, \
-      const int* dims, \
-      const int num_axes, \
-      const int* axes, \
+      const int ndim, \
+      const int* X_dims, \
+      const int* Y_dims, \
       const T alpha, \
       const T* X, \
       T* Y, \
       CUDAContext* context) { \
     ReduceTensorCUDA( \
-        num_dims, \
-        dims, \
-        num_axes, \
-        axes, \
+        ndim, \
+        X_dims, \
+        Y_dims, \
         cub::Min(), \
         std::numeric_limits<T>::max(), \
         alpha, \
@@ -3463,19 +3452,17 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(float)
 CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double)
 #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN
 
 #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \
   template <> \
   CAFFE2_CUDA_EXPORT void ReduceMax<T, CUDAContext>( \
-      const int num_dims, \
-      const int* dims, \
-      const int num_axes, \
-      const int* axes, \
+      const int ndim, \
+      const int* X_dims, \
+      const int* Y_dims, \
       const T alpha, \
       const T* X, \
       T* Y, \
       CUDAContext* context) { \
     ReduceTensorCUDA( \
-        num_dims, \
-        dims, \
-        num_axes, \
-        axes, \
+        ndim, \
+        X_dims, \
+        Y_dims, \
         cub::Max(), \
         std::numeric_limits<T>::lowest(), \
         alpha, \
@@ -3489,28 +3476,18 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(float)
 CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double)
 #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX
 
-#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \
-  template <> \
-  CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>( \
-      const int num_dims, \
-      const int* dims, \
-      const int num_axes, \
-      const int* axes, \
-      const T alpha, \
-      const T* X, \
-      T* Y, \
-      CUDAContext* context) { \
-    ReduceTensorCUDA( \
-        num_dims, \
-        dims, \
-        num_axes, \
-        axes, \
-        cub::Sum(), \
-        T(0), \
-        alpha, \
-        X, \
-        Y, \
-        context); \
+#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \
+  template <> \
+  CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>( \
+      const int ndim, \
+      const int* X_dims, \
+      const int* Y_dims, \
+      const T alpha, \
+      const T* X, \
+      T* Y, \
+      CUDAContext* context) { \
+    ReduceTensorCUDA( \
+        ndim, X_dims, Y_dims, cub::Sum(), T(0), alpha, X, Y, context); \
   }
 CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int32_t)
 CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int64_t)
@@ -3521,23 +3498,23 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double)
 
 #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \
   template <> \
   CAFFE2_CUDA_EXPORT void ReduceMean<T, CUDAContext>( \
-      const int num_dims, \
-      const int* dims, \
-      const int num_axes, \
-      const int* axes, \
+      const int ndim, \
+      const int* X_dims, \
+      const int* Y_dims, \
       const T alpha, \
       const T* X, \
       T* Y, \
       CUDAContext* context) { \
     int scale = 1; \
-    for (int i = 0; i < num_axes; ++i) { \
-      scale *= dims[axes[i]]; \
+    for (int i = 0; i < ndim; ++i) { \
+      if (Y_dims[i] == 1) { \
+        scale *= X_dims[i]; \
+      } \
     } \
     ReduceTensorCUDA( \
-        num_dims, \
-        dims, \
-        num_axes, \
-        axes, \
+        ndim, \
+        X_dims, \
+        Y_dims, \
         cub::Sum(), \
         T(0), \
         alpha / static_cast<T>(scale), \
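One subtlety in the CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN rewrite above: without an axis list, the mean divisor is recovered as the product of X_dims[i] over every position where Y_dims[i] == 1. A dimension that already has extent 1 contributes a factor of 1 either way, so this matches the old "scale *= dims[axes[i]]" loop. A small standalone check against expectation values from the tests deleted below; MeanScale is an illustrative name, not from the patch.

#include <cassert>
#include <cstddef>
#include <vector>

// Mean divisor under the new convention: multiply the extents of every
// dimension that collapses to 1 in the output.
int MeanScale(const std::vector<int>& X_dims, const std::vector<int>& Y_dims) {
  int scale = 1;
  for (std::size_t i = 0; i < X_dims.size(); ++i) {
    if (Y_dims[i] == 1) {
      scale *= X_dims[i];
    }
  }
  return scale;
}

int main() {
  // {2, 3} reduced over axis 0 -> Y_dims {1, 3}: divisor 2, matching the
  // deleted ReduceMean expectation {2.5, 3.5, 4.5} for inputs 1..6.
  assert(MeanScale({2, 3}, {1, 3}) == 2);
  // {2, 2, 2} reduced over axes {0, 2} -> Y_dims {1, 2, 1}: divisor 4,
  // matching the deleted expectation {3.5, 5.5} for inputs 1..8.
  assert(MeanScale({2, 2, 2}, {1, 2, 1}) == 4);
  return 0;
}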
diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc
index 99e9f69..ccaaf89 100644
--- a/caffe2/utils/math_gpu_test.cc
+++ b/caffe2/utils/math_gpu_test.cc
@@ -351,288 +351,6 @@ INSTANTIATE_TEST_CASE_P(
     GemmBatchedGPUTest,
     testing::Combine(testing::Bool(), testing::Bool()));
 
-class ReduceTensorGPUTest : public testing::Test {
- protected:
-  void SetUp() override {
-    if (!HasCudaGPU()) {
-      return;
-    }
-    option_.set_device_type(PROTO_CUDA);
-    cuda_context_ = make_unique<CUDAContext>(option_);
-    Blob* blob_x = ws_.CreateBlob("X");
-    Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = BlobGetMutableTensor(blob_x, CUDA);
-    Y_ = BlobGetMutableTensor(blob_y, CUDA);
-  }
-
-  void SetUpData(
-      const std::vector<int>& X_dims,
-      const std::vector<int>& axes,
-      const std::vector<float>& X_data) {
-    std::vector<int> Y_dims = X_dims;
-    for (const int axis : axes) {
-      Y_dims[axis] = 1;
-    }
-    X_->Resize(X_dims);
-    Y_->Resize(Y_dims);
-    ASSERT_EQ(X_data.size(), X_->numel());
-    cuda_context_->CopyFromCPU<float>(
-        X_data.size(), X_data.data(), X_->mutable_data<float>());
-  }
-
-  void VerifyResult(const std::vector<float>& expected_output) {
-    Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_);
-    ASSERT_EQ(expected_output.size(), Y_host->numel());
-    for (std::size_t i = 0; i < expected_output.size(); ++i) {
-      EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
-    }
-  }
-
-  template <class ReduceFunc>
-  void RunRedcueTensorTest(
-      const ReduceFunc& reduce_func,
-      const std::vector<int>& X_dims,
-      const std::vector<int>& axes,
-      const std::vector<float>& X_data,
-      const std::vector<float>& Y_data) {
-    SetUpData(X_dims, axes, X_data);
-    reduce_func(
-        X_dims.size(),
-        X_dims.data(),
-        axes.size(),
-        axes.data(),
-        1.0f,
-        X_->data<float>(),
-        Y_->mutable_data<float>(),
-        cuda_context_.get());
-    VerifyResult(Y_data);
-  }
-
-  Workspace ws_;
-  DeviceOption option_;
-  std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor* X_ = nullptr;
-  Tensor* Y_ = nullptr;
-};
-
-TEST_F(ReduceTensorGPUTest, ReduceMinGPUTest) {
-  if (!HasCudaGPU()) {
-    return;
-  }
-  const auto& reduce_min = [](const int num_dims,
-                              const int* dims,
-                              const int num_axes,
-                              const int* axes,
-                              const float alpha,
-                              const float* X,
-                              float* Y,
-                              CUDAContext* context) {
-    return math::ReduceMin<float, CUDAContext>(
-        num_dims, dims, num_axes, axes, alpha, X, Y, context);
-  };
-  // Test for 1D tensor.
-  RunRedcueTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});
-
-  // Test for 2D Tensor.
-  RunRedcueTensorTest(
-      reduce_min, {2, 3}, {1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f, 4.0f});
-  RunRedcueTensorTest(
-      reduce_min, {2, 3}, {0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f, 2.0f, 3.0f});
-  RunRedcueTensorTest(
-      reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});
-
-  // Test for 3D tensor.
-  RunRedcueTensorTest(
-      reduce_min, {2, 2, 2}, {1, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {1.0f, 5.0f});
-  RunRedcueTensorTest(
-      reduce_min, {2, 2, 2}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {1.0f, 2.0f});
-  RunRedcueTensorTest(
-      reduce_min, {2, 2, 2}, {0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {1.0f, 3.0f});
-}
-
-TEST_F(ReduceTensorGPUTest, ReduceMaxGPUTest) {
-  if (!HasCudaGPU()) {
-    return;
-  }
-  const auto& reduce_max = [](const int num_dims,
-                              const int* dims,
-                              const int num_axes,
-                              const int* axes,
-                              const float alpha,
-                              const float* X,
-                              float* Y,
-                              CUDAContext* context) {
-    return math::ReduceMax<float, CUDAContext>(
-        num_dims, dims, num_axes, axes, alpha, X, Y, context);
-  };
-  // Test for 1D tensor.
-  RunRedcueTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});
-
-  // Test for 2D Tensor.
-  RunRedcueTensorTest(
-      reduce_max, {2, 3}, {1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {3.0f, 6.0f});
-  RunRedcueTensorTest(
-      reduce_max, {2, 3}, {0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {4.0f, 5.0f, 6.0f});
-  RunRedcueTensorTest(
-      reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});
-
-  // Test for 3D tensor.
-  RunRedcueTensorTest(
-      reduce_max, {2, 2, 2}, {1, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {4.0f, 8.0f});
-  RunRedcueTensorTest(
-      reduce_max, {2, 2, 2}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {7.0f, 8.0f});
-  RunRedcueTensorTest(
-      reduce_max, {2, 2, 2}, {0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {6.0f, 8.0f});
-}
-
-TEST_F(ReduceTensorGPUTest, ReduceSumGPUTest) {
-  if (!HasCudaGPU()) {
-    return;
-  }
-  // Test for 1D tensor.
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CUDAContext>, {3}, {0}, {1.0f, 2.0f, 3.0f}, {6.0f});
-
-  // Test for 2D Tensor.
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CUDAContext>, {2, 3}, {1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f, 15.0f});
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CUDAContext>, {2, 3}, {0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {5.0f, 7.0f, 9.0f});
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CUDAContext>, {2, 3}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {21.0f});
-
-  // Test for 3D tensor.
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CUDAContext>, {2, 2, 2}, {1, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {10.0f, 26.0f});
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CUDAContext>, {2, 2, 2}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {16.0f, 20.0f});
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CUDAContext>, {2, 2, 2}, {0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {14.0f, 22.0f});
-}
-
-TEST_F(ReduceTensorGPUTest, ReduceMeanGPUTest) {
-  if (!HasCudaGPU()) {
-    return;
-  }
-  // Test for 1D tensor.
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CUDAContext>, {3}, {0}, {1.0f, 2.0f, 3.0f}, {2.0f});
-
-  // Test for 2D Tensor.
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CUDAContext>, {2, 3}, {1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {2.0f, 5.0f});
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CUDAContext>, {2, 3}, {0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {2.5f, 3.5f, 4.5f});
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CUDAContext>, {2, 3}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {3.5f});
-
-  // Test for 3D tensor.
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CUDAContext>, {2, 2, 2}, {1, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {2.5f, 6.5f});
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CUDAContext>, {2, 2, 2}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {4.0f, 5.0f});
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CUDAContext>, {2, 2, 2}, {0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {3.5f, 5.5f});
-}
-
 class BroadcastGPUTest : public testing::Test {
  protected:
   void SetUp() override {
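The reduce tests in math_gpu_test.cc above and math_test.cc below are deleted outright rather than rewritten in place; replacement coverage presumably lives alongside the relocated reduce implementations. For porting similar call sites, the only change is that the two axis arguments collapse into a single pointer to the output shape. A hypothetical port of one deleted 2D ReduceSum case to the new signature follows, assuming the post-patch math::ReduceSum<float, CPUContext> overload remains reachable through caffe2/utils/math.h; the test name is invented here and is not part of the patch.

#include <vector>

#include <gtest/gtest.h>

#include "caffe2/core/context.h"
#include "caffe2/utils/math.h"

// Hypothetical rewrite of the deleted {2, 3} axis-1 case: instead of passing
// (axes.size(), axes.data()), pass the output shape with axis 1 collapsed.
TEST(ReduceSumNewSignatureTest, TwoByThreeOverAxisOne) {
  caffe2::CPUContext context;
  const std::vector<int> X_dims = {2, 3};
  const std::vector<int> Y_dims = {2, 1}; // axis 1 reduced to extent 1
  const std::vector<float> X = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
  std::vector<float> Y(2, 0.0f);
  caffe2::math::ReduceSum<float, caffe2::CPUContext>(
      X_dims.size(),
      X_dims.data(),
      Y_dims.data(),
      1.0f,
      X.data(),
      Y.data(),
      &context);
  EXPECT_FLOAT_EQ(6.0f, Y[0]);  // 1 + 2 + 3, as in the deleted expectation
  EXPECT_FLOAT_EQ(15.0f, Y[1]); // 4 + 5 + 6
}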
diff --git a/caffe2/utils/math_test.cc b/caffe2/utils/math_test.cc
index c4fec53..0e30fda 100644
--- a/caffe2/utils/math_test.cc
+++ b/caffe2/utils/math_test.cc
@@ -426,253 +426,6 @@ TEST(MathTest, FloatToHalfConversion) {
 
 namespace {
 
-class ReduceTensorTest : public testing::Test {
- protected:
-  void SetUp() override {
-    cpu_context_ = make_unique<CPUContext>(option_);
-  }
-
-  template <class ReduceFunc>
-  void RunRedcueTensorTest(
-      const ReduceFunc& reduce_func,
-      const std::vector<int>& X_dims,
-      const std::vector<int>& axes,
-      const std::vector<float>& X_data,
-      const std::vector<float>& Y_data) {
-    std::vector<int> Y_dims = X_dims;
-    for (const int axis : axes) {
-      Y_dims[axis] = 1;
-    }
-    std::vector<std::int64_t> X_dims_64;
-    std::vector<std::int64_t> Y_dims_64;
-    std::copy(X_dims.cbegin(), X_dims.cend(), std::back_inserter(X_dims_64));
-    std::copy(Y_dims.cbegin(), Y_dims.cend(), std::back_inserter(Y_dims_64));
-    ReinitializeTensor(&X_, X_dims_64, at::dtype<float>().device(CPU));
-    ReinitializeTensor(&Y_, Y_dims_64, at::dtype<float>().device(CPU));
-    ASSERT_EQ(X_data.size(), X_.numel());
-    cpu_context_->CopyFromCPU<float>(
-        X_data.size(), X_data.data(), X_.mutable_data<float>());
-    reduce_func(
-        X_dims.size(),
-        X_dims.data(),
-        axes.size(),
-        axes.data(),
-        1.0f,
-        X_.data<float>(),
-        Y_.mutable_data<float>(),
-        cpu_context_.get());
-    ASSERT_EQ(Y_data.size(), Y_.numel());
-    for (int i = 0; i < Y_.numel(); ++i) {
-      EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
-    }
-  }
-
-  DeviceOption option_;
-  std::unique_ptr<CPUContext> cpu_context_;
-  Tensor X_;
-  Tensor Y_;
-};
-
-TEST_F(ReduceTensorTest, ReduceMinTest) {
-  const auto& reduce_min = [](const int num_dims,
-                              const int* dims,
-                              const int num_axes,
-                              const int* axes,
-                              const float alpha,
-                              const float* X,
-                              float* Y,
-                              CPUContext* context) {
-    return math::ReduceMin<float, CPUContext>(
-        num_dims, dims, num_axes, axes, alpha, X, Y, context);
-  };
-  // Test for 1D tensor.
-  RunRedcueTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});
-
-  // Test for 2D Tensor.
-  RunRedcueTensorTest(
-      reduce_min, {2, 3}, {1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f, 4.0f});
-  RunRedcueTensorTest(
-      reduce_min, {2, 3}, {0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f, 2.0f, 3.0f});
-  RunRedcueTensorTest(
-      reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});
-
-  // Test for 3D tensor.
-  RunRedcueTensorTest(
-      reduce_min, {2, 2, 2}, {1, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {1.0f, 5.0f});
-  RunRedcueTensorTest(
-      reduce_min, {2, 2, 2}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {1.0f, 2.0f});
-  RunRedcueTensorTest(
-      reduce_min, {2, 2, 2}, {0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {1.0f, 3.0f});
-}
-
-TEST_F(ReduceTensorTest, ReduceMaxTest) {
-  const auto& reduce_max = [](const int num_dims,
-                              const int* dims,
-                              const int num_axes,
-                              const int* axes,
-                              const float alpha,
-                              const float* X,
-                              float* Y,
-                              CPUContext* context) {
-    return math::ReduceMax<float, CPUContext>(
-        num_dims, dims, num_axes, axes, alpha, X, Y, context);
-  };
-  // Test for 1D tensor.
-  RunRedcueTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});
-
-  // Test for 2D Tensor.
-  RunRedcueTensorTest(
-      reduce_max, {2, 3}, {1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {3.0f, 6.0f});
-  RunRedcueTensorTest(
-      reduce_max, {2, 3}, {0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {4.0f, 5.0f, 6.0f});
-  RunRedcueTensorTest(
-      reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});
-
-  // Test for 3D tensor.
-  RunRedcueTensorTest(
-      reduce_max, {2, 2, 2}, {1, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {4.0f, 8.0f});
-  RunRedcueTensorTest(
-      reduce_max, {2, 2, 2}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {7.0f, 8.0f});
-  RunRedcueTensorTest(
-      reduce_max, {2, 2, 2}, {0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {6.0f, 8.0f});
-}
-
-TEST_F(ReduceTensorTest, ReduceSumTest) {
-  // Test for 1D tensor.
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CPUContext>, {3}, {0}, {1.0f, 2.0f, 3.0f}, {6.0f});
-
-  // Test for 2D Tensor.
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CPUContext>, {2, 3}, {1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f, 15.0f});
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CPUContext>, {2, 3}, {0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {5.0f, 7.0f, 9.0f});
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CPUContext>, {2, 3}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {21.0f});
-
-  // Test for 3D tensor.
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CPUContext>, {2, 2, 2}, {1, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {10.0f, 26.0f});
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CPUContext>, {2, 2, 2}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {16.0f, 20.0f});
-  RunRedcueTensorTest(
-      math::ReduceSum<float, CPUContext>, {2, 2, 2}, {0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {14.0f, 22.0f});
-}
-
-TEST_F(ReduceTensorTest, ReduceMeanTest) {
-  // Test for 1D tensor.
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CPUContext>, {3}, {0}, {1.0f, 2.0f, 3.0f}, {2.0f});
-
-  // Test for 2D Tensor.
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CPUContext>, {2, 3}, {1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {2.0f, 5.0f});
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CPUContext>, {2, 3}, {0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {2.5f, 3.5f, 4.5f});
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CPUContext>, {2, 3}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {3.5f});
-
-  // Test for 3D tensor.
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CPUContext>, {2, 2, 2}, {1, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {2.5f, 6.5f});
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CPUContext>, {2, 2, 2}, {0, 1},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {4.0f, 5.0f});
-  RunRedcueTensorTest(
-      math::ReduceMean<float, CPUContext>, {2, 2, 2}, {0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {3.5f, 5.5f});
-}
-
 class BroadcastTest : public testing::Test {
  protected:
   void SetUp() override {