namespace math {
template <>
void Set<at::Half, CPUContext>(
- const size_t /*N*/,
+ const int /*N*/,
const at::Half h,
at::Half* v,
CPUContext* c) {
const std::vector<int> C_dims =
elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
A_dims, B_dims);
- std::vector<int> A_axes;
- std::vector<int> B_axes;
- elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
- A_dims, B_dims, &A_axes, &B_axes);
+ std::vector<int> A_back_dims;
+ std::vector<int> B_back_dims;
+ elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
+ A_dims, B_dims, &A_back_dims, &B_back_dims);
math::ReduceSum(
C_dims.size(),
C_dims.data(),
- A_axes.size(),
- A_axes.data(),
+ A_back_dims.data(),
TGrad(1),
dC,
dA,
math::ReduceSum(
C_dims.size(),
C_dims.data(),
- B_axes.size(),
- B_axes.data(),
+ B_back_dims.data(),
TGrad(1),
dC,
dB,
std::reverse(B_axes->begin(), B_axes->end());
}
+// Brings A_dims and B_dims to a common rank. Each output vector has
+// max(A_dims.size(), B_dims.size()) entries: the input dims are copied into
+// the trailing positions and every remaining leading position is set to 1.
+void ComputeBinaryBroadcastBackwardDims(
+    const std::vector<int>& A_dims,
+    const std::vector<int>& B_dims,
+    std::vector<int>* A_back_dims,
+    std::vector<int>* B_back_dims) {
+  const int rank = std::max(A_dims.size(), B_dims.size());
+  // Start from all-ones shapes of the common rank.
+  for (std::vector<int>* padded : {A_back_dims, B_back_dims}) {
+    padded->assign(rank, 1);
+  }
+  // Copy back-to-front so the original dims end up right-aligned.
+  std::copy(A_dims.rbegin(), A_dims.rend(), A_back_dims->rbegin());
+  std::copy(B_dims.rbegin(), B_dims.rend(), B_back_dims->rbegin());
+}
+
} // namespace elementwise_ops_utils
} // namespace caffe2
std::vector<int>* A_axes,
std::vector<int>* B_axes);
+CAFFE2_API void ComputeBinaryBroadcastBackwardDims( // pads both dim lists to a common rank
+ const std::vector<int>& A_dims, // first input dims
+ const std::vector<int>& B_dims, // second input dims
+ std::vector<int>* A_back_dims, // out: A_dims right-aligned, leading entries set to 1
+ std::vector<int>* B_back_dims); // out: B_dims right-aligned, leading entries set to 1
+
} // namespace elementwise_ops_utils
} // namespace caffe2
const std::vector<int> C_dims =
elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
A_dims, B_dims);
- std::vector<int> A_axes;
- std::vector<int> B_axes;
- elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
- A_dims, B_dims, &A_axes, &B_axes);
+ std::vector<int> A_back_dims;
+ std::vector<int> B_back_dims;
+ elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
+ A_dims, B_dims, &A_back_dims, &B_back_dims);
math::ReduceSum(
C_dims.size(),
C_dims.data(),
- A_axes.size(),
- A_axes.data(),
+ A_back_dims.data(),
TGrad(1),
dC,
dA,
math::ReduceSum(
C_dims.size(),
C_dims.data(),
- B_axes.size(),
- B_axes.data(),
+ B_back_dims.data(),
TGrad(-1),
dC,
dB,
axes.push_back(i);
}
}
+ std::vector<int> X_dims = dY_dims;
+ for (const int axis : axes) {
+ X_dims[axis] = 1;
+ }
math::ReduceSum<T, Context>(
dY_dims.size(),
dY_dims.data(),
- axes.size(),
- axes.data(),
+ X_dims.data(),
T(1),
dY.template data<T>(),
dX->template mutable_data<T>(),
const float* X,
float* Y,
CPUContext* context) const {
- const std::array<int, 2> dims = {N * C, HxW};
- const int axis = 1;
+ const std::array<int, 2> X_dims = {N * C, HxW};
+ const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMean<float, CPUContext>(
- 2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+ 2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
const float* X,
float* Y,
CPUContext* context) const {
- const std::array<int, 2> dims = {N * C, HxW};
- const int axis = 1;
+ const std::array<int, 2> X_dims = {N * C, HxW};
+ const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMax<float, CPUContext>(
- 2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+ 2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
const float* X,
float* Y,
CUDAContext* context) const {
- const std::array<int, 2> dims = {N * C, HxW};
- const int axis = 1;
+ const std::array<int, 2> X_dims = {N * C, HxW};
+ const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMean<float, CUDAContext>(
- 2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+ 2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
const float* X,
float* Y,
CUDAContext* context) const {
- const std::array<int, 2> dims = {N * C, HxW};
- const int axis = 1;
+ const std::array<int, 2> X_dims = {N * C, HxW};
+ const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMax<float, CUDAContext>(
- 2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+ 2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
const float* X,
float* Y,
CUDAContext* context) const {
- const std::array<int, 3> dims = {N, HxW, C};
- const int axis = 1;
+ const std::array<int, 3> X_dims = {N, HxW, C};
+ const std::array<int, 3> Y_dims = {N, 1, C};
math::ReduceMax<float, CUDAContext>(
- 3, dims.data(), 1, &axis, 1.0f, X, Y, context);
+ 3, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
template <typename T>
bool DoRunWithType() {
const auto& X = Input(0);
-
const int ndim = X.dim();
+ const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
if (axes_.empty()) {
axes_.resize(ndim);
std::iota(axes_.begin(), axes_.end(), 0);
} else {
- for (auto& axis: axes_) {
+ for (auto& axis : axes_) {
axis = X.canonical_axis_index(axis);
}
std::sort(axes_.begin(), axes_.end());
ndim,
"Axes ids must be smaller than the dimensions of input.");
}
- const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
- std::vector<int64_t> Y_dims;
- Y_dims.reserve(ndim);
+ std::vector<int64_t> output_dims;
+ output_dims.reserve(ndim);
std::size_t cur_axis = 0;
for (int i = 0; i < ndim; ++i) {
if (cur_axis < axes_.size() && i == axes_[cur_axis]) {
if (keep_dims_) {
- Y_dims.push_back(1);
+ output_dims.push_back(1);
}
++cur_axis;
} else {
- Y_dims.push_back(X_dims[i]);
+ output_dims.push_back(X_dims[i]);
}
}
- auto* Y = Output(0, Y_dims, at::dtype<T>());
+ auto* Y = Output(0, output_dims, at::dtype<T>());
+
+ std::vector<int> Y_dims = X_dims;
+ for (const int axis : axes_) {
+ Y_dims[axis] = 1;
+ }
+
return reducer_.template Forward<T>(
X_dims,
- axes_,
+ Y_dims,
X.template data<T>(),
Y->template mutable_data<T>(),
&context_);
private:
std::vector<int> axes_;
const int keep_dims_;
- Reducer reducer_{};
+ const Reducer reducer_{};
};
template <typename InputTypes, class Context, class Reducer>
axes_.resize(ndim);
std::iota(axes_.begin(), axes_.end(), 0);
} else {
- for (auto& axis: axes_) {
+ for (auto& axis : axes_) {
axis = X.canonical_axis_index(axis);
}
std::sort(axes_.begin(), axes_.end());
private:
std::vector<int> axes_;
- Reducer reducer_{};
+ const Reducer reducer_{};
};
template <class Context>
struct MinReducer {
template <typename T>
bool Forward(
- const std::vector<int>& dims,
- const std::vector<int>& axes,
+ const std::vector<int>& X_dims,
+ const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceMin<T, Context>(
- dims.size(),
- dims.data(),
- axes.size(),
- axes.data(),
+ X_dims.size(),
+ X_dims.data(),
+ Y_dims.data(),
T(1),
X_data,
Y_data,
struct MaxReducer {
template <typename T>
bool Forward(
- const std::vector<int>& dims,
- const std::vector<int>& axes,
+ const std::vector<int>& X_dims,
+ const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceMax<T, Context>(
- dims.size(),
- dims.data(),
- axes.size(),
- axes.data(),
+ X_dims.size(),
+ X_dims.data(),
+ Y_dims.data(),
T(1),
X_data,
Y_data,
struct SumReducer {
template <typename T>
bool Forward(
- const std::vector<int>& dims,
- const std::vector<int>& axes,
+ const std::vector<int>& X_dims,
+ const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceSum<T, Context>(
- dims.size(),
- dims.data(),
- axes.size(),
- axes.data(),
+ X_dims.size(),
+ X_dims.data(),
+ Y_dims.data(),
T(1),
X_data,
Y_data,
struct MeanReducer {
template <typename T>
bool Forward(
- const std::vector<int>& dims,
- const std::vector<int>& axes,
+ const std::vector<int>& X_dims,
+ const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceMean<T, Context>(
- dims.size(),
- dims.data(),
- axes.size(),
- axes.data(),
+ X_dims.size(),
+ X_dims.data(),
+ Y_dims.data(),
T(1),
X_data,
Y_data,
struct L1Reducer {
template <typename T>
bool Forward(
- const std::vector<int>& dims,
- const std::vector<int>& axes,
+ const std::vector<int>& X_dims,
+ const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceL1<T, Context>(
- dims.size(),
- dims.data(),
- axes.size(),
- axes.data(),
+ X_dims.size(),
+ X_dims.data(),
+ Y_dims.data(),
T(1),
X_data,
Y_data,
struct L2Reducer {
template <typename T>
bool Forward(
- const std::vector<int>& dims,
- const std::vector<int>& axes,
+ const std::vector<int>& X_dims,
+ const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceL2<T, Context>(
- dims.size(),
- dims.data(),
- axes.size(),
- axes.data(),
+ X_dims.size(),
+ X_dims.data(),
+ Y_dims.data(),
T(1),
X_data,
Y_data,
namespace math {
-#define C10_DECLARE_COMPARE_OP(Comp) \
- template <typename T, class Context, bool kBroadcast1st = false> \
- void Rowwise##Comp( \
- const int rows, \
- const int cols, \
- const T* A, \
- const T* B, \
- bool* C, \
- Context* context); \
- \
- template <typename T, class Context, bool kBroadcast1st = false> \
- void Colwise##Comp( \
- const int rows, \
- const int cols, \
- const T* A, \
- const T* B, \
- bool* C, \
- Context* context); \
- \
- template <typename T, class Context> \
- void Comp( \
- const int A_ndim, \
- const int* A_dims, \
- const int B_ndim, \
- const int* B_dims, \
- const T* A, \
- const T* B, \
- bool* C, \
+#define C10_DECLARE_COMPARE_OP(Comp) \
+ template <typename T, class Context, bool kBroadcast1st = false> \
+ void Rowwise##Comp( \
+ const int rows, \
+ const int cols, \
+ const T* A, \
+ const T* B, \
+ bool* C, \
+ Context* context); \
+ \
+ template <typename T, class Context, bool kBroadcast1st = false> \
+ void Colwise##Comp( \
+ const int rows, \
+ const int cols, \
+ const T* A, \
+ const T* B, \
+ bool* C, \
+ Context* context); \
+ \
+ template <typename T, class Context> \
+ void Comp( \
+ const int A_ndim, \
+ const int* A_dims, \
+ const int B_ndim, \
+ const int* B_dims, \
+ const T* A, \
+ const T* B, \
+ bool* C, \
Context* context);
C10_DECLARE_COMPARE_OP(EQ)
#undef C10_DECLARE_BINARY_OP
-template <typename T, class Context>
-CAFFE2_API void
-ReduceMin(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void
-ReduceMax(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceMin(
- const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const T alpha,
- const T* X,
- T* Y,
- Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceMax(
- const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const T alpha,
- const T* X,
- T* Y,
- Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceSum(
- const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const T alpha,
- const T* X,
- T* Y,
- Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceMean(
- const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const T alpha,
- const T* X,
- T* Y,
- Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceL1(
- const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const T alpha,
- const T* X,
- T* Y,
- Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceL2(
- const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const T alpha,
- const T* X,
- T* Y,
- Context* context);
-
// Broadcasts X with X_dims to Y with Y_dims.
template <typename T, class Context>
CAFFE2_API void Broadcast(
TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
template <typename T, class Context>
-CAFFE2_API void Set(const size_t N, const T alpha, T* X, Context* context);
-
-template <typename T, class Context>
CAFFE2_API void
RandUniform(const size_t n, const T a, const T b, T* r, Context* context);
T* y,
Context* context);
-template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Scale(
- const int N,
- const TAlpha alpha,
- const TData* x,
- TData* y,
- Context* context);
-
-// Different from the Scale function above, if alpha is passed in
-// as a pointer, we will assume that it lives on the Context device,
-// for example on GPU.
-template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Scale(
- const int N,
- const TAlpha* alpha,
- const TData* x,
- TData* y,
- Context* context);
-
template <typename T, class Context>
CAFFE2_API void
Axpy(const int N, const float alpha, const T* x, T* y, Context* context);
#include <algorithm>
#include <functional>
+#ifdef CAFFE2_USE_ACCELERATE
+#include <Accelerate/Accelerate.h>
+#endif // CAFFE2_USE_ACCELERATE
+
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Erf, vdErf)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
-#define DELEGATE_SINCOS_FUNCTION(T, MKLFunc) \
+#define DELEGATE_SINCOS(T, MKLFunc) \
template <> \
C10_EXPORT void SinCos<T, CPUContext>( \
const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \
MKLFunc(N, X, S, C); \
}
-DELEGATE_SINCOS_FUNCTION(float, vsSinCos)
-DELEGATE_SINCOS_FUNCTION(double, vdSinCos)
-#undef DELEGATE_SINCOS_FUNCTION
+DELEGATE_SINCOS(float, vsSinCos)
+DELEGATE_SINCOS(double, vdSinCos)
+#undef DELEGATE_SINCOS
-#define DELEGATE_POWX_FUNCTION(T, MKLFunc) \
+#define DELEGATE_POWX(T, MKLFunc) \
template <> \
C10_EXPORT void Powx<T, CPUContext>( \
const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \
MKLFunc(N, A, b, Y); \
}
-DELEGATE_POWX_FUNCTION(float, vsPowx)
-DELEGATE_POWX_FUNCTION(double, vdPowx)
-#undef DELEGATE_POWX_FUNCTION
+DELEGATE_POWX(float, vsPowx)
+DELEGATE_POWX(double, vdPowx)
+#undef DELEGATE_POWX
#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, MKLFunc) \
template <> \
#endif // CAFFE2_USE_MKL
+////////////////////////////////////////////////////////////////////////////////
+// BLAS alternatives.
+// Depending on whether we have specified an external BLAS library or not, we
+// will delegate the Caffe math functions that are BLAS-related to either the
+// CBLAS call or the Eigen implementation.
+////////////////////////////////////////////////////////////////////////////////
+#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
+
+// Eigen-backed CPU Scale, compiled when CAFFE2_USE_EIGEN_FOR_BLAS is defined.
+// Both overloads compute Y[i] = X[i] * alpha for i in [0, N) after casting
+// alpha to TData; when X and Y are the same pointer the update happens in
+// place. The second overload reads the scalar through the alpha pointer.
+#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    const TData factor = static_cast<TData>(alpha); \
+    EigenVectorArrayMap<TData> Y_arr(Y, N); \
+    if (X != Y) { \
+      Y_arr = ConstEigenVectorArrayMap<TData>(X, N) * factor; \
+    } else { \
+      Y_arr *= factor; \
+    } \
+  } \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha* alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    const TData factor = static_cast<TData>(*alpha); \
+    EigenVectorArrayMap<TData> Y_arr(Y, N); \
+    if (X != Y) { \
+      Y_arr = ConstEigenVectorArrayMap<TData>(X, N) * factor; \
+    } else { \
+      Y_arr *= factor; \
+    } \
+  }
+CAFFE2_SPECIALIZED_SCALE(float, float)
+CAFFE2_SPECIALIZED_SCALE(double, double)
+CAFFE2_SPECIALIZED_SCALE(float, double)
+#undef CAFFE2_SPECIALIZED_SCALE
+
+#else // CAFFE2_USE_EIGEN_FOR_BLAS
+
+#ifdef CAFFE2_USE_MKL
+
+// MKL-backed CPU Scale. When Y aliases X, MKLFunc1 is called as
+// (N, alpha, Y, 1); otherwise MKLFunc2 is called as
+// (N, alpha, X, 1, TData(0), Y, 1). alpha is cast to TData first; the second
+// overload reads it through the alpha pointer.
+#define DELEGATE_SCALE(TAlpha, TData, MKLFunc1, MKLFunc2) \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    const TData factor = static_cast<TData>(alpha); \
+    if (X != Y) { \
+      MKLFunc2(N, factor, X, 1, TData(0), Y, 1); \
+    } else { \
+      MKLFunc1(N, factor, Y, 1); \
+    } \
+  } \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha* alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    const TData factor = static_cast<TData>(*alpha); \
+    if (X != Y) { \
+      MKLFunc2(N, factor, X, 1, TData(0), Y, 1); \
+    } else { \
+      MKLFunc1(N, factor, Y, 1); \
+    } \
+  }
+DELEGATE_SCALE(float, float, cblas_sscal, cblas_saxpby)
+DELEGATE_SCALE(double, double, cblas_dscal, cblas_daxpby)
+DELEGATE_SCALE(float, double, cblas_dscal, cblas_daxpby)
+#undef DELEGATE_SCALE
+
+#else // CAFFE2_USE_MKL
+
+// CBLAS-backed CPU Scale for builds without MKL. The in-place case (Y == X)
+// calls BLASFunc as (N, alpha, Y, 1); the out-of-place case is written with
+// Eigen maps as Y = X * alpha. alpha is cast to TData first; the second
+// overload reads it through the alpha pointer.
+#define DELEGATE_SCALE(TAlpha, TData, BLASFunc) \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    const TData factor = static_cast<TData>(alpha); \
+    if (X != Y) { \
+      EigenVectorArrayMap<TData>(Y, N) = \
+          ConstEigenVectorArrayMap<TData>(X, N) * factor; \
+    } else { \
+      BLASFunc(N, factor, Y, 1); \
+    } \
+  } \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha* alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    const TData factor = static_cast<TData>(*alpha); \
+    if (X != Y) { \
+      EigenVectorArrayMap<TData>(Y, N) = \
+          ConstEigenVectorArrayMap<TData>(X, N) * factor; \
+    } else { \
+      BLASFunc(N, factor, Y, 1); \
+    } \
+  }
+DELEGATE_SCALE(float, float, cblas_sscal)
+DELEGATE_SCALE(double, double, cblas_dscal)
+DELEGATE_SCALE(float, double, cblas_dscal)
+#undef DELEGATE_SCALE
+
+#endif // CAFFE2_USE_MKL
+
+#endif // CAFFE2_USE_EIGEN_FOR_BLAS
+
+////////////////////////////////////////////////////////////////////////////////
+// Common math functions being used in Caffe that do not have a BLAS or MKL
+// equivalent. For all these functions, we will simply implement them either via
+// Eigen or via custom code.
+////////////////////////////////////////////////////////////////////////////////
+
+// CPU Set: writes alpha into Y[0..N). N == 0 is a no-op. A value that compares
+// equal to T(0) is written with memset; any other value goes through Eigen's
+// setConstant.
+#define CAFFE2_SPECIALIZED_SET(T) \
+  template <> \
+  C10_EXPORT void Set<T, CPUContext>( \
+      const int N, const T alpha, T* Y, CPUContext* /* context */) { \
+    if (N == 0) { \
+      return; \
+    } \
+    if (alpha != T(0)) { \
+      EigenVectorArrayMap<T>(Y, N).setConstant(alpha); \
+      return; \
+    } \
+    std::memset(Y, 0, N * sizeof(T)); \
+  }
+CAFFE2_SPECIALIZED_SET(float)
+CAFFE2_SPECIALIZED_SET(double)
+CAFFE2_SPECIALIZED_SET(int)
+CAFFE2_SPECIALIZED_SET(std::int8_t)
+CAFFE2_SPECIALIZED_SET(std::int16_t)
+CAFFE2_SPECIALIZED_SET(std::int64_t)
+CAFFE2_SPECIALIZED_SET(bool)
+CAFFE2_SPECIALIZED_SET(char)
+CAFFE2_SPECIALIZED_SET(std::uint8_t)
+CAFFE2_SPECIALIZED_SET(std::uint16_t)
+#undef CAFFE2_SPECIALIZED_SET
+
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
CAFFE2_SPECIALIZED_NEG(double)
#undef CAFFE2_SPECIALIZED_NEG
+// Eigen-backed CPU Scale for the integer element types listed below. Both
+// overloads compute Y[i] = X[i] * alpha for i in [0, N) after casting alpha
+// to TData; X == Y is handled as an in-place multiply. The second overload
+// reads the scalar through the alpha pointer.
+#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    const TData factor = static_cast<TData>(alpha); \
+    EigenVectorArrayMap<TData> Y_arr(Y, N); \
+    if (X != Y) { \
+      Y_arr = ConstEigenVectorArrayMap<TData>(X, N) * factor; \
+    } else { \
+      Y_arr *= factor; \
+    } \
+  } \
+  template <> \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
+      const int N, \
+      const TAlpha* alpha, \
+      const TData* X, \
+      TData* Y, \
+      CPUContext* /* context */) { \
+    const TData factor = static_cast<TData>(*alpha); \
+    EigenVectorArrayMap<TData> Y_arr(Y, N); \
+    if (X != Y) { \
+      Y_arr = ConstEigenVectorArrayMap<TData>(X, N) * factor; \
+    } else { \
+      Y_arr *= factor; \
+    } \
+  }
+CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
+CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
+#undef CAFFE2_SPECIALIZED_SCALE
+
#define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N).EigenFunc( \
ConstEigenVectorArrayMap<T>(B, N)); \
}
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Min, min)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Min, min)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Max, max)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Max, max)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Max, max)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Max, max)
#undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION
CAFFE2_API void Erf(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
+CAFFE2_API void Set(int N, T alpha, T* X, Context* context); // writes alpha to X[0..N)
+/* Scales N elements: Y[i] = alpha * X[i]. The CPU specializations cast alpha
+ * to TData and treat X == Y as an in-place update. */
+template <typename TAlpha, typename TData, class Context>
+CAFFE2_API void
+Scale(int N, TAlpha alpha, const TData* X, TData* Y, Context* context);
+
+// Pointer-alpha overload: unlike the by-value Scale above, alpha is assumed to
+// live on the Context device (for example, in GPU memory).
+template <typename TAlpha, typename TData, class Context>
+CAFFE2_API void
+Scale(int N, const TAlpha* alpha, const TData* X, TData* Y, Context* context);
+
+template <typename T, class Context>
CAFFE2_API void Add(int N, const T* A, const T* B, T* C, Context* context);
template <typename T, class Context>
CAFFE2_API void Sub(int N, const T* A, const T* B, T* C, Context* context);
#include <numeric>
#include <vector>
+#ifdef CAFFE2_USE_ACCELERATE
+#include <Accelerate/Accelerate.h>
+#endif // CAFFE2_USE_ACCELERATE
+
+#ifdef CAFFE2_USE_MKL
+#include <mkl.h>
+#endif // CAFFE2_USE_MKL
+
#include "caffe2/core/context.h"
#include "caffe2/utils/eigen_utils.h"
+#include "caffe2/utils/math/elementwise.h"
#include "caffe2/utils/math/utils.h"
namespace caffe2 {
namespace {
+// Generates Rowwise<Func><T>: X is mapped as a cols x rows Eigen matrix and
+// Y[i] (i in [0, rows)) receives EigenFunc applied to column i of that map,
+// multiplied by alpha.
+#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenFunc) \
+  template <typename T> \
+  void Rowwise##Func( \
+      const int rows, \
+      const int cols, \
+      const T alpha, \
+      const T* X, \
+      T* Y, \
+      CPUContext* /* context */) { \
+    ConstEigenMatrixMap<T> X_mat(X, cols, rows); \
+    EigenVectorMap<T>(Y, rows) = X_mat.colwise().EigenFunc() * alpha; \
+  }
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
+#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
+
+#ifndef CAFFE2_USE_EIGEN_FOR_BLAS
+
+// BLAS specializations of Rowwise<Func> for a concrete T: for each i in
+// [0, rows), Y[i] = BLASFunc(cols, X + i * cols, 1) * alpha, i.e. one
+// unit-stride BLAS call per run of cols elements.
+#define DELEGATE_ROWWISE_REDUCE_FUNCTION(T, Func, BLASFunc) \
+  template <> \
+  void Rowwise##Func( \
+      const int rows, \
+      const int cols, \
+      const T alpha, \
+      const T* X, \
+      T* Y, \
+      CPUContext* /* context */) { \
+    int row = 0; \
+    while (row < rows) { \
+      const T* X_row = X + row * cols; \
+      Y[row] = BLASFunc(cols, X_row, 1) * alpha; \
+      ++row; \
+    } \
+  }
+DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL1, cblas_sasum)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL1, cblas_dasum)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL2, cblas_snrm2)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL2, cblas_dnrm2)
+#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
+
+#endif // CAFFE2_USE_EIGEN_FOR_BLAS
+
+// Generates Colwise<Func><T>: Y is seeded with the first cols elements of X,
+// every following run of cols elements (rows - 1 of them) is folded into Y
+// with MathFunc<T, CPUContext>, and Y is finally scaled by alpha in place.
+#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, MathFunc) \
+  template <typename T> \
+  void Colwise##Func( \
+      const int rows, \
+      const int cols, \
+      const T alpha, \
+      const T* X, \
+      T* Y, \
+      CPUContext* context) { \
+    std::memcpy(Y, X, sizeof(T) * cols); \
+    int row = 1; \
+    while (row < rows) { \
+      const T* X_row = X + row * cols; \
+      MathFunc<T, CPUContext>(cols, Y, X_row, Y, context); \
+      ++row; \
+    } \
+    Scale<T, T, CPUContext>(cols, alpha, Y, Y, context); \
+  }
+DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, Min)
+DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, Max)
+DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, Add)
+#undef DELEGATE_COLWISE_REDUCE_FUNCTION
+
+// Column-wise mean, expressed through ColwiseReduceSum by passing
+// alpha / rows as the scale factor instead of alpha.
+template <typename T>
+void ColwiseReduceMean(
+    const int rows,
+    const int cols,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  const T mean_scale = alpha / static_cast<T>(rows);
+  ColwiseReduceSum<T>(rows, cols, mean_scale, X, Y, context);
+}
+
+// Column-wise L1 reduction: X is mapped as a cols x rows Eigen array, Y (cols
+// entries) accumulates the element-wise absolute value of each of the rows
+// columns of that map, and the result is scaled by alpha in place.
+template <typename T>
+void ColwiseReduceL1(
+    const int rows,
+    const int cols,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> acc(Y, cols);
+  ConstEigenArrayMap<T> input(X, cols, rows);
+  // Seed with the first column so Y never has to be zero-initialised.
+  acc = input.col(0).abs();
+  int c = 1;
+  while (c < rows) {
+    acc += input.col(c).abs();
+    ++c;
+  }
+  Scale<T, T, CPUContext>(cols, alpha, Y, Y, context);
+}
+
+// Column-wise L2 reduction: X is mapped as a cols x rows Eigen array, Y (cols
+// entries) accumulates the element-wise square of each of the rows columns of
+// that map, then Y = sqrt(Y) * alpha.
+template <typename T>
+void ColwiseReduceL2(
+    const int rows,
+    const int cols,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* /* context */) {
+  EigenVectorArrayMap<T> acc(Y, cols);
+  ConstEigenArrayMap<T> input(X, cols, rows);
+  // Seed with the first column so Y never has to be zero-initialised.
+  acc = input.col(0).square();
+  int c = 1;
+  while (c < rows) {
+    acc += input.col(c).square();
+    ++c;
+  }
+  acc = acc.sqrt() * alpha;
+}
+
+// Min over the outer (M) and inner (K) extents. X is read as M consecutive
+// blocks of N * K elements, each mapped as a K x N Eigen array; Y[j] ends up
+// as the smallest coefficient of column j across all blocks, scaled by alpha.
+template <typename T>
+void BothEndsReduceMin(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> out(Y, N);
+  // The first block initialises Y.
+  out = ConstEigenArrayMap<T>(X, K, N).colwise().minCoeff();
+  for (int m = 1; m < M; ++m) {
+    ConstEigenArrayMap<T> block(X + m * N * K, K, N);
+    for (int n = 0; n < N; ++n) {
+      const T block_min = block.col(n).minCoeff();
+      Y[n] = std::min(Y[n], block_min);
+    }
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+// Max over the outer (M) and inner (K) extents. X is read as M consecutive
+// blocks of N * K elements, each mapped as a K x N Eigen array; Y[j] ends up
+// as the largest coefficient of column j across all blocks, scaled by alpha.
+template <typename T>
+void BothEndsReduceMax(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> out(Y, N);
+  // The first block initialises Y.
+  out = ConstEigenArrayMap<T>(X, K, N).colwise().maxCoeff();
+  for (int m = 1; m < M; ++m) {
+    ConstEigenArrayMap<T> block(X + m * N * K, K, N);
+    for (int n = 0; n < N; ++n) {
+      const T block_max = block.col(n).maxCoeff();
+      Y[n] = std::max(Y[n], block_max);
+    }
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+// Sum over the outer (M) and inner (K) extents. X is read as M consecutive
+// blocks of N * K elements, each mapped as a K x N Eigen array; Y[j] is the
+// sum of column j over all blocks, scaled by alpha.
+template <typename T>
+void BothEndsReduceSum(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> out(Y, N);
+  // The first block initialises Y; later blocks accumulate into it.
+  out = ConstEigenArrayMap<T>(X, K, N).colwise().sum();
+  int m = 1;
+  while (m < M) {
+    ConstEigenArrayMap<T> block(X + m * N * K, K, N);
+    out += block.colwise().sum();
+    ++m;
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+// Mean over the outer (M) and inner (K) extents. X is read as M consecutive
+// blocks of N * K elements, each mapped as a K x N Eigen array; the per-block
+// column means are summed into Y and the total is scaled by alpha / M.
+template <typename T>
+void BothEndsReduceMean(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> out(Y, N);
+  // The first block initialises Y; later blocks accumulate into it.
+  out = ConstEigenArrayMap<T>(X, K, N).colwise().mean();
+  int m = 1;
+  while (m < M) {
+    ConstEigenArrayMap<T> block(X + m * N * K, K, N);
+    out += block.colwise().mean();
+    ++m;
+  }
+  const T mean_scale = alpha / static_cast<T>(M);
+  Scale<T, T, CPUContext>(N, mean_scale, Y, Y, context);
+}
+
+// L1 reduction over the outer (M) and inner (K) extents. X is read as M
+// consecutive blocks of N * K elements, each mapped as a K x N Eigen matrix;
+// the per-block column lpNorm<1> values are summed into Y and scaled by alpha.
+template <typename T>
+void BothEndsReduceL1(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorMap<T> out(Y, N);
+  // The first block initialises Y; later blocks accumulate into it.
+  out = ConstEigenMatrixMap<T>(X, K, N).colwise().template lpNorm<1>();
+  int m = 1;
+  while (m < M) {
+    ConstEigenMatrixMap<T> block(X + m * N * K, K, N);
+    out += block.colwise().template lpNorm<1>();
+    ++m;
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+// L2 reduction over the outer (M) and inner (K) extents. X is read as M
+// consecutive blocks of N * K elements, each mapped as a K x N Eigen matrix;
+// the per-block column squared norms are summed into Y, then
+// Y = sqrt(Y) * alpha.
+template <typename T>
+void BothEndsReduceL2(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* /* context */) {
+  EigenVectorMap<T> out(Y, N);
+  // The first block initialises Y; later blocks accumulate into it.
+  out = ConstEigenMatrixMap<T>(X, K, N).colwise().squaredNorm();
+  int m = 1;
+  while (m < M) {
+    ConstEigenMatrixMap<T> block(X + m * N * K, K, N);
+    out += block.colwise().squaredNorm();
+    ++m;
+  }
+  out = out.cwiseSqrt() * alpha;
+}
+
+// Generic N-d reduction fallback. Y (product of Y_dims elements) is filled
+// with init, then every element of X is folded into one slot of Y via
+// reducer(current, x). A running ndim-long index is advanced over X_dims
+// after each element and mapped to a Y offset with utils::GetIndexFromDims.
+// NOTE(review): presumably Y dims of size 1 are the reduced axes - confirm
+// against utils::GetIndexFromDims.
+template <typename T, class Reducer>
+void ReduceTensorImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const Reducer& reducer,
+    const T init,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  const auto num_elements = [ndim](const int* dims) {
+    return std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());
+  };
+  const int X_size = num_elements(X_dims);
+  const int Y_size = num_elements(Y_dims);
+  Set<T, CPUContext>(Y_size, init, Y, context);
+  std::vector<int> coord(ndim, 0);
+  for (int X_index = 0; X_index < X_size; ++X_index) {
+    T& slot = Y[utils::GetIndexFromDims(ndim, Y_dims, coord.data())];
+    slot = reducer(slot, X[X_index]);
+    utils::IncreaseIndexInDims(ndim, X_dims, coord.data());
+  }
+}
+
+// N-d min reduction: runs ReduceTensorImpl with std::min, starting from
+// numeric_limits<T>::max(), then scales the product-of-Y_dims outputs by
+// alpha in place.
+template <typename T>
+void ReduceMinImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  const auto take_min = [](const T a, const T b) { return std::min(a, b); };
+  const T start = std::numeric_limits<T>::max();
+  ReduceTensorImpl(ndim, X_dims, Y_dims, take_min, start, X, Y, context);
+  const int out_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(out_size, alpha, Y, Y, context);
+}
+
+// N-d max reduction: runs ReduceTensorImpl with std::max, starting from
+// numeric_limits<T>::lowest(), then scales the product-of-Y_dims outputs by
+// alpha in place.
+template <typename T>
+void ReduceMaxImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  const auto take_max = [](const T a, const T b) { return std::max(a, b); };
+  const T start = std::numeric_limits<T>::lowest();
+  ReduceTensorImpl(ndim, X_dims, Y_dims, take_max, start, X, Y, context);
+  const int out_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(out_size, alpha, Y, Y, context);
+}
+
+// N-d sum reduction: runs ReduceTensorImpl with std::plus starting from
+// T(0), then scales the product-of-Y_dims outputs by alpha in place.
+template <typename T>
+void ReduceSumImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  const std::plus<T> add{};
+  ReduceTensorImpl(ndim, X_dims, Y_dims, add, T(0), X, Y, context);
+  const int out_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(out_size, alpha, Y, Y, context);
+}
+
template <typename T>
-C10_EXPORT void
-RowwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
+void ReduceMeanImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  // N-d mean reduction: sum with ReduceTensorImpl, then scale by
+  // alpha * Y_size / X_size, i.e. alpha divided by the number of inputs
+  // folded into each output.
+  const std::plus<T> add{};
+  ReduceTensorImpl(ndim, X_dims, Y_dims, add, T(0), X, Y, context);
+  const auto num_elements = [ndim](const int* dims) {
+    return std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());
+  };
+  const int X_size = num_elements(X_dims);
+  const int Y_size = num_elements(Y_dims);
+  const T mean_scale = alpha * static_cast<T>(Y_size) / static_cast<T>(X_size);
+  Scale<T, T, CPUContext>(Y_size, mean_scale, Y, Y, context);
+}
+
+// N-d L1 reduction: runs ReduceTensorImpl accumulating std::abs of each
+// input starting from T(0), then scales the product-of-Y_dims outputs by
+// alpha in place.
+template <typename T>
+void ReduceL1Impl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  const auto add_abs = [](const T a, const T b) { return a + std::abs(b); };
+  ReduceTensorImpl(ndim, X_dims, Y_dims, add_abs, T(0), X, Y, context);
+  const int out_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(out_size, alpha, Y, Y, context);
+}
+
+// N-d L2 reduction: runs ReduceTensorImpl accumulating the square of each
+// input starting from T(0), then replaces every output with
+// sqrt(value) * alpha.
+template <typename T>
+void ReduceL2Impl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  const auto add_square = [](const T a, const T b) { return a + b * b; };
+  ReduceTensorImpl(ndim, X_dims, Y_dims, add_square, T(0), X, Y, context);
+  const int out_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  EigenVectorArrayMap<T> out(Y, out_size);
+  out = out.sqrt() * alpha;
+}
+
+template <typename T>
+void RowwiseMoments(
+ const int rows,
+ const int cols,
+ const T* X,
+ T* mean,
+ T* var) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> mean_arr(mean, rows);
EigenVectorArrayMap<T> var_arr(var, rows);
}
template <typename T>
-C10_EXPORT void
-ColwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
- std::memset(mean, 0, sizeof(T) * cols);
- std::memset(var, 0, sizeof(T) * cols);
+void ColwiseMoments(  // Accumulates, across the `rows` columns of the X_arr view, element sums into mean and sums of squares into var.
+ const int rows,  // number of columns of the X_arr view that are accumulated
+ const int cols,  // length of each X_arr column and of the mean / var outputs
+ const T* X,  // input buffer, viewed below as a cols x rows array
+ T* mean,  // output of `cols` elements
+ T* var) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> mean_arr(mean, cols);
EigenVectorArrayMap<T> var_arr(var, cols);
// Eigen rowwise reduction is about 10 times slower than this for-loop.
- for (int i = 0; i < rows; ++i) {
+ mean_arr = X_arr.col(0);  // seed from column 0 instead of the removed memset zero-fill; NOTE(review): column 0 is read unconditionally, so presumably rows >= 1 -- confirm callers
+ var_arr = X_arr.col(0).square();  // seed the sum of squares the same way
+ for (int i = 1; i < rows; ++i) {
mean_arr += X_arr.col(i);
var_arr += X_arr.col(i).square();
}  // NOTE(review): any normalization of these running sums is not visible in this hunk
}
template <typename T>
-C10_EXPORT void BothEndsMoments(
- const int pre,
- const int mid,
- const int nxt,
+void BothEndsMoments(  // Computes N means and variances from X read as M consecutive blocks of N * K elements, reducing over the M and K extents.
+ const int M,  // number of blocks of N * K elements in X
+ const int N,  // number of outputs written to mean and to var
+ const int K,  // elements summed per output within one block
const T* X,
T* mean,
T* var) {
- std::memset(mean, 0, sizeof(T) * mid);
- std::memset(var, 0, sizeof(T) * mid);
- EigenVectorArrayMap<T> mean_arr(mean, mid);
- EigenVectorArrayMap<T> var_arr(var, mid);
- ConstEigenArrayMap<T> X_arr(X, nxt, pre * mid);
- for (int i = 0; i < pre; ++i) {
- for (int j = 0; j < mid; ++j) {
- const int c = i * mid + j;
- mean_arr(j) += X_arr.col(c).sum();
- var_arr(j) += X_arr.col(c).square().sum();
- }
+ EigenVectorArrayMap<T> mean_arr(mean, N);
+ EigenVectorArrayMap<T> var_arr(var, N);
+ ConstEigenArrayMap<T> X0_arr(X, K, N);  // first block as a K x N view: one column of K elements per output
+ mean_arr = X0_arr.colwise().sum();  // seed with the first block's column sums (replaces the removed memset zero-fill)
+ var_arr = X0_arr.square().colwise().sum();  // seed with the first block's column sums of squares
+ for (int i = 1; i < M; ++i) {
+ ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);  // block i starts i * N * K elements into X
+ mean_arr += X_arr.colwise().sum();
+ var_arr += X_arr.square().colwise().sum();
}
- const T scale = T(1) / static_cast<T>(pre * nxt);
+ const T scale = T(1) / static_cast<T>(M * K);  // each output accumulated M * K input elements
mean_arr *= scale;
var_arr = var_arr * scale - mean_arr.square();
}
template <typename T>
-C10_EXPORT void MomentsImpl(
+void MomentsImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
} // namespace
+#define DELEGATE_GLOBAL_REDUCE_FUNCTION(T, Func, EigenFunc) /* Defines the CPUContext specialization of Func for element type T as a reduction over a whole buffer. */ \
+ template <> \
+ C10_EXPORT void Func<T, CPUContext>( \
+ const int N, /* number of elements read from X */ \
+ const T* X, \
+ T* Y, /* receives the single reduced value */ \
+ Tensor* /* scratch_ptr */, \
+ CPUContext* /* context */) { \
+ *Y = ConstEigenVectorArrayMap<T>(X, N).EigenFunc(); /* applies the given Eigen reduction member to the N-element view of X */ \
+ }
+DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMin, minCoeff)  // ReduceMin is instantiated for float, int32 and int64
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMin, minCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMin, minCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMax, maxCoeff)  // ReduceMax is instantiated for the same three types
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMax, maxCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMax, maxCoeff)
+#undef DELEGATE_GLOBAL_REDUCE_FUNCTION
+
+#define DELEGATE_REDUCE_FUNCTION(T, Func, kInit, kIsNorm) /* Defines Func<T, CPUContext>: handles trivial cases, then dispatches to a specialized kernel or to Func##Impl. */ \
+ template <> \
+ C10_EXPORT void Func<T, CPUContext>( \
+ const int ndim, \
+ const int* X_dims, \
+ const int* Y_dims, \
+ const T alpha, \
+ const T* X, \
+ T* Y, \
+ CPUContext* context) { \
+ const int X_size = \
+ std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>()); /* product of X_dims */ \
+ const int Y_size = \
+ std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>()); /* product of Y_dims */ \
+ if (X_size == 0) { /* empty input: every output becomes alpha * kInit */ \
+ Set<T, CPUContext>(Y_size, alpha * kInit, Y, context); \
+ return; \
+ } \
+ if (alpha == T(0)) { /* zero scale: zero-fill the output bytes without reading X */ \
+ std::memset(Y, 0, sizeof(T) * Y_size); \
+ return; \
+ } \
+ if (std::equal(X_dims, X_dims + ndim, Y_dims)) { /* identical dims: handled element-wise, no reduction kernel is called */ \
+ if (kIsNorm) { /* norm variants write alpha * |x| */ \
+ EigenVectorArrayMap<T>(Y, Y_size) = \
+ ConstEigenVectorArrayMap<T>(X, X_size).abs() * alpha; \
+ } else { /* other variants write alpha * x */ \
+ Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
+ } \
+ return; \
+ } \
+ int rows; \
+ int cols; \
+ if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { /* rows and cols are outputs of the check */ \
+ Rowwise##Func<T>(rows, cols, alpha, X, Y, context); \
+ return; \
+ } \
+ if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { \
+ Colwise##Func<T>(rows, cols, alpha, X, Y, context); \
+ return; \
+ } \
+ int M; \
+ int N; \
+ int K; \
+ if (utils::IsBothEndsReduce(ndim, X_dims, Y_dims, &M, &N, &K)) { /* M, N and K are outputs of the check */ \
+ BothEnds##Func<T>(M, N, K, alpha, X, Y, context); \
+ return; \
+ } \
+ Func##Impl<T>(ndim, X_dims, Y_dims, alpha, X, Y, context); /* fallback when none of the three checks matched */ \
+ }
+DELEGATE_REDUCE_FUNCTION(  // ReduceMin: kInit is the largest value of T
+ float,
+ ReduceMin,
+ std::numeric_limits<float>::max(),
+ false)
+DELEGATE_REDUCE_FUNCTION(
+ double,
+ ReduceMin,
+ std::numeric_limits<double>::max(),
+ false)
+DELEGATE_REDUCE_FUNCTION(
+ std::int32_t,
+ ReduceMin,
+ std::numeric_limits<std::int32_t>::max(),
+ false)
+DELEGATE_REDUCE_FUNCTION(
+ std::int64_t,
+ ReduceMin,
+ std::numeric_limits<std::int64_t>::max(),
+ false)
+DELEGATE_REDUCE_FUNCTION(  // ReduceMax: kInit is the lowest value of T
+ float,
+ ReduceMax,
+ std::numeric_limits<float>::lowest(),
+ false)
+DELEGATE_REDUCE_FUNCTION(
+ double,
+ ReduceMax,
+ std::numeric_limits<double>::lowest(),
+ false)
+DELEGATE_REDUCE_FUNCTION(
+ std::int32_t,
+ ReduceMax,
+ std::numeric_limits<std::int32_t>::lowest(),
+ false)
+DELEGATE_REDUCE_FUNCTION(
+ std::int64_t,
+ ReduceMax,
+ std::numeric_limits<std::int64_t>::lowest(),
+ false)
+DELEGATE_REDUCE_FUNCTION(float, ReduceSum, 0.0f, false)  // sum, mean and norm variants all use a zero kInit
+DELEGATE_REDUCE_FUNCTION(double, ReduceSum, 0.0, false)
+DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceSum, 0, false)
+DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceSum, 0LL, false)
+DELEGATE_REDUCE_FUNCTION(float, ReduceMean, 0.0f, false)  // ReduceMean is instantiated for float and double only
+DELEGATE_REDUCE_FUNCTION(double, ReduceMean, 0.0, false)
+DELEGATE_REDUCE_FUNCTION(float, ReduceL1, 0.0f, true)  // kIsNorm is true only for ReduceL1 and ReduceL2
+DELEGATE_REDUCE_FUNCTION(double, ReduceL1, 0.0, true)
+DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceL1, 0, true)
+DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceL1, 0LL, true)
+DELEGATE_REDUCE_FUNCTION(float, ReduceL2, 0.0f, true)  // ReduceL2 is instantiated for float and double only
+DELEGATE_REDUCE_FUNCTION(double, ReduceL2, 0.0, true)
+#undef DELEGATE_REDUCE_FUNCTION
+
#define CAFFE2_SPECIALIZED_MOMENTS(T) \
template <> \
C10_EXPORT void Moments<T, CPUContext>( \
#include "caffe2/core/types.h"
namespace caffe2 {
+
+class Tensor;
+
namespace math {
+template <typename T, class Context>  // Whole-buffer ReduceMin over the N elements of X; presumably stores the minimum in *y -- confirm per Context.
+CAFFE2_API void
+ReduceMin(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);  // NOTE(review): role of scratch_ptr is not visible here; presumably scratch space for some contexts
+
+template <typename T, class Context>  // Whole-buffer ReduceMax over the N elements of X; presumably stores the maximum in *y -- confirm per Context.
+CAFFE2_API void
+ReduceMax(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);  // same parameter roles as the ReduceMin overload above
+
+// In all of the reduce functions, X_dims and Y_dims should have ndim elements.
+// Each dimension of Y_dims must match the corresponding dimension of X_dims or
+// must be equal to 1. The dimensions equal to 1 indicate the dimensions of X to
+// be reduced.
+
+// Y = alpha * ReduceMin(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceMin(
+ const int ndim,
+ const int* X_dims,
+ const int* Y_dims,
+ const T alpha,
+ const T* X,
+ T* Y,
+ Context* context);
+
+// Y = alpha * ReduceMax(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceMax(
+ const int ndim,
+ const int* X_dims,
+ const int* Y_dims,
+ const T alpha,
+ const T* X,
+ T* Y,
+ Context* context);
+
+// Y = alpha * ReduceSum(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceSum(
+ const int ndim,
+ const int* X_dims,
+ const int* Y_dims,
+ const T alpha,
+ const T* X,
+ T* Y,
+ Context* context);
+
+// Y = alpha * ReduceMean(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceMean(
+ const int ndim,
+ const int* X_dims,
+ const int* Y_dims,
+ const T alpha,
+ const T* X,
+ T* Y,
+ Context* context);
+
+// Y = alpha * ReduceL1(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceL1(
+ const int ndim,
+ const int* X_dims,
+ const int* Y_dims,
+ const T alpha,
+ const T* X,
+ T* Y,
+ Context* context);
+
+// Y = alpha * ReduceL2(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceL2(
+ const int ndim,
+ const int* X_dims,
+ const int* Y_dims,
+ const T alpha,
+ const T* X,
+ T* Y,
+ Context* context);
+
// Computes mean and variance over axes.
template <typename T, class Context>
CAFFE2_API void Moments(
Context* context);
} // namespace math
+
} // namespace caffe2
#endif // CAFFE2_UTILS_MATH_REDUCE_H_
cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1);
}
-#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData, prefix) \
- template <> \
- C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
- const int n, \
- const TAlpha alpha, \
- const TData* x, \
- TData* y, \
- CPUContext*) { \
- if (y != x) { \
- cblas_##prefix##copy(n, x, 1, y, 1); \
- } \
- if (alpha != TAlpha(1)) { \
- cblas_##prefix##scal(n, static_cast<TData>(alpha), y, 1); \
- } \
- } \
- template <> \
- C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
- const int n, \
- const TAlpha* alpha, \
- const TData* x, \
- TData* y, \
- CPUContext*) { \
- if (y != x) { \
- cblas_##prefix##copy(n, x, 1, y, 1); \
- } \
- if (*alpha != TAlpha(1)) { \
- cblas_##prefix##scal(n, static_cast<TData>(*alpha), y, 1); \
- } \
- }
-CAFFE2_SPECIALIZED_SCALE(float, float, s)
-CAFFE2_SPECIALIZED_SCALE(double, double, d)
-CAFFE2_SPECIALIZED_SCALE(float, double, d)
-#undef CAFFE2_SPECIALIZED_SCALE
-
#define CAFFE2_SPECIALIZED_DOT(T, prefix) \
template <> \
C10_EXPORT void Dot<T, CPUContext>( \
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
-#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
- template <> \
- C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
- const int n, \
- const TAlpha alpha, \
- const TData* x, \
- TData* y, \
- CPUContext* /* context */) { \
- EigenVectorMap<TData>(y, n) = \
- ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(alpha); \
- } \
- template <> \
- C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
- const int n, \
- const TAlpha* alpha, \
- const TData* x, \
- TData* y, \
- CPUContext* /* context */) { \
- EigenVectorMap<TData>(y, n) = \
- ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(*alpha); \
- }
-#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
-CAFFE2_SPECIALIZED_SCALE(float, float)
-CAFFE2_SPECIALIZED_SCALE(double, double)
-CAFFE2_SPECIALIZED_SCALE(float, double)
-#endif // CAFFE2_USE_EIGEN_FOR_BLAS
-CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
-CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
-#undef CAFFE2_SPECIALIZED_SCALE
-
template <>
C10_EXPORT void GemmBatched<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
// Eigen or via custom code.
////////////////////////////////////////////////////////////////////////////////
-#define CAFFE2_SPECIALIZED_SET(T) \
- template <> \
- C10_EXPORT void Set<T, CPUContext>( \
- const size_t N, const T alpha, T* Y, CPUContext*) { \
- if (N == 0) { \
- return; \
- } \
- if (alpha == (T)0) { \
- if (Y != nullptr) { \
- std::memset(Y, 0, N * sizeof(T)); \
- } \
- } else { \
- EigenVectorMap<T>(Y, N).setConstant(alpha); \
- } \
- }
-
-CAFFE2_SPECIALIZED_SET(float);
-CAFFE2_SPECIALIZED_SET(double);
-CAFFE2_SPECIALIZED_SET(int8_t);
-CAFFE2_SPECIALIZED_SET(int16_t);
-CAFFE2_SPECIALIZED_SET(int);
-CAFFE2_SPECIALIZED_SET(int64_t);
-CAFFE2_SPECIALIZED_SET(bool);
-CAFFE2_SPECIALIZED_SET(char);
-CAFFE2_SPECIALIZED_SET(uint8_t);
-CAFFE2_SPECIALIZED_SET(uint16_t);
-#undef CAFFE2_SPECIALIZED_SET
-
-#define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
- template <> \
- C10_EXPORT void ReduceMin<T, CPUContext>( \
- const int N, \
- const T* x, \
- T* y, \
- Tensor* /*scratch_ptr*/, \
- CPUContext* /*context*/) { \
- *y = ConstEigenVectorArrayMap<T>(x, N).minCoeff(); \
- }
-CAFFE2_SPECIALIZED_REDUCEMIN(float)
-#undef CAFFE2_SPECIALIZED_REDUCEMIN
-
-#define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
- template <> \
- C10_EXPORT void ReduceMax<T, CPUContext>( \
- const int N, \
- const T* x, \
- T* y, \
- Tensor* /*scratch_ptr*/, \
- CPUContext* /*context*/) { \
- *y = ConstEigenVectorArrayMap<T>(x, N).maxCoeff(); \
- }
-CAFFE2_SPECIALIZED_REDUCEMAX(float)
-CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
-CAFFE2_SPECIALIZED_REDUCEMAX(int64_t)
-
-#undef CAFFE2_SPECIALIZED_REDUCEMAX
-
-namespace {
-
-template <typename T>
-struct MinFunctor {
- inline T operator()(const T a, const T b) const {
- return std::min(a, b);
- }
-};
-
-template <typename T>
-struct MaxFunctor {
- inline T operator()(const T a, const T b) const {
- return std::max(a, b);
- }
-};
-
-template <typename T>
-struct L1NormFunctor {
- inline T operator()(const T a, const T b) const {
- return a + std::abs(b);
- }
-};
-
-template <typename T>
-struct SquaredL2NormFunctor {
- inline T operator()(const T a, const T b) const {
- return a + b * b;
- }
-};
-
-#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenOp) \
- template <typename T> \
- C10_EXPORT void Rowwise##Func( \
- const int rows, const int cols, const T alpha, const T* X, T* Y) { \
- EigenVectorMap<T>(Y, rows) = \
- ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenOp() * alpha; \
- }
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
-DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
-#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
-
-#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, EigenOp) \
- template <typename T> \
- C10_EXPORT void Colwise##Func( \
- const int rows, const int cols, const T alpha, const T* X, T* Y) { \
- EigenVectorMap<T>(Y, cols) = \
- ConstEigenMatrixMap<T>(X, cols, rows).rowwise().EigenOp() * alpha; \
- }
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, sum)
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMean, mean)
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
-DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL2, norm)
-#undef DELEGATE_COLWISE_REDUCE_FUNCTION
-
-template <typename T>
-C10_EXPORT void BothEndsReduceMin(
- const int pre,
- const int mid,
- const int nxt,
- const T alpha,
- const T* X,
- T* Y) {
- EigenVectorArrayMap<T> Y_arr(Y, mid);
- Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().minCoeff();
- const T* X_ptr = X + mid * nxt;
- // It seems there is some bug in eigen array::min so it cannot be implemented
- // as ReduceSum below.
- for (int i = 1; i < pre; ++i) {
- for (int j = 0; j < mid; ++j) {
- Y[j] = std::min(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).minCoeff());
- X_ptr += nxt;
- }
- }
- if (alpha != T(1)) {
- Y_arr *= alpha;
- }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceMax(
- const int pre,
- const int mid,
- const int nxt,
- const T alpha,
- const T* X,
- T* Y) {
- EigenVectorArrayMap<T> Y_arr(Y, mid);
- Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().maxCoeff();
- const T* X_ptr = X + mid * nxt;
- for (int i = 1; i < pre; ++i) {
- for (int j = 0; j < mid; ++j) {
- Y[j] = std::max(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).maxCoeff());
- X_ptr += nxt;
- }
- }
- if (alpha != T(1)) {
- Y_arr *= alpha;
- }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceSum(
- const int pre,
- const int mid,
- const int nxt,
- const T alpha,
- const T* X,
- T* Y) {
- EigenVectorArrayMap<T> Y_arr(Y, mid);
- Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().sum();
- const int stride = mid * nxt;
- const T* X_ptr = X + stride;
- for (int i = 1; i < pre; ++i) {
- Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().sum();
- X_ptr += stride;
- }
- if (alpha != T(1)) {
- Y_arr *= alpha;
- }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceMean(
- const int pre,
- const int mid,
- const int nxt,
- const T alpha,
- const T* X,
- T* Y) {
- EigenVectorArrayMap<T> Y_arr(Y, mid);
- Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().mean();
- const int stride = mid * nxt;
- const T* X_ptr = X + stride;
- for (int i = 1; i < pre; ++i) {
- Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().mean();
- X_ptr += stride;
- }
- if (alpha / static_cast<T>(pre) != 1) {
- Y_arr *= alpha / static_cast<T>(pre);
- }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceL1(
- const int pre,
- const int mid,
- const int nxt,
- const T alpha,
- const T* X,
- T* Y) {
- EigenVectorArrayMap<T> Y_arr(Y, mid);
- Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid)
- .colwise()
- .template lpNorm<1>()
- .array();
- const int stride = mid * nxt;
- const T* X_ptr = X + stride;
- for (int i = 1; i < pre; ++i) {
- Y_arr += ConstEigenMatrixMap<T>(X_ptr, nxt, mid)
- .colwise()
- .template lpNorm<1>()
- .array();
- X_ptr += stride;
- }
- if (alpha != T(1)) {
- Y_arr *= alpha;
- }
-}
-
-template <typename T>
-C10_EXPORT void BothEndsReduceL2(
- const int pre,
- const int mid,
- const int nxt,
- const T alpha,
- const T* X,
- T* Y) {
- EigenVectorArrayMap<T> Y_arr(Y, mid);
- Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid).colwise().squaredNorm().array();
- const int stride = mid * nxt;
- const T* X_ptr = X + stride;
- for (int i = 1; i < pre; ++i) {
- Y_arr +=
- ConstEigenMatrixMap<T>(X_ptr, nxt, mid).colwise().squaredNorm().array();
- X_ptr += stride;
- }
- Y_arr = Y_arr.sqrt() * alpha;
-}
-
-template <typename T, class Reducer>
-C10_EXPORT void ReduceTensor(
- const int ndim,
- const int* X_dims,
- const int* Y_dims,
- const Reducer& reducer,
- const T init,
- const T alpha,
- const T* X,
- T* Y,
- CPUContext* context) {
- const int X_size =
- std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
- const int Y_size =
- std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
- Set<T, CPUContext>(Y_size, init, Y, context);
- std::vector<int> index(ndim, 0);
- for (int X_index = 0; X_index < X_size; ++X_index) {
- const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
- Y[Y_index] = reducer(Y[Y_index], X[X_index]);
- utils::IncreaseIndexInDims(ndim, X_dims, index.data());
- }
- Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
-}
-
-} // namespace
-
-#define DELEGATE_REDUCE_FUNCTION(T, Func, reducer, init, is_norm) \
- template <> \
- C10_EXPORT void Func<T, CPUContext>( \
- const int num_dims, \
- const int* dims, \
- const int num_axes, \
- const int* axes, \
- const T alpha, \
- const T* X, \
- T* Y, \
- CPUContext* context) { \
- CAFFE_ENFORCE_LE(num_axes, num_dims); \
- std::vector<int> Y_dims_vector(dims, dims + num_dims); \
- for (int i = 0; i < num_axes; ++i) { \
- Y_dims_vector[axes[i]] = 1; \
- } \
- const int* X_dims = dims; \
- const int* Y_dims = Y_dims_vector.data(); \
- const int X_size = \
- std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
- const int Y_size = \
- std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
- if (X_size == 0) { \
- Set<T, CPUContext>(Y_size, alpha * init, Y, context); \
- return; \
- } \
- if (alpha == T(0)) { \
- Set<T, CPUContext>(Y_size, 0, Y, context); \
- return; \
- } \
- if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
- if (is_norm) { \
- Abs<T, CPUContext>(X_size, X, Y, context); \
- Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
- } else { \
- Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
- } \
- return; \
- } \
- int rows; \
- int cols; \
- if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
- Rowwise##Func<T>(rows, cols, alpha, X, Y); \
- return; \
- } \
- if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
- Colwise##Func<T>(rows, cols, alpha, X, Y); \
- return; \
- } \
- int pre; \
- int mid; \
- int nxt; \
- if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
- BothEnds##Func<T>(pre, mid, nxt, alpha, X, Y); \
- return; \
- } \
- ReduceTensor( \
- num_dims, X_dims, Y_dims, reducer, init, alpha, X, Y, context); \
- }
-
-DELEGATE_REDUCE_FUNCTION(
- float,
- ReduceMin,
- MinFunctor<float>(),
- std::numeric_limits<float>::max(),
- false)
-DELEGATE_REDUCE_FUNCTION(
- double,
- ReduceMin,
- MinFunctor<double>(),
- std::numeric_limits<double>::max(),
- false)
-DELEGATE_REDUCE_FUNCTION(
- std::int32_t,
- ReduceMin,
- MinFunctor<std::int32_t>(),
- std::numeric_limits<std::int32_t>::max(),
- false)
-DELEGATE_REDUCE_FUNCTION(
- std::int64_t,
- ReduceMin,
- MinFunctor<std::int64_t>(),
- std::numeric_limits<std::int64_t>::max(),
- false)
-
-DELEGATE_REDUCE_FUNCTION(
- float,
- ReduceMax,
- MaxFunctor<float>(),
- std::numeric_limits<float>::lowest(),
- false)
-DELEGATE_REDUCE_FUNCTION(
- double,
- ReduceMax,
- MaxFunctor<double>(),
- std::numeric_limits<double>::lowest(),
- false)
-DELEGATE_REDUCE_FUNCTION(
- std::int32_t,
- ReduceMax,
- MaxFunctor<std::int32_t>(),
- std::numeric_limits<std::int32_t>::lowest(),
- false)
-DELEGATE_REDUCE_FUNCTION(
- std::int64_t,
- ReduceMax,
- MaxFunctor<std::int64_t>(),
- std::numeric_limits<std::int64_t>::lowest(),
- false)
-
-DELEGATE_REDUCE_FUNCTION(float, ReduceSum, std::plus<float>(), 0.0f, false)
-DELEGATE_REDUCE_FUNCTION(double, ReduceSum, std::plus<double>(), 0.0, false)
-DELEGATE_REDUCE_FUNCTION(
- std::int32_t,
- ReduceSum,
- std::plus<std::int32_t>(),
- 0,
- false)
-DELEGATE_REDUCE_FUNCTION(
- std::int64_t,
- ReduceSum,
- std::plus<std::int64_t>(),
- std::int64_t(0),
- false)
-
-DELEGATE_REDUCE_FUNCTION(float, ReduceL1, L1NormFunctor<float>(), 0.0f, true)
-DELEGATE_REDUCE_FUNCTION(double, ReduceL1, L1NormFunctor<double>(), 0.0, true)
-DELEGATE_REDUCE_FUNCTION(
- std::int32_t,
- ReduceL1,
- L1NormFunctor<std::int32_t>(),
- 0,
- true)
-DELEGATE_REDUCE_FUNCTION(
- std::int64_t,
- ReduceL1,
- L1NormFunctor<std::int64_t>(),
- std::int64_t(0),
- true)
-
-#undef DELEGATE_REDUCE_FUNCTION
-
-#define CAFFE2_SPECIALIZED_REDUCE_MEAN(T) \
- template <> \
- C10_EXPORT void ReduceMean<T, CPUContext>( \
- const int num_dims, \
- const int* dims, \
- const int num_axes, \
- const int* axes, \
- const T alpha, \
- const T* X, \
- T* Y, \
- CPUContext* context) { \
- CAFFE_ENFORCE_LE(num_axes, num_dims); \
- std::vector<int> Y_dims_vector(dims, dims + num_dims); \
- for (int i = 0; i < num_axes; ++i) { \
- Y_dims_vector[axes[i]] = 1; \
- } \
- const int* X_dims = dims; \
- const int* Y_dims = Y_dims_vector.data(); \
- const int X_size = \
- std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
- const int Y_size = \
- std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
- if (X_size == 0) { \
- Set<T, CPUContext>(Y_size, 0, Y, context); \
- return; \
- } \
- if (alpha == T(0)) { \
- Set<T, CPUContext>(Y_size, 0, Y, context); \
- return; \
- } \
- if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
- Scale<T, T, CPUContext>(X_size, alpha, X, Y, context); \
- return; \
- } \
- int rows; \
- int cols; \
- if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
- RowwiseReduceMean<T>(rows, cols, alpha, X, Y); \
- return; \
- } \
- if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
- ColwiseReduceMean<T>(rows, cols, alpha, X, Y); \
- return; \
- } \
- int pre; \
- int mid; \
- int nxt; \
- if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
- BothEndsReduceMean<T>(pre, mid, nxt, alpha, X, Y); \
- return; \
- } \
- const int scale = X_size / Y_size; \
- ReduceTensor( \
- num_dims, \
- X_dims, \
- Y_dims, \
- std::plus<T>(), \
- T(0), \
- alpha / static_cast<T>(scale), \
- X, \
- Y, \
- context); \
- }
-CAFFE2_SPECIALIZED_REDUCE_MEAN(float)
-CAFFE2_SPECIALIZED_REDUCE_MEAN(double)
-#undef CAFFE2_SPECIALIZED_REDUCE_MEAN
-
-#define CAFFE2_SPECIALIZED_REDUCE_L2(T) \
- template <> \
- C10_EXPORT void ReduceL2<T, CPUContext>( \
- const int num_dims, \
- const int* dims, \
- const int num_axes, \
- const int* axes, \
- const T alpha, \
- const T* X, \
- T* Y, \
- CPUContext* context) { \
- CAFFE_ENFORCE_LE(num_axes, num_dims); \
- std::vector<int> Y_dims_vector(dims, dims + num_dims); \
- for (int i = 0; i < num_axes; ++i) { \
- Y_dims_vector[axes[i]] = 1; \
- } \
- const int* X_dims = dims; \
- const int* Y_dims = Y_dims_vector.data(); \
- const int X_size = \
- std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
- const int Y_size = \
- std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
- if (X_size == 0) { \
- Set<T, CPUContext>(Y_size, 0, Y, context); \
- return; \
- } \
- if (alpha == T(0)) { \
- Set<T, CPUContext>(Y_size, 0, Y, context); \
- return; \
- } \
- if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
- Abs<T, CPUContext>(X_size, X, Y, context); \
- Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
- return; \
- } \
- int rows; \
- int cols; \
- if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
- RowwiseReduceL2<T>(rows, cols, alpha, X, Y); \
- return; \
- } \
- if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
- ColwiseReduceL2<T>(rows, cols, alpha, X, Y); \
- return; \
- } \
- int pre; \
- int mid; \
- int nxt; \
- if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
- BothEndsReduceL2<T>(pre, mid, nxt, alpha, X, Y); \
- return; \
- } \
- ReduceTensor( \
- num_dims, \
- X_dims, \
- Y_dims, \
- SquaredL2NormFunctor<T>(), \
- T(0), \
- T(1), \
- X, \
- Y, \
- context); \
- Sqrt<T, CPUContext>(Y_size, Y, Y, context); \
- Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
- }
-CAFFE2_SPECIALIZED_REDUCE_L2(float)
-CAFFE2_SPECIALIZED_REDUCE_L2(double)
-#undef CAFFE2_SPECIALIZED_REDUCE_L2
-
namespace {
template <typename T>
#define CAFFE2_SPECIALIZED_CUDA_SET(T) \
template <> \
CAFFE2_CUDA_API void Set<T, CUDAContext>( \
- const size_t N, const T alpha, T* Y, CUDAContext* context) { \
+ const int N, const T alpha, T* Y, CUDAContext* context) { \
if (N == 0) { \
return; \
} \
template <>
CAFFE2_CUDA_EXPORT void Set<at::Half, CUDAContext>(
- const size_t N,
+ const int N,
const at::Half alpha,
at::Half* Y,
CUDAContext* context) {
template <typename T, class Reducer>
CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
- const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
+ const int ndim,
+ const int* X_dims,
+ const int* Y_dims,
const Reducer& reducer,
const T init,
const T alpha,
const T* X,
T* Y,
CUDAContext* context) {
- CAFFE_ENFORCE_LE(num_axes, num_dims);
- std::vector<int> Y_dims_vector(dims, dims + num_dims);
- for (int i = 0; i < num_axes; ++i) {
- Y_dims_vector[axes[i]] = 1;
- }
- const int* X_dims = dims;
- const int* Y_dims = Y_dims_vector.data();
const int X_size =
- std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>());
+ std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
- std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>());
+ std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
if (X_size == 0) {
Set<T, CUDAContext>(Y_size, alpha * init, Y, context);
return;
Set<T, CUDAContext>(Y_size, T(0), Y, context);
return;
}
- if (std::equal(X_dims, X_dims + num_dims, Y_dims)) {
+ if (std::equal(X_dims, X_dims + ndim, Y_dims)) {
Scale<T, T, CUDAContext>(X_size, alpha, X, Y, context);
return;
}
int rows;
int cols;
- if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
+ if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
RowwiseReduceKernel<T>
<<<std::min(rows, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
return;
}
- if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
+ if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
ColwiseReduceKernel<T>
<<<std::min(cols, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
return;
}
- std::vector<int> transpose_axes(num_dims);
- utils::ComputeTransposeAxesForReduceOp(
- num_dims, num_axes, axes, transpose_axes.data());
+ std::vector<int> axes(ndim);
+ utils::ComputeTransposeAxesForReduceOp(ndim, Y_dims, axes.data());
const int outer_size = Y_size;
const int inner_size = X_size / Y_size;
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
- num_dims,
+ ndim,
ReduceTensorCUDAImpl,
T,
Reducer,
outer_size,
inner_size,
- dims,
- transpose_axes.data(),
+ X_dims,
+ axes.data(),
reducer,
init,
alpha,
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceMin<T, CUDAContext>( \
- const int num_dims, \
- const int* dims, \
- const int num_axes, \
- const int* axes, \
+ const int ndim, \
+ const int* X_dims, \
+ const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
- num_dims, \
- dims, \
- num_axes, \
- axes, \
+ ndim, \
+ X_dims, \
+ Y_dims, \
cub::Min(), \
std::numeric_limits<T>::max(), \
alpha, \
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceMax<T, CUDAContext>( \
- const int num_dims, \
- const int* dims, \
- const int num_axes, \
- const int* axes, \
+ const int ndim, \
+ const int* X_dims, \
+ const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
- num_dims, \
- dims, \
- num_axes, \
- axes, \
+ ndim, \
+ X_dims, \
+ Y_dims, \
cub::Max(), \
std::numeric_limits<T>::lowest(), \
alpha, \
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double)
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX
-#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \
- template <> \
- CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>( \
- const int num_dims, \
- const int* dims, \
- const int num_axes, \
- const int* axes, \
- const T alpha, \
- const T* X, \
- T* Y, \
- CUDAContext* context) { \
- ReduceTensorCUDA( \
- num_dims, \
- dims, \
- num_axes, \
- axes, \
- cub::Sum(), \
- T(0), \
- alpha, \
- X, \
- Y, \
- context); \
+#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \
+ template <> \
+ CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>( \
+ const int ndim, \
+ const int* X_dims, \
+ const int* Y_dims, \
+ const T alpha, \
+ const T* X, \
+ T* Y, \
+ CUDAContext* context) { \
+ ReduceTensorCUDA( \
+ ndim, X_dims, Y_dims, cub::Sum(), T(0), alpha, X, Y, context); \
}
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int32_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int64_t)
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceMean<T, CUDAContext>( \
- const int num_dims, \
- const int* dims, \
- const int num_axes, \
- const int* axes, \
+ const int ndim, \
+ const int* X_dims, \
+ const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
int scale = 1; \
- for (int i = 0; i < num_axes; ++i) { \
- scale *= dims[axes[i]]; \
+ for (int i = 0; i < ndim; ++i) { \
+ if (Y_dims[i] == 1) { \
+ scale *= X_dims[i]; \
+ } \
} \
ReduceTensorCUDA( \
- num_dims, \
- dims, \
- num_axes, \
- axes, \
+ ndim, \
+ X_dims, \
+ Y_dims, \
cub::Sum(), \
T(0), \
alpha / static_cast<T>(scale), \
GemmBatchedGPUTest,
testing::Combine(testing::Bool(), testing::Bool()));
-class ReduceTensorGPUTest : public testing::Test {
- protected:
- void SetUp() override {
- if (!HasCudaGPU()) {
- return;
- }
- option_.set_device_type(PROTO_CUDA);
- cuda_context_ = make_unique<CUDAContext>(option_);
- Blob* blob_x = ws_.CreateBlob("X");
- Blob* blob_y = ws_.CreateBlob("Y");
- X_ = BlobGetMutableTensor(blob_x, CUDA);
- Y_ = BlobGetMutableTensor(blob_y, CUDA);
- }
-
- void SetUpData(
- const std::vector<int>& X_dims,
- const std::vector<int>& axes,
- const std::vector<float>& X_data) {
- std::vector<int> Y_dims = X_dims;
- for (const int axis : axes) {
- Y_dims[axis] = 1;
- }
- X_->Resize(X_dims);
- Y_->Resize(Y_dims);
- ASSERT_EQ(X_data.size(), X_->numel());
- cuda_context_->CopyFromCPU<float>(
- X_data.size(), X_data.data(), X_->mutable_data<float>());
- }
-
- void VerifyResult(const std::vector<float>& expected_output) {
- Blob* blob_y_host = ws_.CreateBlob("Y_host");
- auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
- Y_host->CopyFrom(*Y_);
- ASSERT_EQ(expected_output.size(), Y_host->numel());
- for (std::size_t i = 0; i < expected_output.size(); ++i) {
- EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
- }
- }
-
- template <class ReduceFunc>
- void RunRedcueTensorTest(
- const ReduceFunc& reduce_func,
- const std::vector<int>& X_dims,
- const std::vector<int>& axes,
- const std::vector<float>& X_data,
- const std::vector<float>& Y_data) {
- SetUpData(X_dims, axes, X_data);
- reduce_func(
- X_dims.size(),
- X_dims.data(),
- axes.size(),
- axes.data(),
- 1.0f,
- X_->data<float>(),
- Y_->mutable_data<float>(),
- cuda_context_.get());
- VerifyResult(Y_data);
- }
-
- Workspace ws_;
- DeviceOption option_;
- std::unique_ptr<CUDAContext> cuda_context_;
- Tensor* X_ = nullptr;
- Tensor* Y_ = nullptr;
-};
-
-TEST_F(ReduceTensorGPUTest, ReduceMinGPUTest) {
- if (!HasCudaGPU()) {
- return;
- }
- const auto& reduce_min = [](const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const float alpha,
- const float* X,
- float* Y,
- CUDAContext* context) {
- return math::ReduceMin<float, CUDAContext>(
- num_dims, dims, num_axes, axes, alpha, X, Y, context);
- };
- // Test for 1D tensor.
- RunRedcueTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});
-
- // Test for 2D Tensor.
- RunRedcueTensorTest(
- reduce_min,
- {2, 3},
- {1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {1.0f, 4.0f});
- RunRedcueTensorTest(
- reduce_min,
- {2, 3},
- {0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {1.0f, 2.0f, 3.0f});
- RunRedcueTensorTest(
- reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});
-
- // Test for 3D tensor.
- RunRedcueTensorTest(
- reduce_min,
- {2, 2, 2},
- {1, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 5.0f});
- RunRedcueTensorTest(
- reduce_min,
- {2, 2, 2},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 2.0f});
- RunRedcueTensorTest(
- reduce_min,
- {2, 2, 2},
- {0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 3.0f});
-}
-
-TEST_F(ReduceTensorGPUTest, ReduceMaxGPUTest) {
- if (!HasCudaGPU()) {
- return;
- }
- const auto& reduce_max = [](const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const float alpha,
- const float* X,
- float* Y,
- CUDAContext* context) {
- return math::ReduceMax<float, CUDAContext>(
- num_dims, dims, num_axes, axes, alpha, X, Y, context);
- };
- // Test for 1D tensor.
- RunRedcueTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});
-
- // Test for 2D Tensor.
- RunRedcueTensorTest(
- reduce_max,
- {2, 3},
- {1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {3.0f, 6.0f});
- RunRedcueTensorTest(
- reduce_max,
- {2, 3},
- {0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {4.0f, 5.0f, 6.0f});
- RunRedcueTensorTest(
- reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});
-
- // Test for 3D tensor.
- RunRedcueTensorTest(
- reduce_max,
- {2, 2, 2},
- {1, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {4.0f, 8.0f});
- RunRedcueTensorTest(
- reduce_max,
- {2, 2, 2},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {7.0f, 8.0f});
- RunRedcueTensorTest(
- reduce_max,
- {2, 2, 2},
- {0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {6.0f, 8.0f});
-}
-
-TEST_F(ReduceTensorGPUTest, ReduceSumGPUTest) {
- if (!HasCudaGPU()) {
- return;
- }
- // Test for 1D tensor.
- RunRedcueTensorTest(
- math::ReduceSum<float, CUDAContext>,
- {3},
- {0},
- {1.0f, 2.0f, 3.0f},
- {6.0f});
-
- // Test for 2D Tensor.
- RunRedcueTensorTest(
- math::ReduceSum<float, CUDAContext>,
- {2, 3},
- {1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {6.0f, 15.0f});
- RunRedcueTensorTest(
- math::ReduceSum<float, CUDAContext>,
- {2, 3},
- {0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {5.0f, 7.0f, 9.0f});
- RunRedcueTensorTest(
- math::ReduceSum<float, CUDAContext>,
- {2, 3},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {21.0f});
-
- // Test for 3D tensor.
- RunRedcueTensorTest(
- math::ReduceSum<float, CUDAContext>,
- {2, 2, 2},
- {1, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {10.0f, 26.0f});
- RunRedcueTensorTest(
- math::ReduceSum<float, CUDAContext>,
- {2, 2, 2},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {16.0f, 20.0f});
- RunRedcueTensorTest(
- math::ReduceSum<float, CUDAContext>,
- {2, 2, 2},
- {0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {14.0f, 22.0f});
-}
-
-TEST_F(ReduceTensorGPUTest, ReduceMeanGPUTest) {
- if (!HasCudaGPU()) {
- return;
- }
- // Test for 1D tensor.
- RunRedcueTensorTest(
- math::ReduceMean<float, CUDAContext>,
- {3},
- {0},
- {1.0f, 2.0f, 3.0f},
- {2.0f});
-
- // Test for 2D Tensor.
- RunRedcueTensorTest(
- math::ReduceMean<float, CUDAContext>,
- {2, 3},
- {1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {2.0f, 5.0f});
- RunRedcueTensorTest(
- math::ReduceMean<float, CUDAContext>,
- {2, 3},
- {0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {2.5f, 3.5f, 4.5f});
- RunRedcueTensorTest(
- math::ReduceMean<float, CUDAContext>,
- {2, 3},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {3.5f});
-
- // Test for 3D tensor.
- RunRedcueTensorTest(
- math::ReduceMean<float, CUDAContext>,
- {2, 2, 2},
- {1, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {2.5f, 6.5f});
- RunRedcueTensorTest(
- math::ReduceMean<float, CUDAContext>,
- {2, 2, 2},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {4.0f, 5.0f});
- RunRedcueTensorTest(
- math::ReduceMean<float, CUDAContext>,
- {2, 2, 2},
- {0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {3.5f, 5.5f});
-}
-
class BroadcastGPUTest : public testing::Test {
protected:
void SetUp() override {
namespace {
-class ReduceTensorTest : public testing::Test {
- protected:
- void SetUp() override {
- cpu_context_ = make_unique<CPUContext>(option_);
- }
-
- template <class ReduceFunc>
- void RunRedcueTensorTest(
- const ReduceFunc& reduce_func,
- const std::vector<int>& X_dims,
- const std::vector<int>& axes,
- const std::vector<float>& X_data,
- const std::vector<float>& Y_data) {
- std::vector<int> Y_dims = X_dims;
- for (const int axis : axes) {
- Y_dims[axis] = 1;
- }
- std::vector<int64_t> X_dims_64;
- std::vector<int64_t> Y_dims_64;
- std::copy(X_dims.cbegin(), X_dims.cend(), std::back_inserter(X_dims_64));
- std::copy(Y_dims.cbegin(), Y_dims.cend(), std::back_inserter(Y_dims_64));
- ReinitializeTensor(&X_, X_dims_64, at::dtype<float>().device(CPU));
- ReinitializeTensor(&Y_, Y_dims_64, at::dtype<float>().device(CPU));
- ASSERT_EQ(X_data.size(), X_.numel());
- cpu_context_->CopyFromCPU<float>(
- X_data.size(), X_data.data(), X_.mutable_data<float>());
- reduce_func(
- X_dims.size(),
- X_dims.data(),
- axes.size(),
- axes.data(),
- 1.0f,
- X_.data<float>(),
- Y_.mutable_data<float>(),
- cpu_context_.get());
- ASSERT_EQ(Y_data.size(), Y_.numel());
- for (int i = 0; i < Y_.numel(); ++i) {
- EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
- }
- }
-
- DeviceOption option_;
- std::unique_ptr<CPUContext> cpu_context_;
- Tensor X_;
- Tensor Y_;
-};
-
-TEST_F(ReduceTensorTest, ReduceMinTest) {
- const auto& reduce_min = [](const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const float alpha,
- const float* X,
- float* Y,
- CPUContext* context) {
- return math::ReduceMin<float, CPUContext>(
- num_dims, dims, num_axes, axes, alpha, X, Y, context);
- };
- // Test for 1D tensor.
- RunRedcueTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});
-
- // Test for 2D Tensor.
- RunRedcueTensorTest(
- reduce_min,
- {2, 3},
- {1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {1.0f, 4.0f});
- RunRedcueTensorTest(
- reduce_min,
- {2, 3},
- {0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {1.0f, 2.0f, 3.0f});
- RunRedcueTensorTest(
- reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});
-
- // Test for 3D tensor.
- RunRedcueTensorTest(
- reduce_min,
- {2, 2, 2},
- {1, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 5.0f});
- RunRedcueTensorTest(
- reduce_min,
- {2, 2, 2},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 2.0f});
- RunRedcueTensorTest(
- reduce_min,
- {2, 2, 2},
- {0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 3.0f});
-}
-
-TEST_F(ReduceTensorTest, ReduceMaxTest) {
- const auto& reduce_max = [](const int num_dims,
- const int* dims,
- const int num_axes,
- const int* axes,
- const float alpha,
- const float* X,
- float* Y,
- CPUContext* context) {
- return math::ReduceMax<float, CPUContext>(
- num_dims, dims, num_axes, axes, alpha, X, Y, context);
- };
- // Test for 1D tensor.
- RunRedcueTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});
-
- // Test for 2D Tensor.
- RunRedcueTensorTest(
- reduce_max,
- {2, 3},
- {1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {3.0f, 6.0f});
- RunRedcueTensorTest(
- reduce_max,
- {2, 3},
- {0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {4.0f, 5.0f, 6.0f});
- RunRedcueTensorTest(
- reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});
-
- // Test for 3D tensor.
- RunRedcueTensorTest(
- reduce_max,
- {2, 2, 2},
- {1, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {4.0f, 8.0f});
- RunRedcueTensorTest(
- reduce_max,
- {2, 2, 2},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {7.0f, 8.0f});
- RunRedcueTensorTest(
- reduce_max,
- {2, 2, 2},
- {0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {6.0f, 8.0f});
-}
-
-TEST_F(ReduceTensorTest, ReduceSumTest) {
- // Test for 1D tensor.
- RunRedcueTensorTest(
- math::ReduceSum<float, CPUContext>, {3}, {0}, {1.0f, 2.0f, 3.0f}, {6.0f});
-
- // Test for 2D Tensor.
- RunRedcueTensorTest(
- math::ReduceSum<float, CPUContext>,
- {2, 3},
- {1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {6.0f, 15.0f});
- RunRedcueTensorTest(
- math::ReduceSum<float, CPUContext>,
- {2, 3},
- {0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {5.0f, 7.0f, 9.0f});
- RunRedcueTensorTest(
- math::ReduceSum<float, CPUContext>,
- {2, 3},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {21.0f});
-
- // Test for 3D tensor.
- RunRedcueTensorTest(
- math::ReduceSum<float, CPUContext>,
- {2, 2, 2},
- {1, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {10.0f, 26.0f});
- RunRedcueTensorTest(
- math::ReduceSum<float, CPUContext>,
- {2, 2, 2},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {16.0f, 20.0f});
- RunRedcueTensorTest(
- math::ReduceSum<float, CPUContext>,
- {2, 2, 2},
- {0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {14.0f, 22.0f});
-}
-
-TEST_F(ReduceTensorTest, ReduceMeanTest) {
- // Test for 1D tensor.
- RunRedcueTensorTest(
- math::ReduceMean<float, CPUContext>,
- {3},
- {0},
- {1.0f, 2.0f, 3.0f},
- {2.0f});
-
- // Test for 2D Tensor.
- RunRedcueTensorTest(
- math::ReduceMean<float, CPUContext>,
- {2, 3},
- {1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {2.0f, 5.0f});
- RunRedcueTensorTest(
- math::ReduceMean<float, CPUContext>,
- {2, 3},
- {0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {2.5f, 3.5f, 4.5f});
- RunRedcueTensorTest(
- math::ReduceMean<float, CPUContext>,
- {2, 3},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {3.5f});
-
- // Test for 3D tensor.
- RunRedcueTensorTest(
- math::ReduceMean<float, CPUContext>,
- {2, 2, 2},
- {1, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {2.5f, 6.5f});
- RunRedcueTensorTest(
- math::ReduceMean<float, CPUContext>,
- {2, 2, 2},
- {0, 1},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {4.0f, 5.0f});
- RunRedcueTensorTest(
- math::ReduceMean<float, CPUContext>,
- {2, 2, 2},
- {0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {3.5f, 5.5f});
-}
-
class BroadcastTest : public testing::Test {
protected:
void SetUp() override {