From 265fa0ce4d4a10feeeaf76c2445fdd1fe0080b2e Mon Sep 17 00:00:00 2001 From: Xiaomeng Yang Date: Tue, 26 Mar 2019 12:13:51 -0700 Subject: [PATCH] Move math::Axpy function to elementwise lib (#18316) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/18316 Move math::Axpy function to elementwise lib i-am-not-moving-c2-to-c10 Reviewed By: houseroad Differential Revision: D14574697 fbshipit-source-id: 7cfbb2da295c8966c5328bd6b577cce2638eea62 --- caffe2/operators/distance_op.h | 3 +- .../operators/local_response_normalization_op.cc | 59 ++- caffe2/operators/utility_ops.h | 2 +- caffe2/operators/utility_ops_cudnn.cc | 169 -------- caffe2/perfkernels/typed_axpy.cc | 2 +- caffe2/utils/math.h | 30 -- caffe2/utils/math/elementwise.cc | 100 +++++ caffe2/utils/math/elementwise.cu | 482 +++++++++++++++++---- caffe2/utils/math/elementwise.h | 33 ++ caffe2/utils/math_cpu.cc | 106 ----- caffe2/utils/math_gpu.cu | 195 --------- 11 files changed, 575 insertions(+), 606 deletions(-) delete mode 100644 caffe2/operators/utility_ops_cudnn.cc diff --git a/caffe2/operators/distance_op.h b/caffe2/operators/distance_op.h index a246209..11b43b6 100644 --- a/caffe2/operators/distance_op.h +++ b/caffe2/operators/distance_op.h @@ -259,7 +259,8 @@ class DotProductWithPaddingGradientOp final : public Operator { DS, dDot_data[i], S_data, dL_data + j * DS, &context_); math::Scale( DS, dDot_data[i], L_data + j * DS, tmp_data.data(), &context_); - math::Axpy(DS, 1.0, tmp_data.data(), dS_data, &context_); + math::Axpy( + DS, 1.0, tmp_data.data(), dS_data, &context_); } } else { math::Scale( diff --git a/caffe2/operators/local_response_normalization_op.cc b/caffe2/operators/local_response_normalization_op.cc index e6b079b..0f21f2a 100644 --- a/caffe2/operators/local_response_normalization_op.cc +++ b/caffe2/operators/local_response_normalization_op.cc @@ -40,9 +40,12 @@ bool LRNOp::RunOnDeviceWithOrderNCHW() { &context_); // Create the first channel scale for (int c = 0; c < size_; ++c) { - math::Axpy( - H * W, alpha_over_size, padded_square_data + c * H * W, - scale_data + image_size * n, &context_); + math::Axpy( + H * W, + alpha_over_size, + padded_square_data + c * H * W, + scale_data + image_size * n, + &context_); } for (int c = 1; c < C; ++c) { float* this_scale_slice = scale_data + n * image_size + c * H * W; @@ -50,13 +53,19 @@ bool LRNOp::RunOnDeviceWithOrderNCHW() { context_.CopyFromCPU( H * W, this_scale_slice - H * W, this_scale_slice); // add head - math::Axpy( - H * W, alpha_over_size, padded_square_data + (c + size_ - 1) * H * W, - this_scale_slice, &context_); + math::Axpy( + H * W, + alpha_over_size, + padded_square_data + (c + size_ - 1) * H * W, + this_scale_slice, + &context_); // subtract tail - math::Axpy( - H * W, -alpha_over_size, padded_square_data + (c - 1) * H * W, - this_scale_slice, &context_); + math::Axpy( + H * W, + -alpha_over_size, + padded_square_data + (c - 1) * H * W, + this_scale_slice, + &context_); } } math::Powx( @@ -161,9 +170,12 @@ bool LRNGradientOp::RunOnDeviceWithOrderNCHW() { &context_); // Create the first channel scale for (int c = 0; c < size_; ++c) { - math::Axpy( - H * W, alpha_over_size, padded_ratio_data + c * H * W, - scale_data + image_size * n, &context_); + math::Axpy( + H * W, + alpha_over_size, + padded_ratio_data + c * H * W, + scale_data + image_size * n, + &context_); } for (int c = 1; c < C; ++c) { float* this_scale_slice = scale_data + n * image_size + c * H * W; @@ -171,13 +183,19 @@ bool LRNGradientOp::RunOnDeviceWithOrderNCHW() { 
context_.CopyFromCPU( H * W, this_scale_slice - H * W, this_scale_slice); // add head - math::Axpy( - H * W, alpha_over_size, padded_ratio_data + (c + size_ - 1) * H * W, - this_scale_slice, &context_); + math::Axpy( + H * W, + alpha_over_size, + padded_ratio_data + (c + size_ - 1) * H * W, + this_scale_slice, + &context_); // subtract tail - math::Axpy( - H * W, -alpha_over_size, padded_ratio_data + (c - 1) * H * W, - this_scale_slice, &context_); + math::Axpy( + H * W, + -alpha_over_size, + padded_ratio_data + (c - 1) * H * W, + this_scale_slice, + &context_); } } @@ -203,9 +221,8 @@ bool LRNGradientOp::RunOnDeviceWithOrderNCHW() { math::Set( accum_ratio.numel(), 0., accum_ratio_data, &context_); for (int c = 0; c < size_ - 1; ++c) { - math::Axpy(H * W, 1, - padded_ratio_data + c * H * W, - accum_ratio_data, &context_); + math::Axpy( + H * W, 1, padded_ratio_data + c * H * W, accum_ratio_data, &context_); } for (int c = 0; c < C; ++c) { for (int hw = 0; hw < H * W; ++hw) { diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index 0786f8a..2b38d1b 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -414,7 +414,7 @@ class WeightedSumOp : public Operator { const auto& weighti = Input(i + 1); CAFFE_ENFORCE_EQ(Xi.numel(), size); CAFFE_ENFORCE_EQ(weighti.numel(), 1); - math::Axpy( + math::Axpy( size, weighti.template data(), Xi.template data(), diff --git a/caffe2/operators/utility_ops_cudnn.cc b/caffe2/operators/utility_ops_cudnn.cc deleted file mode 100644 index c04ad32..0000000 --- a/caffe2/operators/utility_ops_cudnn.cc +++ /dev/null @@ -1,169 +0,0 @@ -#include "caffe2/operators/utility_ops.h" - -#include - -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/cudnn_wrappers.h" -#include "caffe2/utils/conversions.h" - -namespace caffe2 { - -class CuDNNWeightedSumOp : public Operator { - public: - USE_OPERATOR_FUNCTIONS(CUDAContext); - - template - explicit CuDNNWeightedSumOp(Args&&... args) - : Operator(std::forward(args)...), - cudnn_wrapper_(&context_) { - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_)); - CUDNN_ENFORCE(cudnnCreateOpTensorDescriptor(&add_desc_)); - // Both float and at::Half require opTensorCompType to be CUDNN_DATA_FLOAT. 
- CUDNN_ENFORCE(cudnnSetOpTensorDescriptor( - add_desc_, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN)); - } - - ~CuDNNWeightedSumOp() override { - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_)); - CUDNN_ENFORCE(cudnnDestroyOpTensorDescriptor(add_desc_)); - } - - bool RunOnDevice() override { - return DispatchHelper>::call(this, Input(0)); - } - - template - bool DoRunWithType() { - if (std::is_same::value) { - LOG(WARNING) - << "CuDNN only support same type for data and weight, " - "so the weight will be cast to at::Half when data type is Half."; - } - const int num_inputs = InputSize(); - CAFFE_ENFORCE_EQ(num_inputs % 2, 0); - const auto& X0 = Input(0); - const auto& weight0 = Input(1); - CAFFE_ENFORCE_GT(X0.numel(), 0); - CAFFE_ENFORCE_EQ(weight0.numel(), 1); - const int input_size = X0.numel(); - SetTensorDescriptor(cudnnTypeWrapper::type, input_size); - - // Note: removed Aliasing check, since Output already has - // caching capability - auto* Y = Output(0, X0.sizes(), at::dtype()); - T* Y_data = Y->template mutable_data(); - T alpha = convert::To(0.0f); - T beta = convert::To(0.0f); - if (num_inputs == 2) { - CopyWeightToHost(weight0.template data(), &alpha); - CUDNN_ENFORCE(cudnnAddTensor( - cudnn_wrapper_.inline_cudnn_handle(), - &alpha, - data_desc_, - X0.template data(), - cudnnTypeWrapper::kZero(), - data_desc_, - Y_data)); - return true; - } - const auto& X1 = Input(2); - CAFFE_ENFORCE( - !IsInputOutputAlias(2, 0), - "Input #2 is the same as output. If you want to do in-place updates, " - "put the output as input #0."); - const auto& weight1 = Input(3); - CAFFE_ENFORCE_EQ(X1.numel(), input_size); - CAFFE_ENFORCE_EQ(weight1.numel(), 1); - CopyWeightToHost(weight1.template data(), &alpha); - CopyWeightToHost(weight0.template data(), &beta); - if (IsInputOutputAlias(0, 0)) { - CUDNN_ENFORCE(cudnnAddTensor( - cudnn_wrapper_.inline_cudnn_handle(), - &alpha, - data_desc_, - X1.template data(), - &beta, - data_desc_, - Y_data)); - } else { - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - add_desc_, - &alpha, - data_desc_, - X1.template data(), - &beta, - data_desc_, - X0.template data(), - cudnnTypeWrapper::kZero(), - data_desc_, - Y_data)); - } - for (int i = 4; i < num_inputs; i += 2) { - const auto& Xi = Input(i); - // Do a check: if the input is the same as output, we have a problem - - // in-place update should always only happen with the zeroth input. - const std::string err_msg = "Input #" + to_string(i) + - " is the same as output. If you want to do in-place updates, " - "put the output as input #0."; - CAFFE_ENFORCE(!IsInputOutputAlias(i, 0), err_msg); - const auto& weighti = Input(i + 1); - CAFFE_ENFORCE_EQ(Xi.numel(), input_size); - CAFFE_ENFORCE_EQ(weighti.numel(), 1); - CopyWeightToHost(weighti.template data(), &alpha); - CUDNN_ENFORCE(cudnnAddTensor( - cudnn_wrapper_.inline_cudnn_handle(), - &alpha, - data_desc_, - Xi.template data(), - cudnnTypeWrapper::kOne(), - data_desc_, - Y_data)); - } - return true; - } - - private: - void SetTensorDescriptor( - const cudnnDataType_t data_type, - const int input_size) { - if (cached_input_size_ != input_size) { - cached_input_size_ = input_size; - // Since the best performance is obtained when the tesor is HW-packed, we - // put X.size() to W. 
- CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - data_desc_, - GetCudnnTensorFormat(StorageOrder::NCHW), - data_type, - 1, - 1, - 1, - input_size)); - } - } - - template - void CopyWeightToHost(const float* src, T* dst); - - CuDNNWrapper cudnn_wrapper_; - cudnnTensorDescriptor_t data_desc_; - cudnnOpTensorDescriptor_t add_desc_; - - int cached_input_size_ = 0; -}; - -template -void CuDNNWeightedSumOp::CopyWeightToHost(const float* src, T* dst) { - float val; - context_.template CopyToCPU(1, src, &val); - *dst = convert::To(val); -} - -template <> -void CuDNNWeightedSumOp::CopyWeightToHost(const float* src, float* dst) { - context_.CopyToCPU(1, src, dst); -} - -REGISTER_CUDNN_OPERATOR(WeightedSum, CuDNNWeightedSumOp); - -} // namespace caffe2 diff --git a/caffe2/perfkernels/typed_axpy.cc b/caffe2/perfkernels/typed_axpy.cc index 2ca219a..5656977 100644 --- a/caffe2/perfkernels/typed_axpy.cc +++ b/caffe2/perfkernels/typed_axpy.cc @@ -10,7 +10,7 @@ template <> void TypedAxpy(int N, const float a, const float* x, float* y) { // This uses a hack that axpy implementation actually does not use the // CPUContext, so passing in a nullpointer works. - math::Axpy(N, a, x, y, nullptr); + math::Axpy(N, a, x, y, nullptr); } void TypedAxpyHalffloat__base( diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h index 0ce41f1..4ad285d 100644 --- a/caffe2/utils/math.h +++ b/caffe2/utils/math.h @@ -323,35 +323,6 @@ CAFFE2_API void Select( T* y, Context* context); -template -CAFFE2_API void -Axpy(const int N, const float alpha, const T* x, T* y, Context* context); - -// Different from the Axpy function above, if alpha is passed in -// as a pointer, we will assume that it lives on the Context device, -// for example on GPU. -template -CAFFE2_API void -Axpy(const int N, const float* alpha, const T* x, T* y, Context* context); - -template -CAFFE2_API void Axpby( - const int N, - const TCoeff alpha, - const TData* x, - const TCoeff b, - TData* y, - Context* context); - -template -CAFFE2_API void Axpby( - const int N, - const TCoeff* alpha, - const TData* x, - const TCoeff* b, - TData* y, - Context* context); - // groups must be 1 for GPU // For NHWC order with groups > 1, the result will be layout in // NHW G RS C/G order to make data within the same group to be contiguous. 
@@ -488,7 +459,6 @@ CAFFE2_API void CopyMatrix( template CAFFE2_API void CopyVector(const int N, const T* A, T* B, Context* context); - } // namespace math } // namespace caffe2 diff --git a/caffe2/utils/math/elementwise.cc b/caffe2/utils/math/elementwise.cc index 7e56c26..c18f35b 100644 --- a/caffe2/utils/math/elementwise.cc +++ b/caffe2/utils/math/elementwise.cc @@ -114,6 +114,32 @@ DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv) #undef DELEGATE_SIMPLE_BINARY_FUNCTION +#define DELEGATE_AXPBY(TAlpha, TData, MKLFunc) \ + template <> \ + C10_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + const TAlpha beta, \ + TData* Y, \ + CPUContext* /* context */) { \ + MKLFunc( \ + N, static_cast(alpha), X, 1, static_cast(beta), Y, 1); \ + } \ + template <> \ + C10_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + const TAlpha* beta, \ + TData* Y, \ + CPUContext* /* context */) { \ + MKLFunc( \ + N, static_cast(*alpha), X, 1, static_cast(*beta), Y, 1); \ + } +DELEGATE_AXPBY(float, float, cblas_saxpby) +#undef DELEGATE_AXPBY + #else // CAFFE2_USE_MKL #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \ @@ -231,6 +257,34 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Div, /) DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Div, /) #undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR +#define CAFFE2_SPECIALIZED_AXPBY(TAlpha, TData) \ + template <> \ + C10_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + const TAlpha beta, \ + TData* Y, \ + CPUContext* /* context */) { \ + EigenVectorArrayMap Y_arr(Y, N); \ + Y_arr = Y_arr * static_cast(beta) + \ + ConstEigenVectorArrayMap(X, N) * static_cast(alpha); \ + } \ + template <> \ + C10_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + const TAlpha* beta, \ + TData* Y, \ + CPUContext* /* context */) { \ + EigenVectorArrayMap Y_arr(Y, N); \ + Y_arr = Y_arr * static_cast(*beta) + \ + ConstEigenVectorArrayMap(X, N) * static_cast(*alpha); \ + } +CAFFE2_SPECIALIZED_AXPBY(float, float) +#undef CAFFE2_SPECIALIZED_AXPBY + #endif // CAFFE2_USE_MKL //////////////////////////////////////////////////////////////////////////////// @@ -275,6 +329,30 @@ CAFFE2_SPECIALIZED_SCALE(double, double) CAFFE2_SPECIALIZED_SCALE(float, double) #undef CAFFE2_SPECIALIZED_SCALE +#define CAFFE2_SPECIALIZED_AXPY(TAlpha, TData) \ + template <> \ + C10_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + TData* Y, \ + CPUContext* /* context */) { \ + EigenVectorArrayMap(Y, N) += \ + ConstEigenVectorArrayMap(X, N) * static_cast(alpha); \ + } \ + template <> \ + C10_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + TData* Y, \ + CPUContext* /* context */) { \ + EigenVectorArrayMap(Y, N) += \ + ConstEigenVectorArrayMap(X, N) * static_cast(*alpha); \ + } +CAFFE2_SPECIALIZED_AXPY(float, float) +#undef CAFFE2_SPECIALIZED_AXPY + #else // CAFFE2_USE_EIGEN_FOR_BLAS #ifdef CAFFE2_USE_MKL @@ -349,6 +427,28 @@ DELEGATE_SCALE(float, double, cblas_dscal) #endif // CAFFE2_USE_MKL +#define DELEGATE_AXPY(TAlpha, TData, BLASFunc) \ + template <> \ + C10_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + TData* Y, \ + CPUContext* /* context */) { \ + BLASFunc(N, static_cast(alpha), X, 1, Y, 1); \ + } \ + template <> \ + C10_EXPORT 
void Axpy( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + TData* Y, \ + CPUContext* /* context */) { \ + BLASFunc(N, static_cast(*alpha), X, 1, Y, 1); \ + } +DELEGATE_AXPY(float, float, cblas_saxpy) +#undef DELEGATE_AXPY + #endif // CAFFE2_USE_EIGEN_FOR_BLAS //////////////////////////////////////////////////////////////////////////////// diff --git a/caffe2/utils/math/elementwise.cu b/caffe2/utils/math/elementwise.cu index 7509fb2..9b321a5 100644 --- a/caffe2/utils/math/elementwise.cu +++ b/caffe2/utils/math/elementwise.cu @@ -29,6 +29,188 @@ __global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) { } } +#ifdef __HIP_PLATFORM_HCC__ + +template +__global__ void AxpyCUDAKernel( + const std::int64_t N, + const TAlpha alpha, + const TData* X, + TData* Y) { + const std::int64_t index = static_cast(blockIdx.x) * + static_cast(CAFFE_CUDA_NUM_THREADS) + + static_cast(threadIdx.x); + if (index < N) { + Y[index] += static_cast(alpha) * __ldg(X + index); + } +} + +template +__global__ void AxpyCUDAKernel( + const std::int64_t N, + const TAlpha* alpha, + const TData* X, + TData* Y) { + __shared__ TData a; + if (threadIdx.x == 0) { + a = static_cast(__ldg(alpha)); + } + __syncthreads(); + const std::int64_t index = static_cast(blockIdx.x) * + static_cast(CAFFE_CUDA_NUM_THREADS) + + static_cast(threadIdx.x); + if (index < N) { + Y[index] += a * __ldg(X + index); + } +} + +#define DELEGATE_HALF_AXPY_CUDA_KERNEL(TAlpha, FMAFunc) \ + template <> \ + __global__ void AxpyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const at::Half* X, \ + at::Half* Y) { \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = convert::To(FMAFunc( \ + alpha, \ + convert::To(X[index]), \ + convert::To(Y[index]))); \ + } \ + } \ + template <> \ + __global__ void AxpyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const at::Half* X, \ + at::Half* Y) { \ + __shared__ TAlpha a; \ + if (threadIdx.x == 0) { \ + a = __ldg(alpha); \ + } \ + __syncthreads(); \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = convert::To(FMAFunc( \ + a, \ + convert::To(X[index]), \ + convert::To(Y[index]))); \ + } \ + } +DELEGATE_HALF_AXPY_CUDA_KERNEL(float, fmaf) +#undef DELEGATE_HALF_AXPY_CUDA_KERNEL + +#endif // __HIP_PLATFORM_HCC__ + +template +__global__ void AxpbyCUDAKernel( + const std::int64_t N, + const TAlpha alpha, + const TData* X, + const TAlpha beta, + TData* Y); + +template +__global__ void AxpbyCUDAKernel( + const std::int64_t N, + const TAlpha* alpha, + const TData* X, + const TAlpha* beta, + TData* Y); + +#define DELEGATE_AXPBY_CUDA_KERNEL(TAlpha, TData, FMAFunc) \ + template <> \ + __global__ void AxpbyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + const TAlpha beta, \ + TData* Y) { \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = FMAFunc( \ + static_cast(alpha), \ + X[index], \ + static_cast(beta) * Y[index]); \ + } \ + } \ + template <> \ + __global__ void AxpbyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + const TAlpha* beta, \ + TData* Y) { \ + __shared__ TData a; \ + __shared__ TData b; \ + if (threadIdx.x == 0) { \ + a = 
static_cast(*alpha); \ + b = static_cast(*beta); \ + } \ + __syncthreads(); \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = FMAFunc(a, X[index], b * Y[index]); \ + } \ + } +DELEGATE_AXPBY_CUDA_KERNEL(float, float, fmaf) +DELEGATE_AXPBY_CUDA_KERNEL(float, double, fma) +#undef DELEGATE_AXPBY_CUDA_KERNEL + +#define DELEGATE_HALF_AXPBY_CUDA_KERNEL(TAlpha, FMAFunc) \ + template <> \ + __global__ void AxpbyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const at::Half* X, \ + const TAlpha beta, \ + at::Half* Y) { \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = convert::To(FMAFunc( \ + alpha, \ + convert::To(X[index]), \ + beta * convert::To(Y[index]))); \ + } \ + } \ + template <> \ + __global__ void AxpbyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const at::Half* X, \ + const TAlpha* beta, \ + at::Half* Y) { \ + __shared__ TAlpha a; \ + __shared__ TAlpha b; \ + if (threadIdx.x == 0) { \ + a = *alpha; \ + b = *beta; \ + } \ + __syncthreads(); \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = convert::To(FMAFunc( \ + a, \ + convert::To(X[index]), \ + b * convert::To(Y[index]))); \ + } \ + } +DELEGATE_HALF_AXPBY_CUDA_KERNEL(float, fmaf) +#undef DELEGATE_HALF_AXPBY_CUDA_KERNEL + } // namespace #define CAFFE2_SPECIALIZED_CUDA_SET(T) \ @@ -304,89 +486,80 @@ CAFFE2_SPECIALIZED_CUDA_HALF_SCALE(float) #else // __HIP_PLATFORM_HCC__ -#define DELEGATE_CUDA_HALF_SCALE_BY_CUBLAS_FUNCTION( \ - TAlpha, CuBLASFunc, kAlphaType, kExecutionType) \ - template <> \ - CAFFE2_CUDA_EXPORT void Scale( \ - const int N, \ - const TAlpha alpha, \ - const at::Half* X, \ - at::Half* Y, \ - CUDAContext* context) { \ - if (N == 0) { \ - return; \ - } \ - if (Y == X) { \ - CUBLAS_ENFORCE(cublasSetPointerMode( \ - context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ - CUBLAS_ENFORCE(cublasScalEx( \ - context->cublas_handle(), \ - N, \ - &alpha, \ - kAlphaType, \ - Y, \ - CUDA_R_16F, \ - 1, \ - kExecutionType)); \ - } else { \ - const float alpha_host = convert::To(alpha); \ - thrust::transform( \ - thrust::cuda::par.on(context->cuda_stream()), \ - X, \ - X + N, \ - Y, \ - [alpha_host] __device__(const at::Half x) { \ - return convert::To( \ - convert::To(x) * alpha_host); \ - }); \ - } \ - } \ - template <> \ - CAFFE2_CUDA_EXPORT void Scale( \ - const int N, \ - const TAlpha* alpha, \ - const at::Half* X, \ - at::Half* Y, \ - CUDAContext* context) { \ - if (N == 0) { \ - return; \ - } \ - if (Y == X) { \ - CUBLAS_ENFORCE(cublasSetPointerMode( \ - context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ - CUBLAS_ENFORCE(cublasScalEx( \ - context->cublas_handle(), \ - N, \ - alpha, \ - kAlphaType, \ - Y, \ - CUDA_R_16F, \ - 1, \ - kExecutionType)); \ - } else { \ - thrust::transform( \ - thrust::cuda::par.on(context->cuda_stream()), \ - X, \ - X + N, \ - Y, \ - [alpha] __device__(const at::Half x) { \ - return convert::To( \ - convert::To(x) * \ - convert::To(*alpha)); \ - }); \ - } \ +#define DELEGATE_CUDA_HALF_SCALE(TAlpha, kAlphaType, kExecutionType) \ + template <> \ + CAFFE2_CUDA_EXPORT void Scale( \ + const int N, \ + const TAlpha alpha, \ + const at::Half* X, \ + at::Half* Y, \ + CUDAContext* context) { \ + if (N == 0) { \ + return; \ + } 
\ + if (Y == X) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ + CUBLAS_ENFORCE(cublasScalEx( \ + context->cublas_handle(), \ + N, \ + &alpha, \ + kAlphaType, \ + Y, \ + CUDA_R_16F, \ + 1, \ + kExecutionType)); \ + } else { \ + const float alpha_host = convert::To(alpha); \ + thrust::transform( \ + thrust::cuda::par.on(context->cuda_stream()), \ + X, \ + X + N, \ + Y, \ + [alpha_host] __device__(const at::Half x) { \ + return convert::To( \ + convert::To(x) * alpha_host); \ + }); \ + } \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Scale( \ + const int N, \ + const TAlpha* alpha, \ + const at::Half* X, \ + at::Half* Y, \ + CUDAContext* context) { \ + if (N == 0) { \ + return; \ + } \ + if (Y == X) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ + CUBLAS_ENFORCE(cublasScalEx( \ + context->cublas_handle(), \ + N, \ + alpha, \ + kAlphaType, \ + Y, \ + CUDA_R_16F, \ + 1, \ + kExecutionType)); \ + } else { \ + thrust::transform( \ + thrust::cuda::par.on(context->cuda_stream()), \ + X, \ + X + N, \ + Y, \ + [alpha] __device__(const at::Half x) { \ + return convert::To( \ + convert::To(x) * \ + convert::To(*alpha)); \ + }); \ + } \ } -DELEGATE_CUDA_HALF_SCALE_BY_CUBLAS_FUNCTION( - at::Half, - cublasScalEx, - CUDA_R_16F, - CUDA_R_32F) -DELEGATE_CUDA_HALF_SCALE_BY_CUBLAS_FUNCTION( - float, - cublasScalEx, - CUDA_R_32F, - CUDA_R_32F) -#undef DELEGATE_CUDA_HALF_SCALE_BY_CUBLAS_FUNCTION +DELEGATE_CUDA_HALF_SCALE(at::Half, CUDA_R_16F, CUDA_R_32F) +DELEGATE_CUDA_HALF_SCALE(float, CUDA_R_32F, CUDA_R_32F) +#undef DELEGATE_CUDA_HALF_SCALE #endif // __HIP_PLATFORM_HCC__ @@ -572,5 +745,150 @@ DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION( thrust::greater_equal()) #undef DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION +#define DELEGATE_CUDA_AXPY(T, CuBLASFunc) \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const T alpha, \ + const T* X, \ + T* Y, \ + CUDAContext* context) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ + CUBLAS_ENFORCE( \ + CuBLASFunc(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const T* alpha, \ + const T* X, \ + T* Y, \ + CUDAContext* context) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \ + CUBLAS_ENFORCE( \ + cublasSaxpy(context->cublas_handle(), N, alpha, X, 1, Y, 1)); \ + } +DELEGATE_CUDA_AXPY(float, cublasSaxpy) +#undef DELEGATE_CUDA_AXPY + +#ifndef __HIP_PLATFORM_HCC__ + +#define DELEGATE_CUDA_AXPY_EX( \ + TAlpha, TData, kAlphaType, kDataType, kExecutionType) \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + TData* Y, \ + CUDAContext* context) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ + CUBLAS_ENFORCE(cublasAxpyEx( \ + context->cublas_handle(), \ + N, \ + &alpha, \ + kAlphaType, \ + X, \ + kDataType, \ + 1, \ + Y, \ + kDataType, \ + 1, \ + kExecutionType)); \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + TData* Y, \ + CUDAContext* context) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \ + CUBLAS_ENFORCE(cublasAxpyEx( \ + context->cublas_handle(), \ + N, \ + alpha, \ + kAlphaType, \ + X, 
\ + kDataType, \ + 1, \ + Y, \ + kDataType, \ + 1, \ + kExecutionType)); \ + } +DELEGATE_CUDA_AXPY_EX(float, double, CUDA_R_32F, CUDA_R_64F, CUDA_R_64F) +DELEGATE_CUDA_AXPY_EX(float, at::Half, CUDA_R_32F, CUDA_R_16F, CUDA_R_32F) +#undef DELEGATE_CUDA_AXPY_EX + +#else // __HIP_PLATFORM_HCC__ + +#define CAFFE2_SPECIALIZED_CUDA_AXPY(TAlpha, TData) \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + TData* Y, \ + CUDAContext* context) { \ + const std::int64_t M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \ + AxpyCUDAKernel \ + <<cuda_stream()>>>( \ + N, alpha, X, Y); \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + TData* Y, \ + CUDAContext* context) { \ + const std::int64_t M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \ + AxpyCUDAKernel \ + <<cuda_stream()>>>( \ + N, alpha, X, Y); \ + } +CAFFE2_SPECIALIZED_CUDA_AXPY(float, double) +CAFFE2_SPECIALIZED_CUDA_AXPY(float, at::Half) +#undef CAFFE2_SPECIALIZED_CUDA_AXPY + +#endif // __HIP_PLATFORM_HCC__ + +#define CAFFE2_SPECIALIZED_CUDA_AXPBY(TAlpha, TData) \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + const TAlpha beta, \ + TData* Y, \ + CUDAContext* context) { \ + const std::int64_t M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \ + AxpbyCUDAKernel \ + <<cuda_stream()>>>( \ + N, alpha, X, beta, Y); \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + const TAlpha* beta, \ + TData* Y, \ + CUDAContext* context) { \ + const std::int64_t M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \ + AxpbyCUDAKernel \ + <<cuda_stream()>>>( \ + N, alpha, X, beta, Y); \ + } +CAFFE2_SPECIALIZED_CUDA_AXPBY(float, float) +CAFFE2_SPECIALIZED_CUDA_AXPBY(float, double) +CAFFE2_SPECIALIZED_CUDA_AXPBY(float, at::Half) +#undef CAFFE2_SPECIALIZED_CUDA_AXPBY + } // namespace math } // namespace caffe2 diff --git a/caffe2/utils/math/elementwise.h b/caffe2/utils/math/elementwise.h index 904853c..0655dc6 100644 --- a/caffe2/utils/math/elementwise.h +++ b/caffe2/utils/math/elementwise.h @@ -113,6 +113,39 @@ CAFFE2_API void GT(int N, const T* A, const T* B, bool* C, Context* context); template CAFFE2_API void GE(int N, const T* A, const T* B, bool* C, Context* context); +template +CAFFE2_API void +Axpy(std::int64_t N, TAlpha alpha, const TData* X, TData* Y, Context* context); + +// Different from the Axpy function above, if alpha is passed in +// as a pointer, we will assume that it lives on the Context device, +// for example on GPU. 
+template +CAFFE2_API void Axpy( + std::int64_t N, + const TAlpha* alpha, + const TData* X, + TData* Y, + Context* context); + +template +CAFFE2_API void Axpby( + std::int64_t N, + TAlpha alpha, + const TData* X, + TAlpha beta, + TData* Y, + Context* context); + +template +CAFFE2_API void Axpby( + std::int64_t N, + const TAlpha* alpha, + const TData* X, + const TAlpha* beta, + TData* Y, + Context* context); + } // namespace math } // namespace caffe2 diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index f512d9e..7909eb6 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -252,46 +252,6 @@ C10_EXPORT void Gemv( CAFFE2_SPECIALIZED_DOT(float) #undef CAFFE2_SPECIALIZED_DOT -#define CAFFE2_SPECIALIZED_AXPY(T) \ - template <> \ - C10_EXPORT void Axpy( \ - const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \ - EigenVectorMap(Y, N) += ConstEigenVectorMap(x, N) * alpha; \ - } \ - template <> \ - C10_EXPORT void Axpy( \ - const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \ - EigenVectorMap(Y, N) += ConstEigenVectorMap(x, N) * (*alpha); \ - } -CAFFE2_SPECIALIZED_AXPY(float) -#undef CAFFE2_SPECIALIZED_AXPY - -#define CAFFE2_SPECIALIZED_AXPBY(T) \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T alpha, \ - const T* x, \ - const T beta, \ - T* y, \ - CPUContext* context) { \ - EigenVectorArrayMap y_arr(y, N); \ - y_arr = y_arr * beta + ConstEigenVectorArrayMap(x, N) * alpha; \ - } \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T* alpha, \ - const T* x, \ - const T* beta, \ - T* y, \ - CPUContext* context) { \ - EigenVectorArrayMap y_arr(y, N); \ - y_arr = y_arr * *beta + ConstEigenVectorArrayMap(x, N) * *alpha; \ - } -CAFFE2_SPECIALIZED_AXPBY(float) -#undef CAFFE2_SPECIALIZED_AXPBY - #else // CAFFE2_USE_EIGEN_FOR_BLAS template <> @@ -384,72 +344,6 @@ C10_EXPORT void Gemv( CAFFE2_SPECIALIZED_DOT(float, s) #undef CAFFE2_SPECIALIZED_DOT -#define CAFFE2_SPECIALIZED_AXPY(T, prefix) \ - template <> \ - C10_EXPORT void Axpy( \ - const int N, const T alpha, const T* x, T* y, CPUContext*) { \ - cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ - } \ - template <> \ - C10_EXPORT void Axpy( \ - const int N, const T* alpha, const T* x, T* y, CPUContext*) { \ - cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ - } -CAFFE2_SPECIALIZED_AXPY(float, s) -#undef CAFFE2_SPECIALIZED_AXPY - -// cblas_[sd]axpby is not a standard blas function, and if MKL is not present, -// we will need to implement it. 
-#ifdef CAFFE2_USE_MKL -#define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T alpha, \ - const T* x, \ - const T beta, \ - T* y, \ - CPUContext*) { \ - cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \ - } \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T* alpha, \ - const T* x, \ - const T* beta, \ - T* y, \ - CPUContext*) { \ - cblas_##prefix##axpby(N, *alpha, x, 1, *beta, y, 1); \ - } -#else // CAFFE2_USE_MKL -#define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T alpha, \ - const T* x, \ - const T beta, \ - T* y, \ - CPUContext*) { \ - cblas_##prefix##scal(N, beta, y, 1); \ - cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ - } \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T* alpha, \ - const T* x, \ - const T* beta, \ - T* y, \ - CPUContext*) { \ - cblas_##prefix##scal(N, *beta, y, 1); \ - cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ - } -#endif // CAFFE2_USE_MKL -CAFFE2_SPECIALIZED_AXPBY(float, s) -#undef CAFFE2_SPECIALIZED_AXPBY - #endif // CAFFE2_USE_EIGEN_FOR_BLAS template <> diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 71f05ab..a228527 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -1865,201 +1865,6 @@ CAFFE2_CUDA_EXPORT void Select( context->cuda_stream()>>>(N, D, x, idx, y); } -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float alpha, - const float* X, - float* Y, - CUDAContext* context) { - CUBLAS_ENFORCE( - cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); - CUBLAS_ENFORCE(cublasSaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); -} - -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float alpha, - const double* X, - double* Y, - CUDAContext* context) { - double alpha_d{alpha}; - CUBLAS_ENFORCE( - cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); - CUBLAS_ENFORCE( - cublasDaxpy(context->cublas_handle(), N, &alpha_d, X, 1, Y, 1)); -} - -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float alpha, - const at::Half* X, - at::Half* Y, - CUDAContext* context) { -#if defined(__HIP_PLATFORM_HCC__) - CAFFE_THROW("HIP currently does not support FP16 completely yet."); -#else - CUBLAS_ENFORCE( - cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); - CUBLAS_ENFORCE(cublasAxpyEx( - context->cublas_handle(), - N, - &alpha, - CUDA_R_32F, - X, - CUDA_R_16F, - 1, - Y, - CUDA_R_16F, - 1, - CUDA_R_32F)); -#endif -} - -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float* alpha, - const float* X, - float* Y, - CUDAContext* context) { - CUBLAS_ENFORCE(cublasSetPointerMode( - context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); - CUBLAS_ENFORCE(cublasSaxpy(context->cublas_handle(), N, alpha, X, 1, Y, 1)); -} - -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float* alpha, - const at::Half* X, - at::Half* Y, - CUDAContext* context) { -#if defined(__HIP_PLATFORM_HCC__) - CAFFE_THROW("HIP currently does not support FP16 completely yet."); -#else - CUBLAS_ENFORCE(cublasSetPointerMode( - context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); - CUBLAS_ENFORCE(cublasAxpyEx( - context->cublas_handle(), - N, - alpha, - CUDA_R_32F, - X, - CUDA_R_16F, - 1, - Y, - CUDA_R_16F, - 1, - CUDA_R_32F)); -#endif -} - -namespace { - -template -__global__ void AxpbyCUDAKernel( - const int N, - const TCoeff a, - const TData* x, - 
const TCoeff b, - TData* y) { - CUDA_1D_KERNEL_LOOP(i, N) { -#if __CUDA_ARCH__ >= 350 - y[i] = __ldg(x + i) * a + y[i] * b; -#else - y[i] = x[i] * a + y[i] * b; -#endif - } -} - -template <> -__global__ void AxpbyCUDAKernel( - const int N, - const float a, - const at::Half* x, - const float b, - at::Half* y) { - CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = convert::To( - convert::To(x[i]) * a + - convert::To(y[i]) * b); - } -} - -template -__global__ void AxpbyCUDAKernel( - const int N, - const TCoeff* a, - const TData* x, - const TCoeff* b, - TData* y) { - CUDA_1D_KERNEL_LOOP(i, N) { -#if __CUDA_ARCH__ >= 350 - y[i] = __ldg(x + i) * __ldg(a) + y[i] * __ldg(b); -#else - y[i] = x[i] * *a + y[i] * *b; -#endif - } -} - -template <> -__global__ void AxpbyCUDAKernel( - const int N, - const float* a, - const at::Half* x, - const float* b, - at::Half* y) { - CUDA_1D_KERNEL_LOOP(i, N) { -#if __CUDA_ARCH__ >= 350 - y[i] = convert::To( - convert::To(x[i]) * __ldg(a) + - convert::To(y[i]) * __ldg(b)); -#else - y[i] = convert::To( - convert::To(x[i]) * *a + - convert::To(y[i]) * *b); -#endif - } -} - -} // namespace - -#define CAFFE2_SPECIALIZED_CUDA_AXPBY(TCoeff, TData) \ - template <> \ - CAFFE2_CUDA_EXPORT void Axpby( \ - const int n, \ - const TCoeff a, \ - const TData* x, \ - const TCoeff b, \ - TData* y, \ - CUDAContext* context) { \ - AxpbyCUDAKernel \ - <<cuda_stream()>>>(n, a, x, b, y); \ - } \ - template <> \ - CAFFE2_CUDA_EXPORT void Axpby( \ - const int n, \ - const TCoeff* a, \ - const TData* x, \ - const TCoeff* b, \ - TData* y, \ - CUDAContext* context) { \ - AxpbyCUDAKernel \ - <<cuda_stream()>>>(n, a, x, b, y); \ - } -CAFFE2_SPECIALIZED_CUDA_AXPBY(float, float) -CAFFE2_SPECIALIZED_CUDA_AXPBY(float, at::Half) -#undef CAFFE2_SPECIALIZED_CUDA_AXPBY - namespace { template -- 2.7.4
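
Usage note (not part of the patch): a minimal sketch of calling the relocated Axpy/Axpby through the new caffe2/utils/math/elementwise.h header. It assumes a standard Caffe2 build exposing CPUContext, the float/float CPU specializations added above, and the (TAlpha, TData, Context) template-parameter order used by the new declarations; buffer names and values are illustrative only.

// Sketch only: assumes a Caffe2 build providing CPUContext and the
// float/float specializations of Axpy/Axpby added in
// caffe2/utils/math/elementwise.cc by this patch.
#include <cstdint>
#include <vector>

#include "caffe2/core/context.h"
#include "caffe2/utils/math/elementwise.h"

int main() {
  const std::int64_t N = 8;
  std::vector<float> X(N, 1.0f);  // input vector
  std::vector<float> Y(N, 2.0f);  // accumulator, updated in place
  caffe2::CPUContext context;

  // Y = 0.5f * X + Y via the relocated math::Axpy.
  caffe2::math::Axpy<float, float, caffe2::CPUContext>(
      N, 0.5f, X.data(), Y.data(), &context);

  // Y = 0.5f * X + 2.0f * Y via the relocated math::Axpby.
  caffe2::math::Axpby<float, float, caffe2::CPUContext>(
      N, 0.5f, X.data(), 2.0f, Y.data(), &context);

  return 0;
}

Existing call sites (distance_op.h, local_response_normalization_op.cc, utility_ops.h, typed_axpy.cc) are updated in this patch to the relocated declarations, so downstream code that includes caffe2/utils/math.h should continue to build unchanged.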