From 265fa0ce4d4a10feeeaf76c2445fdd1fe0080b2e Mon Sep 17 00:00:00 2001 From: Xiaomeng Yang Date: Tue, 26 Mar 2019 12:13:51 -0700 Subject: [PATCH] Move math::Axpy function to elementwise lib (#18316) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/18316 Move math::Axpy function to elementwise lib i-am-not-moving-c2-to-c10 Reviewed By: houseroad Differential Revision: D14574697 fbshipit-source-id: 7cfbb2da295c8966c5328bd6b577cce2638eea62 --- caffe2/operators/distance_op.h | 3 +- .../operators/local_response_normalization_op.cc | 59 ++- caffe2/operators/utility_ops.h | 2 +- caffe2/operators/utility_ops_cudnn.cc | 169 -------- caffe2/perfkernels/typed_axpy.cc | 2 +- caffe2/utils/math.h | 30 -- caffe2/utils/math/elementwise.cc | 100 +++++ caffe2/utils/math/elementwise.cu | 482 +++++++++++++++++---- caffe2/utils/math/elementwise.h | 33 ++ caffe2/utils/math_cpu.cc | 106 ----- caffe2/utils/math_gpu.cu | 195 --------- 11 files changed, 575 insertions(+), 606 deletions(-) delete mode 100644 caffe2/operators/utility_ops_cudnn.cc diff --git a/caffe2/operators/distance_op.h b/caffe2/operators/distance_op.h index a246209..11b43b6 100644 --- a/caffe2/operators/distance_op.h +++ b/caffe2/operators/distance_op.h @@ -259,7 +259,8 @@ class DotProductWithPaddingGradientOp final : public Operator { DS, dDot_data[i], S_data, dL_data + j * DS, &context_); math::Scale( DS, dDot_data[i], L_data + j * DS, tmp_data.data(), &context_); - math::Axpy(DS, 1.0, tmp_data.data(), dS_data, &context_); + math::Axpy( + DS, 1.0, tmp_data.data(), dS_data, &context_); } } else { math::Scale( diff --git a/caffe2/operators/local_response_normalization_op.cc b/caffe2/operators/local_response_normalization_op.cc index e6b079b..0f21f2a 100644 --- a/caffe2/operators/local_response_normalization_op.cc +++ b/caffe2/operators/local_response_normalization_op.cc @@ -40,9 +40,12 @@ bool LRNOp::RunOnDeviceWithOrderNCHW() { &context_); // Create the first channel scale for (int c = 0; c < size_; ++c) { - math::Axpy( - H * W, alpha_over_size, padded_square_data + c * H * W, - scale_data + image_size * n, &context_); + math::Axpy( + H * W, + alpha_over_size, + padded_square_data + c * H * W, + scale_data + image_size * n, + &context_); } for (int c = 1; c < C; ++c) { float* this_scale_slice = scale_data + n * image_size + c * H * W; @@ -50,13 +53,19 @@ bool LRNOp::RunOnDeviceWithOrderNCHW() { context_.CopyFromCPU( H * W, this_scale_slice - H * W, this_scale_slice); // add head - math::Axpy( - H * W, alpha_over_size, padded_square_data + (c + size_ - 1) * H * W, - this_scale_slice, &context_); + math::Axpy( + H * W, + alpha_over_size, + padded_square_data + (c + size_ - 1) * H * W, + this_scale_slice, + &context_); // subtract tail - math::Axpy( - H * W, -alpha_over_size, padded_square_data + (c - 1) * H * W, - this_scale_slice, &context_); + math::Axpy( + H * W, + -alpha_over_size, + padded_square_data + (c - 1) * H * W, + this_scale_slice, + &context_); } } math::Powx( @@ -161,9 +170,12 @@ bool LRNGradientOp::RunOnDeviceWithOrderNCHW() { &context_); // Create the first channel scale for (int c = 0; c < size_; ++c) { - math::Axpy( - H * W, alpha_over_size, padded_ratio_data + c * H * W, - scale_data + image_size * n, &context_); + math::Axpy( + H * W, + alpha_over_size, + padded_ratio_data + c * H * W, + scale_data + image_size * n, + &context_); } for (int c = 1; c < C; ++c) { float* this_scale_slice = scale_data + n * image_size + c * H * W; @@ -171,13 +183,19 @@ bool LRNGradientOp::RunOnDeviceWithOrderNCHW() { 
context_.CopyFromCPU( H * W, this_scale_slice - H * W, this_scale_slice); // add head - math::Axpy( - H * W, alpha_over_size, padded_ratio_data + (c + size_ - 1) * H * W, - this_scale_slice, &context_); + math::Axpy( + H * W, + alpha_over_size, + padded_ratio_data + (c + size_ - 1) * H * W, + this_scale_slice, + &context_); // subtract tail - math::Axpy( - H * W, -alpha_over_size, padded_ratio_data + (c - 1) * H * W, - this_scale_slice, &context_); + math::Axpy( + H * W, + -alpha_over_size, + padded_ratio_data + (c - 1) * H * W, + this_scale_slice, + &context_); } } @@ -203,9 +221,8 @@ bool LRNGradientOp::RunOnDeviceWithOrderNCHW() { math::Set( accum_ratio.numel(), 0., accum_ratio_data, &context_); for (int c = 0; c < size_ - 1; ++c) { - math::Axpy(H * W, 1, - padded_ratio_data + c * H * W, - accum_ratio_data, &context_); + math::Axpy( + H * W, 1, padded_ratio_data + c * H * W, accum_ratio_data, &context_); } for (int c = 0; c < C; ++c) { for (int hw = 0; hw < H * W; ++hw) { diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index 0786f8a..2b38d1b 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -414,7 +414,7 @@ class WeightedSumOp : public Operator { const auto& weighti = Input(i + 1); CAFFE_ENFORCE_EQ(Xi.numel(), size); CAFFE_ENFORCE_EQ(weighti.numel(), 1); - math::Axpy( + math::Axpy( size, weighti.template data(), Xi.template data(), diff --git a/caffe2/operators/utility_ops_cudnn.cc b/caffe2/operators/utility_ops_cudnn.cc deleted file mode 100644 index c04ad32..0000000 --- a/caffe2/operators/utility_ops_cudnn.cc +++ /dev/null @@ -1,169 +0,0 @@ -#include "caffe2/operators/utility_ops.h" - -#include - -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/cudnn_wrappers.h" -#include "caffe2/utils/conversions.h" - -namespace caffe2 { - -class CuDNNWeightedSumOp : public Operator { - public: - USE_OPERATOR_FUNCTIONS(CUDAContext); - - template - explicit CuDNNWeightedSumOp(Args&&... args) - : Operator(std::forward(args)...), - cudnn_wrapper_(&context_) { - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_)); - CUDNN_ENFORCE(cudnnCreateOpTensorDescriptor(&add_desc_)); - // Both float and at::Half require opTensorCompType to be CUDNN_DATA_FLOAT. 
- CUDNN_ENFORCE(cudnnSetOpTensorDescriptor( - add_desc_, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN)); - } - - ~CuDNNWeightedSumOp() override { - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_)); - CUDNN_ENFORCE(cudnnDestroyOpTensorDescriptor(add_desc_)); - } - - bool RunOnDevice() override { - return DispatchHelper>::call(this, Input(0)); - } - - template - bool DoRunWithType() { - if (std::is_same::value) { - LOG(WARNING) - << "CuDNN only support same type for data and weight, " - "so the weight will be cast to at::Half when data type is Half."; - } - const int num_inputs = InputSize(); - CAFFE_ENFORCE_EQ(num_inputs % 2, 0); - const auto& X0 = Input(0); - const auto& weight0 = Input(1); - CAFFE_ENFORCE_GT(X0.numel(), 0); - CAFFE_ENFORCE_EQ(weight0.numel(), 1); - const int input_size = X0.numel(); - SetTensorDescriptor(cudnnTypeWrapper::type, input_size); - - // Note: removed Aliasing check, since Output already has - // caching capability - auto* Y = Output(0, X0.sizes(), at::dtype()); - T* Y_data = Y->template mutable_data(); - T alpha = convert::To(0.0f); - T beta = convert::To(0.0f); - if (num_inputs == 2) { - CopyWeightToHost(weight0.template data(), &alpha); - CUDNN_ENFORCE(cudnnAddTensor( - cudnn_wrapper_.inline_cudnn_handle(), - &alpha, - data_desc_, - X0.template data(), - cudnnTypeWrapper::kZero(), - data_desc_, - Y_data)); - return true; - } - const auto& X1 = Input(2); - CAFFE_ENFORCE( - !IsInputOutputAlias(2, 0), - "Input #2 is the same as output. If you want to do in-place updates, " - "put the output as input #0."); - const auto& weight1 = Input(3); - CAFFE_ENFORCE_EQ(X1.numel(), input_size); - CAFFE_ENFORCE_EQ(weight1.numel(), 1); - CopyWeightToHost(weight1.template data(), &alpha); - CopyWeightToHost(weight0.template data(), &beta); - if (IsInputOutputAlias(0, 0)) { - CUDNN_ENFORCE(cudnnAddTensor( - cudnn_wrapper_.inline_cudnn_handle(), - &alpha, - data_desc_, - X1.template data(), - &beta, - data_desc_, - Y_data)); - } else { - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - add_desc_, - &alpha, - data_desc_, - X1.template data(), - &beta, - data_desc_, - X0.template data(), - cudnnTypeWrapper::kZero(), - data_desc_, - Y_data)); - } - for (int i = 4; i < num_inputs; i += 2) { - const auto& Xi = Input(i); - // Do a check: if the input is the same as output, we have a problem - - // in-place update should always only happen with the zeroth input. - const std::string err_msg = "Input #" + to_string(i) + - " is the same as output. If you want to do in-place updates, " - "put the output as input #0."; - CAFFE_ENFORCE(!IsInputOutputAlias(i, 0), err_msg); - const auto& weighti = Input(i + 1); - CAFFE_ENFORCE_EQ(Xi.numel(), input_size); - CAFFE_ENFORCE_EQ(weighti.numel(), 1); - CopyWeightToHost(weighti.template data(), &alpha); - CUDNN_ENFORCE(cudnnAddTensor( - cudnn_wrapper_.inline_cudnn_handle(), - &alpha, - data_desc_, - Xi.template data(), - cudnnTypeWrapper::kOne(), - data_desc_, - Y_data)); - } - return true; - } - - private: - void SetTensorDescriptor( - const cudnnDataType_t data_type, - const int input_size) { - if (cached_input_size_ != input_size) { - cached_input_size_ = input_size; - // Since the best performance is obtained when the tesor is HW-packed, we - // put X.size() to W. 
- CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - data_desc_, - GetCudnnTensorFormat(StorageOrder::NCHW), - data_type, - 1, - 1, - 1, - input_size)); - } - } - - template - void CopyWeightToHost(const float* src, T* dst); - - CuDNNWrapper cudnn_wrapper_; - cudnnTensorDescriptor_t data_desc_; - cudnnOpTensorDescriptor_t add_desc_; - - int cached_input_size_ = 0; -}; - -template -void CuDNNWeightedSumOp::CopyWeightToHost(const float* src, T* dst) { - float val; - context_.template CopyToCPU(1, src, &val); - *dst = convert::To(val); -} - -template <> -void CuDNNWeightedSumOp::CopyWeightToHost(const float* src, float* dst) { - context_.CopyToCPU(1, src, dst); -} - -REGISTER_CUDNN_OPERATOR(WeightedSum, CuDNNWeightedSumOp); - -} // namespace caffe2 diff --git a/caffe2/perfkernels/typed_axpy.cc b/caffe2/perfkernels/typed_axpy.cc index 2ca219a..5656977 100644 --- a/caffe2/perfkernels/typed_axpy.cc +++ b/caffe2/perfkernels/typed_axpy.cc @@ -10,7 +10,7 @@ template <> void TypedAxpy(int N, const float a, const float* x, float* y) { // This uses a hack that axpy implementation actually does not use the // CPUContext, so passing in a nullpointer works. - math::Axpy(N, a, x, y, nullptr); + math::Axpy(N, a, x, y, nullptr); } void TypedAxpyHalffloat__base( diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h index 0ce41f1..4ad285d 100644 --- a/caffe2/utils/math.h +++ b/caffe2/utils/math.h @@ -323,35 +323,6 @@ CAFFE2_API void Select( T* y, Context* context); -template -CAFFE2_API void -Axpy(const int N, const float alpha, const T* x, T* y, Context* context); - -// Different from the Axpy function above, if alpha is passed in -// as a pointer, we will assume that it lives on the Context device, -// for example on GPU. -template -CAFFE2_API void -Axpy(const int N, const float* alpha, const T* x, T* y, Context* context); - -template -CAFFE2_API void Axpby( - const int N, - const TCoeff alpha, - const TData* x, - const TCoeff b, - TData* y, - Context* context); - -template -CAFFE2_API void Axpby( - const int N, - const TCoeff* alpha, - const TData* x, - const TCoeff* b, - TData* y, - Context* context); - // groups must be 1 for GPU // For NHWC order with groups > 1, the result will be layout in // NHW G RS C/G order to make data within the same group to be contiguous. 
@@ -488,7 +459,6 @@ CAFFE2_API void CopyMatrix( template CAFFE2_API void CopyVector(const int N, const T* A, T* B, Context* context); - } // namespace math } // namespace caffe2 diff --git a/caffe2/utils/math/elementwise.cc b/caffe2/utils/math/elementwise.cc index 7e56c26..c18f35b 100644 --- a/caffe2/utils/math/elementwise.cc +++ b/caffe2/utils/math/elementwise.cc @@ -114,6 +114,32 @@ DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv) #undef DELEGATE_SIMPLE_BINARY_FUNCTION +#define DELEGATE_AXPBY(TAlpha, TData, MKLFunc) \ + template <> \ + C10_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + const TAlpha beta, \ + TData* Y, \ + CPUContext* /* context */) { \ + MKLFunc( \ + N, static_cast(alpha), X, 1, static_cast(beta), Y, 1); \ + } \ + template <> \ + C10_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + const TAlpha* beta, \ + TData* Y, \ + CPUContext* /* context */) { \ + MKLFunc( \ + N, static_cast(*alpha), X, 1, static_cast(*beta), Y, 1); \ + } +DELEGATE_AXPBY(float, float, cblas_saxpby) +#undef DELEGATE_AXPBY + #else // CAFFE2_USE_MKL #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \ @@ -231,6 +257,34 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Div, /) DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Div, /) #undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR +#define CAFFE2_SPECIALIZED_AXPBY(TAlpha, TData) \ + template <> \ + C10_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + const TAlpha beta, \ + TData* Y, \ + CPUContext* /* context */) { \ + EigenVectorArrayMap Y_arr(Y, N); \ + Y_arr = Y_arr * static_cast(beta) + \ + ConstEigenVectorArrayMap(X, N) * static_cast(alpha); \ + } \ + template <> \ + C10_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + const TAlpha* beta, \ + TData* Y, \ + CPUContext* /* context */) { \ + EigenVectorArrayMap Y_arr(Y, N); \ + Y_arr = Y_arr * static_cast(*beta) + \ + ConstEigenVectorArrayMap(X, N) * static_cast(*alpha); \ + } +CAFFE2_SPECIALIZED_AXPBY(float, float) +#undef CAFFE2_SPECIALIZED_AXPBY + #endif // CAFFE2_USE_MKL //////////////////////////////////////////////////////////////////////////////// @@ -275,6 +329,30 @@ CAFFE2_SPECIALIZED_SCALE(double, double) CAFFE2_SPECIALIZED_SCALE(float, double) #undef CAFFE2_SPECIALIZED_SCALE +#define CAFFE2_SPECIALIZED_AXPY(TAlpha, TData) \ + template <> \ + C10_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + TData* Y, \ + CPUContext* /* context */) { \ + EigenVectorArrayMap(Y, N) += \ + ConstEigenVectorArrayMap(X, N) * static_cast(alpha); \ + } \ + template <> \ + C10_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + TData* Y, \ + CPUContext* /* context */) { \ + EigenVectorArrayMap(Y, N) += \ + ConstEigenVectorArrayMap(X, N) * static_cast(*alpha); \ + } +CAFFE2_SPECIALIZED_AXPY(float, float) +#undef CAFFE2_SPECIALIZED_AXPY + #else // CAFFE2_USE_EIGEN_FOR_BLAS #ifdef CAFFE2_USE_MKL @@ -349,6 +427,28 @@ DELEGATE_SCALE(float, double, cblas_dscal) #endif // CAFFE2_USE_MKL +#define DELEGATE_AXPY(TAlpha, TData, BLASFunc) \ + template <> \ + C10_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + TData* Y, \ + CPUContext* /* context */) { \ + BLASFunc(N, static_cast(alpha), X, 1, Y, 1); \ + } \ + template <> \ + C10_EXPORT 
void Axpy( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + TData* Y, \ + CPUContext* /* context */) { \ + BLASFunc(N, static_cast(*alpha), X, 1, Y, 1); \ + } +DELEGATE_AXPY(float, float, cblas_saxpy) +#undef DELEGATE_AXPY + #endif // CAFFE2_USE_EIGEN_FOR_BLAS //////////////////////////////////////////////////////////////////////////////// diff --git a/caffe2/utils/math/elementwise.cu b/caffe2/utils/math/elementwise.cu index 7509fb2..9b321a5 100644 --- a/caffe2/utils/math/elementwise.cu +++ b/caffe2/utils/math/elementwise.cu @@ -29,6 +29,188 @@ __global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) { } } +#ifdef __HIP_PLATFORM_HCC__ + +template +__global__ void AxpyCUDAKernel( + const std::int64_t N, + const TAlpha alpha, + const TData* X, + TData* Y) { + const std::int64_t index = static_cast(blockIdx.x) * + static_cast(CAFFE_CUDA_NUM_THREADS) + + static_cast(threadIdx.x); + if (index < N) { + Y[index] += static_cast(alpha) * __ldg(X + index); + } +} + +template +__global__ void AxpyCUDAKernel( + const std::int64_t N, + const TAlpha* alpha, + const TData* X, + TData* Y) { + __shared__ TData a; + if (threadIdx.x == 0) { + a = static_cast(__ldg(alpha)); + } + __syncthreads(); + const std::int64_t index = static_cast(blockIdx.x) * + static_cast(CAFFE_CUDA_NUM_THREADS) + + static_cast(threadIdx.x); + if (index < N) { + Y[index] += a * __ldg(X + index); + } +} + +#define DELEGATE_HALF_AXPY_CUDA_KERNEL(TAlpha, FMAFunc) \ + template <> \ + __global__ void AxpyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const at::Half* X, \ + at::Half* Y) { \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = convert::To(FMAFunc( \ + alpha, \ + convert::To(X[index]), \ + convert::To(Y[index]))); \ + } \ + } \ + template <> \ + __global__ void AxpyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const at::Half* X, \ + at::Half* Y) { \ + __shared__ TAlpha a; \ + if (threadIdx.x == 0) { \ + a = __ldg(alpha); \ + } \ + __syncthreads(); \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = convert::To(FMAFunc( \ + a, \ + convert::To(X[index]), \ + convert::To(Y[index]))); \ + } \ + } +DELEGATE_HALF_AXPY_CUDA_KERNEL(float, fmaf) +#undef DELEGATE_HALF_AXPY_CUDA_KERNEL + +#endif // __HIP_PLATFORM_HCC__ + +template +__global__ void AxpbyCUDAKernel( + const std::int64_t N, + const TAlpha alpha, + const TData* X, + const TAlpha beta, + TData* Y); + +template +__global__ void AxpbyCUDAKernel( + const std::int64_t N, + const TAlpha* alpha, + const TData* X, + const TAlpha* beta, + TData* Y); + +#define DELEGATE_AXPBY_CUDA_KERNEL(TAlpha, TData, FMAFunc) \ + template <> \ + __global__ void AxpbyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + const TAlpha beta, \ + TData* Y) { \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = FMAFunc( \ + static_cast(alpha), \ + X[index], \ + static_cast(beta) * Y[index]); \ + } \ + } \ + template <> \ + __global__ void AxpbyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + const TAlpha* beta, \ + TData* Y) { \ + __shared__ TData a; \ + __shared__ TData b; \ + if (threadIdx.x == 0) { \ + a = 
static_cast(*alpha); \ + b = static_cast(*beta); \ + } \ + __syncthreads(); \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = FMAFunc(a, X[index], b * Y[index]); \ + } \ + } +DELEGATE_AXPBY_CUDA_KERNEL(float, float, fmaf) +DELEGATE_AXPBY_CUDA_KERNEL(float, double, fma) +#undef DELEGATE_AXPBY_CUDA_KERNEL + +#define DELEGATE_HALF_AXPBY_CUDA_KERNEL(TAlpha, FMAFunc) \ + template <> \ + __global__ void AxpbyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const at::Half* X, \ + const TAlpha beta, \ + at::Half* Y) { \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = convert::To(FMAFunc( \ + alpha, \ + convert::To(X[index]), \ + beta * convert::To(Y[index]))); \ + } \ + } \ + template <> \ + __global__ void AxpbyCUDAKernel( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const at::Half* X, \ + const TAlpha* beta, \ + at::Half* Y) { \ + __shared__ TAlpha a; \ + __shared__ TAlpha b; \ + if (threadIdx.x == 0) { \ + a = *alpha; \ + b = *beta; \ + } \ + __syncthreads(); \ + const std::int64_t index = static_cast(blockIdx.x) * \ + static_cast(CAFFE_CUDA_NUM_THREADS) + \ + static_cast(threadIdx.x); \ + if (index < N) { \ + Y[index] = convert::To(FMAFunc( \ + a, \ + convert::To(X[index]), \ + b * convert::To(Y[index]))); \ + } \ + } +DELEGATE_HALF_AXPBY_CUDA_KERNEL(float, fmaf) +#undef DELEGATE_HALF_AXPBY_CUDA_KERNEL + } // namespace #define CAFFE2_SPECIALIZED_CUDA_SET(T) \ @@ -304,89 +486,80 @@ CAFFE2_SPECIALIZED_CUDA_HALF_SCALE(float) #else // __HIP_PLATFORM_HCC__ -#define DELEGATE_CUDA_HALF_SCALE_BY_CUBLAS_FUNCTION( \ - TAlpha, CuBLASFunc, kAlphaType, kExecutionType) \ - template <> \ - CAFFE2_CUDA_EXPORT void Scale( \ - const int N, \ - const TAlpha alpha, \ - const at::Half* X, \ - at::Half* Y, \ - CUDAContext* context) { \ - if (N == 0) { \ - return; \ - } \ - if (Y == X) { \ - CUBLAS_ENFORCE(cublasSetPointerMode( \ - context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ - CUBLAS_ENFORCE(cublasScalEx( \ - context->cublas_handle(), \ - N, \ - &alpha, \ - kAlphaType, \ - Y, \ - CUDA_R_16F, \ - 1, \ - kExecutionType)); \ - } else { \ - const float alpha_host = convert::To(alpha); \ - thrust::transform( \ - thrust::cuda::par.on(context->cuda_stream()), \ - X, \ - X + N, \ - Y, \ - [alpha_host] __device__(const at::Half x) { \ - return convert::To( \ - convert::To(x) * alpha_host); \ - }); \ - } \ - } \ - template <> \ - CAFFE2_CUDA_EXPORT void Scale( \ - const int N, \ - const TAlpha* alpha, \ - const at::Half* X, \ - at::Half* Y, \ - CUDAContext* context) { \ - if (N == 0) { \ - return; \ - } \ - if (Y == X) { \ - CUBLAS_ENFORCE(cublasSetPointerMode( \ - context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ - CUBLAS_ENFORCE(cublasScalEx( \ - context->cublas_handle(), \ - N, \ - alpha, \ - kAlphaType, \ - Y, \ - CUDA_R_16F, \ - 1, \ - kExecutionType)); \ - } else { \ - thrust::transform( \ - thrust::cuda::par.on(context->cuda_stream()), \ - X, \ - X + N, \ - Y, \ - [alpha] __device__(const at::Half x) { \ - return convert::To( \ - convert::To(x) * \ - convert::To(*alpha)); \ - }); \ - } \ +#define DELEGATE_CUDA_HALF_SCALE(TAlpha, kAlphaType, kExecutionType) \ + template <> \ + CAFFE2_CUDA_EXPORT void Scale( \ + const int N, \ + const TAlpha alpha, \ + const at::Half* X, \ + at::Half* Y, \ + CUDAContext* context) { \ + if (N == 0) { \ + return; \ + } 
\ + if (Y == X) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ + CUBLAS_ENFORCE(cublasScalEx( \ + context->cublas_handle(), \ + N, \ + &alpha, \ + kAlphaType, \ + Y, \ + CUDA_R_16F, \ + 1, \ + kExecutionType)); \ + } else { \ + const float alpha_host = convert::To(alpha); \ + thrust::transform( \ + thrust::cuda::par.on(context->cuda_stream()), \ + X, \ + X + N, \ + Y, \ + [alpha_host] __device__(const at::Half x) { \ + return convert::To( \ + convert::To(x) * alpha_host); \ + }); \ + } \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Scale( \ + const int N, \ + const TAlpha* alpha, \ + const at::Half* X, \ + at::Half* Y, \ + CUDAContext* context) { \ + if (N == 0) { \ + return; \ + } \ + if (Y == X) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ + CUBLAS_ENFORCE(cublasScalEx( \ + context->cublas_handle(), \ + N, \ + alpha, \ + kAlphaType, \ + Y, \ + CUDA_R_16F, \ + 1, \ + kExecutionType)); \ + } else { \ + thrust::transform( \ + thrust::cuda::par.on(context->cuda_stream()), \ + X, \ + X + N, \ + Y, \ + [alpha] __device__(const at::Half x) { \ + return convert::To( \ + convert::To(x) * \ + convert::To(*alpha)); \ + }); \ + } \ } -DELEGATE_CUDA_HALF_SCALE_BY_CUBLAS_FUNCTION( - at::Half, - cublasScalEx, - CUDA_R_16F, - CUDA_R_32F) -DELEGATE_CUDA_HALF_SCALE_BY_CUBLAS_FUNCTION( - float, - cublasScalEx, - CUDA_R_32F, - CUDA_R_32F) -#undef DELEGATE_CUDA_HALF_SCALE_BY_CUBLAS_FUNCTION +DELEGATE_CUDA_HALF_SCALE(at::Half, CUDA_R_16F, CUDA_R_32F) +DELEGATE_CUDA_HALF_SCALE(float, CUDA_R_32F, CUDA_R_32F) +#undef DELEGATE_CUDA_HALF_SCALE #endif // __HIP_PLATFORM_HCC__ @@ -572,5 +745,150 @@ DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION( thrust::greater_equal()) #undef DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION +#define DELEGATE_CUDA_AXPY(T, CuBLASFunc) \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const T alpha, \ + const T* X, \ + T* Y, \ + CUDAContext* context) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ + CUBLAS_ENFORCE( \ + CuBLASFunc(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const T* alpha, \ + const T* X, \ + T* Y, \ + CUDAContext* context) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \ + CUBLAS_ENFORCE( \ + cublasSaxpy(context->cublas_handle(), N, alpha, X, 1, Y, 1)); \ + } +DELEGATE_CUDA_AXPY(float, cublasSaxpy) +#undef DELEGATE_CUDA_AXPY + +#ifndef __HIP_PLATFORM_HCC__ + +#define DELEGATE_CUDA_AXPY_EX( \ + TAlpha, TData, kAlphaType, kDataType, kExecutionType) \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + TData* Y, \ + CUDAContext* context) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ + CUBLAS_ENFORCE(cublasAxpyEx( \ + context->cublas_handle(), \ + N, \ + &alpha, \ + kAlphaType, \ + X, \ + kDataType, \ + 1, \ + Y, \ + kDataType, \ + 1, \ + kExecutionType)); \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + TData* Y, \ + CUDAContext* context) { \ + CUBLAS_ENFORCE(cublasSetPointerMode( \ + context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \ + CUBLAS_ENFORCE(cublasAxpyEx( \ + context->cublas_handle(), \ + N, \ + alpha, \ + kAlphaType, \ + X, 
\ + kDataType, \ + 1, \ + Y, \ + kDataType, \ + 1, \ + kExecutionType)); \ + } +DELEGATE_CUDA_AXPY_EX(float, double, CUDA_R_32F, CUDA_R_64F, CUDA_R_64F) +DELEGATE_CUDA_AXPY_EX(float, at::Half, CUDA_R_32F, CUDA_R_16F, CUDA_R_32F) +#undef DELEGATE_CUDA_AXPY_EX + +#else // __HIP_PLATFORM_HCC__ + +#define CAFFE2_SPECIALIZED_CUDA_AXPY(TAlpha, TData) \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + TData* Y, \ + CUDAContext* context) { \ + const std::int64_t M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \ + AxpyCUDAKernel \ + <<cuda_stream()>>>( \ + N, alpha, X, Y); \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpy( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + TData* Y, \ + CUDAContext* context) { \ + const std::int64_t M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \ + AxpyCUDAKernel \ + <<cuda_stream()>>>( \ + N, alpha, X, Y); \ + } +CAFFE2_SPECIALIZED_CUDA_AXPY(float, double) +CAFFE2_SPECIALIZED_CUDA_AXPY(float, at::Half) +#undef CAFFE2_SPECIALIZED_CUDA_AXPY + +#endif // __HIP_PLATFORM_HCC__ + +#define CAFFE2_SPECIALIZED_CUDA_AXPBY(TAlpha, TData) \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha alpha, \ + const TData* X, \ + const TAlpha beta, \ + TData* Y, \ + CUDAContext* context) { \ + const std::int64_t M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \ + AxpbyCUDAKernel \ + <<cuda_stream()>>>( \ + N, alpha, X, beta, Y); \ + } \ + template <> \ + CAFFE2_CUDA_EXPORT void Axpby( \ + const std::int64_t N, \ + const TAlpha* alpha, \ + const TData* X, \ + const TAlpha* beta, \ + TData* Y, \ + CUDAContext* context) { \ + const std::int64_t M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \ + AxpbyCUDAKernel \ + <<cuda_stream()>>>( \ + N, alpha, X, beta, Y); \ + } +CAFFE2_SPECIALIZED_CUDA_AXPBY(float, float) +CAFFE2_SPECIALIZED_CUDA_AXPBY(float, double) +CAFFE2_SPECIALIZED_CUDA_AXPBY(float, at::Half) +#undef CAFFE2_SPECIALIZED_CUDA_AXPBY + } // namespace math } // namespace caffe2 diff --git a/caffe2/utils/math/elementwise.h b/caffe2/utils/math/elementwise.h index 904853c..0655dc6 100644 --- a/caffe2/utils/math/elementwise.h +++ b/caffe2/utils/math/elementwise.h @@ -113,6 +113,39 @@ CAFFE2_API void GT(int N, const T* A, const T* B, bool* C, Context* context); template CAFFE2_API void GE(int N, const T* A, const T* B, bool* C, Context* context); +template +CAFFE2_API void +Axpy(std::int64_t N, TAlpha alpha, const TData* X, TData* Y, Context* context); + +// Different from the Axpy function above, if alpha is passed in +// as a pointer, we will assume that it lives on the Context device, +// for example on GPU. 
+template +CAFFE2_API void Axpy( + std::int64_t N, + const TAlpha* alpha, + const TData* X, + TData* Y, + Context* context); + +template +CAFFE2_API void Axpby( + std::int64_t N, + TAlpha alpha, + const TData* X, + TAlpha beta, + TData* Y, + Context* context); + +template +CAFFE2_API void Axpby( + std::int64_t N, + const TAlpha* alpha, + const TData* X, + const TAlpha* beta, + TData* Y, + Context* context); + } // namespace math } // namespace caffe2 diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index f512d9e..7909eb6 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -252,46 +252,6 @@ C10_EXPORT void Gemv( CAFFE2_SPECIALIZED_DOT(float) #undef CAFFE2_SPECIALIZED_DOT -#define CAFFE2_SPECIALIZED_AXPY(T) \ - template <> \ - C10_EXPORT void Axpy( \ - const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \ - EigenVectorMap(Y, N) += ConstEigenVectorMap(x, N) * alpha; \ - } \ - template <> \ - C10_EXPORT void Axpy( \ - const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \ - EigenVectorMap(Y, N) += ConstEigenVectorMap(x, N) * (*alpha); \ - } -CAFFE2_SPECIALIZED_AXPY(float) -#undef CAFFE2_SPECIALIZED_AXPY - -#define CAFFE2_SPECIALIZED_AXPBY(T) \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T alpha, \ - const T* x, \ - const T beta, \ - T* y, \ - CPUContext* context) { \ - EigenVectorArrayMap y_arr(y, N); \ - y_arr = y_arr * beta + ConstEigenVectorArrayMap(x, N) * alpha; \ - } \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T* alpha, \ - const T* x, \ - const T* beta, \ - T* y, \ - CPUContext* context) { \ - EigenVectorArrayMap y_arr(y, N); \ - y_arr = y_arr * *beta + ConstEigenVectorArrayMap(x, N) * *alpha; \ - } -CAFFE2_SPECIALIZED_AXPBY(float) -#undef CAFFE2_SPECIALIZED_AXPBY - #else // CAFFE2_USE_EIGEN_FOR_BLAS template <> @@ -384,72 +344,6 @@ C10_EXPORT void Gemv( CAFFE2_SPECIALIZED_DOT(float, s) #undef CAFFE2_SPECIALIZED_DOT -#define CAFFE2_SPECIALIZED_AXPY(T, prefix) \ - template <> \ - C10_EXPORT void Axpy( \ - const int N, const T alpha, const T* x, T* y, CPUContext*) { \ - cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ - } \ - template <> \ - C10_EXPORT void Axpy( \ - const int N, const T* alpha, const T* x, T* y, CPUContext*) { \ - cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ - } -CAFFE2_SPECIALIZED_AXPY(float, s) -#undef CAFFE2_SPECIALIZED_AXPY - -// cblas_[sd]axpby is not a standard blas function, and if MKL is not present, -// we will need to implement it. 
-#ifdef CAFFE2_USE_MKL -#define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T alpha, \ - const T* x, \ - const T beta, \ - T* y, \ - CPUContext*) { \ - cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \ - } \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T* alpha, \ - const T* x, \ - const T* beta, \ - T* y, \ - CPUContext*) { \ - cblas_##prefix##axpby(N, *alpha, x, 1, *beta, y, 1); \ - } -#else // CAFFE2_USE_MKL -#define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T alpha, \ - const T* x, \ - const T beta, \ - T* y, \ - CPUContext*) { \ - cblas_##prefix##scal(N, beta, y, 1); \ - cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ - } \ - template <> \ - C10_EXPORT void Axpby( \ - const int N, \ - const T* alpha, \ - const T* x, \ - const T* beta, \ - T* y, \ - CPUContext*) { \ - cblas_##prefix##scal(N, *beta, y, 1); \ - cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ - } -#endif // CAFFE2_USE_MKL -CAFFE2_SPECIALIZED_AXPBY(float, s) -#undef CAFFE2_SPECIALIZED_AXPBY - #endif // CAFFE2_USE_EIGEN_FOR_BLAS template <> diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 71f05ab..a228527 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -1865,201 +1865,6 @@ CAFFE2_CUDA_EXPORT void Select( context->cuda_stream()>>>(N, D, x, idx, y); } -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float alpha, - const float* X, - float* Y, - CUDAContext* context) { - CUBLAS_ENFORCE( - cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); - CUBLAS_ENFORCE(cublasSaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); -} - -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float alpha, - const double* X, - double* Y, - CUDAContext* context) { - double alpha_d{alpha}; - CUBLAS_ENFORCE( - cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); - CUBLAS_ENFORCE( - cublasDaxpy(context->cublas_handle(), N, &alpha_d, X, 1, Y, 1)); -} - -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float alpha, - const at::Half* X, - at::Half* Y, - CUDAContext* context) { -#if defined(__HIP_PLATFORM_HCC__) - CAFFE_THROW("HIP currently does not support FP16 completely yet."); -#else - CUBLAS_ENFORCE( - cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); - CUBLAS_ENFORCE(cublasAxpyEx( - context->cublas_handle(), - N, - &alpha, - CUDA_R_32F, - X, - CUDA_R_16F, - 1, - Y, - CUDA_R_16F, - 1, - CUDA_R_32F)); -#endif -} - -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float* alpha, - const float* X, - float* Y, - CUDAContext* context) { - CUBLAS_ENFORCE(cublasSetPointerMode( - context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); - CUBLAS_ENFORCE(cublasSaxpy(context->cublas_handle(), N, alpha, X, 1, Y, 1)); -} - -template <> -CAFFE2_CUDA_EXPORT void Axpy( - const int N, - const float* alpha, - const at::Half* X, - at::Half* Y, - CUDAContext* context) { -#if defined(__HIP_PLATFORM_HCC__) - CAFFE_THROW("HIP currently does not support FP16 completely yet."); -#else - CUBLAS_ENFORCE(cublasSetPointerMode( - context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); - CUBLAS_ENFORCE(cublasAxpyEx( - context->cublas_handle(), - N, - alpha, - CUDA_R_32F, - X, - CUDA_R_16F, - 1, - Y, - CUDA_R_16F, - 1, - CUDA_R_32F)); -#endif -} - -namespace { - -template -__global__ void AxpbyCUDAKernel( - const int N, - const TCoeff a, - const TData* x, - 
const TCoeff b, - TData* y) { - CUDA_1D_KERNEL_LOOP(i, N) { -#if __CUDA_ARCH__ >= 350 - y[i] = __ldg(x + i) * a + y[i] * b; -#else - y[i] = x[i] * a + y[i] * b; -#endif - } -} - -template <> -__global__ void AxpbyCUDAKernel( - const int N, - const float a, - const at::Half* x, - const float b, - at::Half* y) { - CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = convert::To( - convert::To(x[i]) * a + - convert::To(y[i]) * b); - } -} - -template -__global__ void AxpbyCUDAKernel( - const int N, - const TCoeff* a, - const TData* x, - const TCoeff* b, - TData* y) { - CUDA_1D_KERNEL_LOOP(i, N) { -#if __CUDA_ARCH__ >= 350 - y[i] = __ldg(x + i) * __ldg(a) + y[i] * __ldg(b); -#else - y[i] = x[i] * *a + y[i] * *b; -#endif - } -} - -template <> -__global__ void AxpbyCUDAKernel( - const int N, - const float* a, - const at::Half* x, - const float* b, - at::Half* y) { - CUDA_1D_KERNEL_LOOP(i, N) { -#if __CUDA_ARCH__ >= 350 - y[i] = convert::To( - convert::To(x[i]) * __ldg(a) + - convert::To(y[i]) * __ldg(b)); -#else - y[i] = convert::To( - convert::To(x[i]) * *a + - convert::To(y[i]) * *b); -#endif - } -} - -} // namespace - -#define CAFFE2_SPECIALIZED_CUDA_AXPBY(TCoeff, TData) \ - template <> \ - CAFFE2_CUDA_EXPORT void Axpby( \ - const int n, \ - const TCoeff a, \ - const TData* x, \ - const TCoeff b, \ - TData* y, \ - CUDAContext* context) { \ - AxpbyCUDAKernel \ - <<cuda_stream()>>>(n, a, x, b, y); \ - } \ - template <> \ - CAFFE2_CUDA_EXPORT void Axpby( \ - const int n, \ - const TCoeff* a, \ - const TData* x, \ - const TCoeff* b, \ - TData* y, \ - CUDAContext* context) { \ - AxpbyCUDAKernel \ - <<cuda_stream()>>>(n, a, x, b, y); \ - } -CAFFE2_SPECIALIZED_CUDA_AXPBY(float, float) -CAFFE2_SPECIALIZED_CUDA_AXPBY(float, at::Half) -#undef CAFFE2_SPECIALIZED_CUDA_AXPBY - namespace { template -- 2.7.4
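
Usage note (not part of the patch): a minimal sketch of calling the relocated Axpy/Axpby through the new caffe2/utils/math/elementwise.h header. It assumes a standard Caffe2 build exposing CPUContext, the float/float CPU specializations added above, and the (TAlpha, TData, Context) template-parameter order used by the new declarations; buffer names and values are illustrative only.

// Sketch only: assumes a Caffe2 build providing CPUContext and the
// float/float specializations of Axpy/Axpby added in
// caffe2/utils/math/elementwise.cc by this patch.
#include <cstdint>
#include <vector>

#include "caffe2/core/context.h"
#include "caffe2/utils/math/elementwise.h"

int main() {
  const std::int64_t N = 8;
  std::vector<float> X(N, 1.0f);  // input vector
  std::vector<float> Y(N, 2.0f);  // accumulator, updated in place
  caffe2::CPUContext context;

  // Y = 0.5f * X + Y via the relocated math::Axpy.
  caffe2::math::Axpy<float, float, caffe2::CPUContext>(
      N, 0.5f, X.data(), Y.data(), &context);

  // Y = 0.5f * X + 2.0f * Y via the relocated math::Axpby.
  caffe2::math::Axpby<float, float, caffe2::CPUContext>(
      N, 0.5f, X.data(), 2.0f, Y.data(), &context);

  return 0;
}

Existing call sites (distance_op.h, local_response_normalization_op.cc, utility_ops.h, typed_axpy.cc) are updated in this patch to the relocated declarations, so downstream code that includes caffe2/utils/math.h should continue to build unchanged.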