Update math::Transpose to support tensor with size > 2G (#17670)
authorXiaomeng Yang <yangxm@fb.com>
Thu, 21 Mar 2019 01:19:09 +0000 (18:19 -0700)
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>
Thu, 21 Mar 2019 01:22:21 +0000 (18:22 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17670

Update math::Transpose to support tensor with size > 2G

i-am-not-moving-c2-to-c10

Differential Revision: D14313624

fbshipit-source-id: 0b4a85b913972e5a8981f0d40d0c539407b98f30

17 files changed:
caffe2/operators/channel_shuffle_op.cc
caffe2/operators/channel_shuffle_op.cu
caffe2/operators/lstm_utils.h
caffe2/operators/transpose_op.cu
caffe2/operators/transpose_op.h
caffe2/operators/transpose_op_cudnn.cc
caffe2/utils/CMakeLists.txt
caffe2/utils/math.h
caffe2/utils/math/transpose.cc [new file with mode: 0644]
caffe2/utils/math/transpose.cu [new file with mode: 0644]
caffe2/utils/math/transpose.h [new file with mode: 0644]
caffe2/utils/math/utils.cc
caffe2/utils/math/utils.h
caffe2/utils/math_cpu.cc
caffe2/utils/math_gpu.cu
caffe2/utils/math_gpu_test.cc
caffe2/utils/math_test.cc

index c3e8f88..e8f13bf 100644 (file)
@@ -50,12 +50,13 @@ void RunChannelShuffleNHWC(
     const T* X,
     T* Y,
     CPUContext* context) {
-  const std::array<int, 2> dims = {G, K};
-  const std::array<int, 2> axes = {1, 0};
+  const std::array<std::int64_t, 2> dims = {G, K};
+  const std::array<std::int32_t, 2> axes = {1, 0};
   const int M = N * HxW;
   const int C = G * K;
   for (int i = 0; i < M; ++i) {
-    math::Transpose<T, CPUContext>(2, dims.data(), axes.data(), X, Y, context);
+    math::Transpose<std::int64_t, T, CPUContext>(
+        2, dims.data(), axes.data(), X, Y, context);
     X += C;
     Y += C;
   }
index 7f5e73f..2c5a4e5 100644 (file)
@@ -114,9 +114,9 @@ bool ChannelShuffleOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
         <<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
             G, K, X_data, Y_data);
   } else {
-    const std::array<int, 3> dims = {N * HxW, G, K};
-    const std::array<int, 3> axes = {0, 2, 1};
-    math::Transpose<float, CUDAContext>(
+    const std::array<std::int64_t, 3> dims = {N * HxW, G, K};
+    const std::array<std::int32_t, 3> axes = {0, 2, 1};
+    math::Transpose<std::int64_t, float, CUDAContext>(
         3, dims.data(), axes.data(), X_data, Y_data, &context_);
   }
   return true;
@@ -184,9 +184,9 @@ bool ChannelShuffleGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
         <<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
             K, G, dY_data, dX_data);
   } else {
-    const std::array<int, 3> dims = {N * HxW, K, G};
-    const std::array<int, 3> axes = {0, 2, 1};
-    math::Transpose<float, CUDAContext>(
+    const std::array<std::int64_t, 3> dims = {N * HxW, K, G};
+    const std::array<std::int32_t, 3> axes = {0, 2, 1};
+    math::Transpose<std::int64_t, float, CUDAContext>(
         3, dims.data(), axes.data(), dY_data, dX_data, &context_);
   }
   return true;
index b354764..0f56479 100644 (file)
@@ -299,13 +299,13 @@ Tensor transpose(const Tensor& X, int dim0, int dim1, CPUContext* context) {
   std::vector<int> axes(ndim);
   std::iota(axes.begin(), axes.end(), 0);
   std::swap(axes[dim0], axes[dim1]);
-  std::vector<int> Y_dims(ndim);
-  std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
+  const std::vector<std::int64_t> X_dims = X.sizes().vec();
+  std::vector<std::int64_t> Y_dims(ndim);
   for (int i = 0; i < ndim; ++i) {
     Y_dims[i] = X_dims[axes[i]];
   }
   Tensor Y(Y_dims, CPU);
-  math::Transpose<float, CPUContext>(
+  math::Transpose<std::int64_t, float, CPUContext>(
       ndim,
       X_dims.data(),
       axes.data(),
index d5e9507..beba904 100644 (file)
@@ -1,6 +1,7 @@
-#include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/transpose_op.h"
 
+#include "caffe2/core/context_gpu.h"
+
 namespace caffe2 {
 
 REGISTER_CUDA_OPERATOR(Transpose, TransposeOp<CUDAContext>);
index dad157b..f84427d 100644 (file)
@@ -30,8 +30,6 @@ class TransposeOp final : public Operator<Context> {
     }
   }
 
-  ~TransposeOp() = default;
-
   bool RunOnDevice() override {
     // Do the actual transpose, which is implemented in DoRunWithType().
     return DispatchHelper<TensorTypes<float, double, int, int64_t>>::call(
@@ -50,13 +48,13 @@ class TransposeOp final : public Operator<Context> {
     } else {
       CAFFE_ENFORCE_EQ(ndim, axes_.size());
     }
-    const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
-    std::vector<int64_t> Y_dims(ndim);
+    const std::vector<std::int64_t> X_dims = X.sizes().vec();
+    std::vector<std::int64_t> Y_dims(ndim);
     for (int i = 0; i < ndim; ++i) {
       Y_dims[i] = X_dims[axes_[i]];
     }
     auto* Y = Output(0, Y_dims, at::dtype<T>());
-    math::Transpose<T, Context>(
+    math::Transpose<std::int64_t, T, Context>(
         X_dims.size(),
         X_dims.data(),
         axes_.data(),
index d92d9ed..a28aecc 100644 (file)
@@ -1,6 +1,7 @@
 #include "caffe2/operators/transpose_op.h"
 
 #include <algorithm>
+#include <limits>
 #include <vector>
 
 #include "caffe2/core/context_gpu.h"
 
 namespace caffe2 {
 
-#define MAX_DIMS 8
+namespace {
 
 class CuDNNTransposeOp final : public Operator<CUDAContext> {
  public:
   USE_OPERATOR_FUNCTIONS(CUDAContext);
-  USE_DISPATCH_HELPER;
 
   template <class... Args>
   explicit CuDNNTransposeOp(Args&&... args)
       : Operator<CUDAContext>(std::forward<Args>(args)...),
         cudnn_wrapper_(&context_),
         axes_(OperatorBase::GetRepeatedArgument<int>("axes")) {
-    // We will check the legality of axes_: it should be from 0 to axes_.size().
+    // Checks the legality of axes_: it should be from 0 to axes_.size().
     std::vector<int> axes_sorted(axes_);
     std::sort(axes_sorted.begin(), axes_sorted.end());
-    for (int i = 0; i < axes_sorted.size(); ++i) {
+    for (std::size_t i = 0; i < axes_sorted.size(); ++i) {
       if (axes_sorted[i] != i) {
         CAFFE_THROW("Axes should be a permutation of 0 to ndim.");
       }
     }
 
-    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&xDesc_));
-    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&yDesc_));
+    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_));
+    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_));
   }
 
   ~CuDNNTransposeOp() override {
-    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(xDesc_));
-    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(yDesc_));
+    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_));
+    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_));
   }
 
   bool RunOnDevice() override {
+    return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
+  }
+
+  template <typename T>
+  bool DoRunWithType() {
     const auto& X = Input(0);
-    auto* Y = Output(0);
     const int ndim = X.dim();
-    X_dims_.assign(X.sizes().cbegin(), X.sizes().cend());
     if (axes_.empty()) {
       axes_.resize(ndim);
       std::iota(axes_.rbegin(), axes_.rend(), 0);
     } else {
-      CAFFE_ENFORCE_EQ(X.dim(), axes_.size());
+      CAFFE_ENFORCE_EQ(axes_.size(), ndim);
     }
-    std::vector<int> Y_dims(ndim);
+    std::vector<std::int64_t> X_dims = X.sizes().vec();
+    std::vector<std::int64_t> Y_dims(ndim);
     for (int i = 0; i < ndim; ++i) {
-      Y_dims[i] = X_dims_[axes_[i]];
+      Y_dims[i] = X_dims[axes_[i]];
     }
-    Y->Resize(Y_dims);
-    // Do the actual transpose, which is implemented in DoRunWithType().
-#if CUDNN_VERSION_MIN(6, 0, 0)
-    return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
-#else
-    // CUDNN 5.1 does not have int support yet.
-    return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
-#endif
-  }
-
- protected:
-  template <typename T>
-  bool DoRunWithType() {
-    const auto& input = Input(0);
-    auto* output = Output(0);
-    int ndim = input.dim();
-
-    if (ndim == 0) {
+    auto* Y = Output(0, Y_dims, at::dtype<T>());
+    const T* X_data = X.template data<T>();
+    T* Y_data = Y->template mutable_data<T>();
+    if (X.numel() == 0) {
       return true;
     }
-    if (ndim == 1) {
-      output->CopyFrom(input);
+    if (ndim < 3 || ndim > CUDNN_DIM_MAX ||
+        X.numel() > std::numeric_limits<std::int32_t>::max()) {
+      math::Transpose<std::int64_t, T, CUDAContext>(
+          ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
       return true;
     }
-
-    cudnnDataType_t typedesc = cudnnTypeWrapper<T>::type;
-#if CUDNN_VERSION_MIN(6, 0, 0)
-    if (typedesc == CUDNN_DATA_INT32) {
-      // CUDNN Transpose only support float for now
-      math::Transpose<int, CUDAContext>(
-          X_dims_.size(),
-          X_dims_.data(),
-          axes_.data(),
-          input.template data<int>(),
-          output->template mutable_data<int>(),
-          &context_);
-      return true;
-    }
-#endif
-
-    CAFFE_ENFORCE(ndim < MAX_DIMS, "Input ndim exceeds compile time max.");
-
-    stride_y[ndim - 1] = 1;
-    for (int i = ndim - 2; i >= 0; i--) {
-      stride_y[i] = stride_y[i + 1] * output->dim32(i + 1);
-    }
-
-    CHECK(axes_.size() >= ndim);
-
-    stride_x[ndim] = 1;
-    for (int i = 0; i < ndim; i++) {
-      stride_x[i] = 1;
-      for (int j = axes_[i] + 1; j < ndim; j++) {
-        stride_x[i] *= input.dim32(j);
-      }
-      dim_y_int[i] = output->dim32(i);
-    }
-
-    // CuDNN requires at least 3-dim tensors
-    for (int i = ndim; i < MAX_DIMS; i++) {
-      stride_x[i] = 1;
-      stride_y[i] = 1;
-      dim_y_int[i] = 1;
+    if (X_dims != cached_X_dims_) {
+      SetTensorDescriptor(cudnnTypeWrapper<T>::type, X_dims, Y_dims);
+      cached_X_dims_ = X_dims;
     }
-
-    CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
-        xDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_x));
-
-    CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
-        yDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_y));
-
     CUDNN_ENFORCE(cudnnTransformTensor(
         cudnn_wrapper_.inline_cudnn_handle(),
         cudnnTypeWrapper<T>::kOne(),
-        xDesc_,
-        static_cast<const void*>(input.template data<T>()),
+        X_desc_,
+        X_data,
         cudnnTypeWrapper<T>::kZero(),
-        yDesc_,
-        static_cast<void*>(output->template mutable_data<T>())));
+        Y_desc_,
+        Y_data));
     return true;
   }
 
-  int stride_x[MAX_DIMS];
-  int stride_y[MAX_DIMS];
-  int dim_y_int[MAX_DIMS];
+ private:
+  void SetTensorDescriptor(
+      const cudnnDataType_t data_type,
+      const std::vector<std::int64_t>& X_dims,
+      const std::vector<std::int64_t>& Y_dims) {
+    const int ndim = X_dims.size();
+    std::vector<int> dims(Y_dims.cbegin(), Y_dims.cend());
+    std::vector<int> X_strides(ndim);
+    std::vector<int> X_buff(ndim);
+    std::vector<int> Y_strides(ndim);
+    X_buff.back() = 1;
+    Y_strides.back() = 1;
+    for (int i = ndim - 1; i > 0; --i) {
+      X_buff[i - 1] = X_buff[i] * X_dims[i];
+      Y_strides[i - 1] = Y_strides[i] * Y_dims[i];
+    }
+    for (int i = 0; i < ndim; ++i) {
+      X_strides[i] = X_buff[axes_[i]];
+    }
+    CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
+        X_desc_, data_type, ndim, dims.data(), X_strides.data()));
+    CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
+        Y_desc_, data_type, ndim, dims.data(), Y_strides.data()));
+  }
 
-  cudnnTensorDescriptor_t xDesc_;
-  cudnnTensorDescriptor_t yDesc_;
   CuDNNWrapper cudnn_wrapper_;
+  cudnnTensorDescriptor_t X_desc_;
+  cudnnTensorDescriptor_t Y_desc_;
 
-  std::vector<int> axes_;
-  std::vector<int> X_dims_;
+  std::vector<std::int64_t> cached_X_dims_;
+  std::vector<std::int32_t> axes_;
 };
 
+#if !CUDNN_VERSION_MIN(6, 0, 0)
+
+// CuDNN 5.1 does not have int support yet.
+template <>
+bool CuDNNTransposeOp::DoRunWithType<int>() {
+  const auto& X = Input(0);
+  const int ndim = X.dim();
+  if (axes_.empty()) {
+    axes_.resize(ndim);
+    std::iota(axes_.rbegin(), axes_.rend(), 0);
+  } else {
+    CAFFE_ENFORCE_EQ(axes_.size(), ndim);
+  }
+  std::vector<std::int64_t> X_dims = X.sizes().vec();
+  std::vector<std::int64_t> Y_dims(ndim);
+  for (int i = 0; i < ndim; ++i) {
+    Y_dims[i] = X_dims[axes_[i]];
+  }
+  auto* Y = Output(0, Y_dims, at::dtype<int>());
+  const int* X_data = X.template data<int>();
+  int* Y_data = Y->template mutable_data<int>();
+  math::Transpose<std::int64_t, int, CUDAContext>(
+      ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
+  return true;
+}
+
+#endif // !CUDNN_VERSION_MIN(6, 0, 0)
+
+} // namespace
+
 REGISTER_CUDNN_OPERATOR(Transpose, CuDNNTransposeOp);
 
 } // namespace caffe2
index 7ed4a50..fd9c07a 100644 (file)
@@ -4,6 +4,7 @@ list(APPEND Caffe2_CPU_SRCS
   utils/math/broadcast.cc
   utils/math/elementwise.cc
   utils/math/reduce.cc
+  utils/math/transpose.cc
   utils/math/utils.cc
   utils/math_cpu.cc
   utils/murmur_hash3.cc
@@ -29,6 +30,7 @@ set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}
         utils/math/broadcast.cu
         utils/math/elementwise.cu
         utils/math/reduce.cu
+        utils/math/transpose.cu
         utils/math_gpu.cu
         )
 
@@ -36,6 +38,7 @@ set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS}
         utils/math/hip/broadcast.hip
         utils/math/hip/elementwise.hip
         utils/math/hip/reduce.hip
+        utils/math/hip/transpose.hip
         utils/hip/math_gpu.hip
         )
 
index a3dfdc1..0ce41f1 100644 (file)
@@ -18,6 +18,7 @@ extern "C" {
 #include "caffe2/utils/math/broadcast.h"
 #include "caffe2/utils/math/elementwise.h"
 #include "caffe2/utils/math/reduce.h"
+#include "caffe2/utils/math/transpose.h"
 #include "caffe2/utils/math/utils.h"
 
 namespace caffe2 {
@@ -164,16 +165,6 @@ template <typename T, class Context>
 CAFFE2_API void
 Maximum(const int N, const float alpha, const T* x, T* y, Context* context);
 
-// Transpose tensor X with dims by axes and write the result to tensor Y.
-template <typename T, class Context>
-CAFFE2_API void Transpose(
-    const int ndim,
-    const int* dims,
-    const int* axes,
-    const T* X,
-    T* Y,
-    Context* context);
-
 // Decaf gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
 template <typename T, class Context, class Engine = DefaultEngine>
@@ -497,23 +488,6 @@ CAFFE2_API void CopyMatrix(
 template <typename T, class Context>
 CAFFE2_API void CopyVector(const int N, const T* A, T* B, Context* context);
 
-template <typename T, class Context>
-CAFFE2_API void NCHW2NHWC(
-    const int N,
-    const int C,
-    const int HxW,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void NHWC2NCHW(
-    const int N,
-    const int C,
-    const int HxW,
-    const T* X,
-    T* Y,
-    Context* context);
 
 } // namespace math
 } // namespace caffe2
diff --git a/caffe2/utils/math/transpose.cc b/caffe2/utils/math/transpose.cc
new file mode 100644 (file)
index 0000000..fb99155
--- /dev/null
@@ -0,0 +1,267 @@
+#include "caffe2/utils/math/transpose.h"
+
+#include <algorithm>
+#include <functional>
+#include <limits>
+#include <numeric>
+
+#ifdef CAFFE2_USE_MKL
+#include <mkl.h>
+#endif // CAFFE2_USE_MKL
+
+#ifdef CAFFE2_USE_HPTT
+#include <hptt.h>
+#endif // CAFFE2_USE_HPTT
+
+#include "caffe2/core/context.h"
+#include "caffe2/utils/eigen_utils.h"
+#include "caffe2/utils/math/utils.h"
+
+namespace caffe2 {
+namespace math {
+
+namespace {
+
+template <typename TIndex, typename TData>
+void Transpose2D(
+    const TIndex rows,
+    const TIndex cols,
+    const TData* X,
+    TData* Y) {
+  EigenMatrixMap<TData>(Y, rows, cols) =
+      ConstEigenMatrixMap<TData>(X, cols, rows).transpose();
+}
+
+#ifdef CAFFE2_USE_MKL
+
+#define DELEGATE_TRANSPOSE_2D(TIndex, TData, MKLFunc)                   \
+  template <>                                                           \
+  void Transpose2D<TIndex, TData>(                                      \
+      const TIndex rows, const TIndex cols, const TData* X, TData* Y) { \
+    MKLFunc('R', 'T', rows, cols, TData(1), X, cols, Y, rows);          \
+  }
+DELEGATE_TRANSPOSE_2D(std::int32_t, float, mkl_somatcopy);
+DELEGATE_TRANSPOSE_2D(std::int64_t, float, mkl_somatcopy);
+DELEGATE_TRANSPOSE_2D(std::int32_t, double, mkl_domatcopy);
+DELEGATE_TRANSPOSE_2D(std::int64_t, double, mkl_domatcopy);
+#undef DELEGATE_TRANSPOSE_2D
+
+#endif // CAFFE2_USE_MKL
+
+#ifdef CAFFE2_USE_HPTT
+
+template <typename TIndex, typename TData>
+bool TransposeByHPTT(
+    const int ndim,
+    const TIndex* dims,
+    const int* axes,
+    const TData* X,
+    TData* Y) {
+  for (int i = 0; i < ndim; ++i) {
+    if (dims[i] <= 0 || dims[i] > std::numeric_limits<int>::max()) {
+      return false;
+    }
+  }
+
+  std::vector<int> axes_cm(ndim);
+  std::vector<int> dims_cm(ndim);
+  // Convert row-major index to column-major.
+  const auto cm_fn = [ndim](const int i) { return ndim - i - 1; };
+  for (int i = 0; i < ndim; ++i) {
+    axes_cm[i] = cm_fn(axes[cm_fn(i)]);
+    dims_cm[i] = dims[cm_fn(i)];
+  }
+  auto plan = hptt::create_plan(
+      axes_cm.data(),
+      ndim,
+      TData(1),
+      X,
+      dims_cm.data(),
+      nullptr,
+      TData(0),
+      Y,
+      nullptr,
+      hptt::ESTIMATE,
+      1 /* num_threads */);
+  if (plan == nullptr) {
+    return false;
+  }
+  plan->execute();
+  return true;
+}
+
+#endif // CAFFE2_USE_HPTT
+
+template <typename TIndex, typename TData>
+void TransposeND(
+    const int ndim,
+    const TIndex* dims,
+    const int* axes,
+    const TData* X,
+    TData* Y) {
+  std::vector<TIndex> Y_dims(ndim);
+  for (int i = 0; i < ndim; ++i) {
+    Y_dims[i] = dims[axes[i]];
+  }
+  // Measure amount of contiguous data we can copy at once
+  int pivot = ndim - 1;
+  TIndex block_size = 1;
+  for (; pivot >= 0 && axes[pivot] == pivot; --pivot) {
+    block_size *= Y_dims[pivot];
+  }
+  ++pivot;
+  const TIndex num_blocks = std::accumulate(
+      Y_dims.cbegin(),
+      Y_dims.cbegin() + pivot,
+      TIndex(1),
+      std::multiplies<TIndex>());
+  std::vector<TIndex> X_strides(pivot);
+  utils::ComputeTransposedStrides<TIndex>(pivot, dims, axes, X_strides.data());
+  std::vector<TIndex> index(pivot, 0);
+  for (TIndex Y_index = 0; Y_index < num_blocks; ++Y_index) {
+    const TIndex X_index = std::inner_product(
+        X_strides.cbegin(), X_strides.cend(), index.cbegin(), TIndex(0));
+    if (block_size == 1) {
+      Y[Y_index] = X[X_index];
+    } else {
+      std::memcpy(
+          Y + block_size * Y_index,
+          X + block_size * X_index,
+          block_size * sizeof(TData));
+    }
+    utils::IncreaseIndexInDims<TIndex>(pivot, Y_dims.data(), index.data());
+  }
+}
+
+template <typename TIndex, typename TData>
+void TransposeImpl(
+    const int ndim,
+    const TIndex* dims,
+    const int* axes,
+    const TData* X,
+    TData* Y) {
+  const TIndex size =
+      std::accumulate(dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());
+  if (size == 0) {
+    return;
+  }
+  if (utils::IsIdentityPermutation(ndim, axes)) {
+    std::memcpy(Y, X, size * sizeof(TData));
+    return;
+  }
+  if (utils::IsBatchTranspose2D(ndim, axes)) {
+    const TIndex H = dims[ndim - 2];
+    const TIndex W = dims[ndim - 1];
+    const TIndex N = size / (H * W);
+    for (TIndex i = 0; i < N; ++i) {
+      Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W);
+    }
+    return;
+  }
+  TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);
+}
+
+#ifdef CAFFE2_USE_HPTT
+
+#define CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(TIndex, TData)                \
+  template <>                                                           \
+  void TransposeImpl<TIndex, TData>(                                    \
+      const int ndim,                                                   \
+      const TIndex* dims,                                               \
+      const int* axes,                                                  \
+      const TData* X,                                                   \
+      TData* Y) {                                                       \
+    const TIndex size = std::accumulate(                                \
+        dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());       \
+    if (size == 0) {                                                    \
+      return;                                                           \
+    }                                                                   \
+    if (utils::IsIdentityPermutation(ndim, axes)) {                     \
+      std::memcpy(Y, X, size * sizeof(TData));                          \
+      return;                                                           \
+    }                                                                   \
+    if (TransposeByHPTT(ndim, dims, axes, X, Y)) {                      \
+      return;                                                           \
+    }                                                                   \
+    if (utils::IsBatchTranspose2D(ndim, axes)) {                        \
+      const TIndex H = dims[ndim - 2];                                  \
+      const TIndex W = dims[ndim - 1];                                  \
+      const TIndex N = size / (H * W);                                  \
+      for (TIndex i = 0; i < N; ++i) {                                  \
+        Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W); \
+      }                                                                 \
+      return;                                                           \
+    }                                                                   \
+    TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);                 \
+  }
+CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, float)
+CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, float)
+CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, double)
+CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, double)
+#undef CAFFE2_SPECIALIZED_TRANSPOSE_IMPL
+
+#endif // CAFFE2_USE_HPTT
+
+} // namespace
+
+#define CAFFE2_SPECIALIZED_TRANSPOSE(TIndex, TData)       \
+  template <>                                             \
+  C10_EXPORT void Transpose<TIndex, TData, CPUContext>(   \
+      const int ndim,                                     \
+      const TIndex* dims,                                 \
+      const int* axes,                                    \
+      const TData* X,                                     \
+      TData* Y,                                           \
+      CPUContext* /* context */) {                        \
+    TransposeImpl<TIndex, TData>(ndim, dims, axes, X, Y); \
+  }
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, float)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, float)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, double)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, double)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int32_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int32_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int64_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int64_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint8_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint8_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint16_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint16_t)
+#undef CAFFE2_SPECIALIZED_TRANSPOSE
+
+#define CAFFE2_SPECIALIZED_NCHW2NHWC(T)                       \
+  template <>                                                 \
+  C10_EXPORT void NCHW2NHWC<T, CPUContext>(                   \
+      const int N,                                            \
+      const int C,                                            \
+      const int HxW,                                          \
+      const T* X,                                             \
+      T* Y,                                                   \
+      CPUContext* /* context */) {                            \
+    const int stride = C * HxW;                               \
+    for (int i = 0; i < N; ++i) {                             \
+      Transpose2D<std::int32_t, T>(C, HxW, X + i * stride, Y + i * stride); \
+    }                                                         \
+  }
+CAFFE2_SPECIALIZED_NCHW2NHWC(float)
+#undef CAFFE2_SPECIALIZED_NCHW2NHWC
+
+#define CAFFE2_SPECIALIZED_NHWC2NCHW(T)                       \
+  template <>                                                 \
+  C10_EXPORT void NHWC2NCHW<T, CPUContext>(                   \
+      const int N,                                            \
+      const int C,                                            \
+      const int HxW,                                          \
+      const T* X,                                             \
+      T* Y,                                                   \
+      CPUContext* /* context */) {                            \
+    const int stride = HxW * C;                               \
+    for (int i = 0; i < N; ++i) {                             \
+      Transpose2D<std::int32_t, T>(HxW, C, X + i * stride, Y + i * stride); \
+    }                                                         \
+  }
+CAFFE2_SPECIALIZED_NHWC2NCHW(float)
+#undef CAFFE2_SPECIALIZED_NHWC2NCHW
+
+} // namespace math
+} // namespace caffe2
diff --git a/caffe2/utils/math/transpose.cu b/caffe2/utils/math/transpose.cu
new file mode 100644 (file)
index 0000000..48a6fa2
--- /dev/null
@@ -0,0 +1,231 @@
+#include "caffe2/utils/math/transpose.h"
+
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+#include "caffe2/core/common_gpu.h"
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/utils/math/utils.h"
+
+namespace caffe2 {
+namespace math {
+
+namespace {
+
+constexpr int kTileDim = 32;
+constexpr int kBlockRows = 8;
+
+// Splits the original matrix into submatrices with size 32 * 32.
+// Each block transposes one submatrix by loading it into shared memory.
+// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
+template <typename TIndex, typename TData>
+__global__ void BatchTranspose2DCUDAKernel(
+    const TIndex N,
+    const TIndex H,
+    const TIndex W,
+    const TIndex dh,
+    const TIndex dw,
+    const TData* X,
+    TData* Y) {
+  __shared__ TData tile[kTileDim][kTileDim + 1];
+  const TIndex n = blockIdx.x / (dh * dw);
+  const TIndex k = blockIdx.x % (dh * dw);
+  const TIndex r = k / dw;
+  const TIndex c = k % dw;
+  const TIndex offset = n * H * W;
+  int x = c * kTileDim + threadIdx.x;
+  int y = r * kTileDim + threadIdx.y;
+  if (x < W) {
+    for (int i = 0; i < kTileDim && y + i < H; i += kBlockRows) {
+#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
+      tile[threadIdx.y + i][threadIdx.x] = __ldg(X + offset + (y + i) * W + x);
+#else
+      tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];
+#endif
+    }
+  }
+  __syncthreads();
+  x = r * kTileDim + threadIdx.x;
+  y = c * kTileDim + threadIdx.y;
+  if (x < H) {
+    for (int i = 0; i < kTileDim && y + i < W; i += kBlockRows) {
+      Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
+    }
+  }
+}
+
+template <typename TIndex, typename TData>
+void BatchTranspose2DCUDAImpl(
+    const TIndex N,
+    const TIndex H,
+    const TIndex W,
+    const TData* X,
+    TData* Y,
+    CUDAContext* context) {
+  const TIndex dh = DivUp<TIndex>(H, kTileDim);
+  const TIndex dw = DivUp<TIndex>(W, kTileDim);
+  BatchTranspose2DCUDAKernel<TIndex, TData>
+      <<<N * dh * dw, dim3(kTileDim, kBlockRows), 0, context->cuda_stream()>>>(
+          N, H, W, dh, dw, X, Y);
+}
+
+#define DELEGATE_TRANSPOSE_2D_CUDA_IMPL(TIndex, TData, CuBLASFunc) \
+  template <>                                                      \
+  void BatchTranspose2DCUDAImpl<TIndex, TData>(                    \
+      const TIndex N,                                              \
+      const TIndex H,                                              \
+      const TIndex W,                                              \
+      const TData* X,                                              \
+      TData* Y,                                                    \
+      CUDAContext* context) {                                      \
+    if (N == 1) {                                                  \
+      const TData kAlpha = TData(1);                               \
+      const TData kBeta = TData(0);                                \
+      CUBLAS_ENFORCE(cublasSetPointerMode(                         \
+          context->cublas_handle(), CUBLAS_POINTER_MODE_HOST));    \
+      CUBLAS_ENFORCE(CuBLASFunc(                                   \
+          context->cublas_handle(),                                \
+          CUBLAS_OP_T,                                             \
+          CUBLAS_OP_N,                                             \
+          H,                                                       \
+          W,                                                       \
+          &kAlpha,                                                 \
+          X,                                                       \
+          W,                                                       \
+          &kBeta,                                                  \
+          Y,                                                       \
+          H,                                                       \
+          Y,                                                       \
+          H));                                                     \
+    } else {                                                       \
+      const TIndex dh = DivUp<TIndex>(H, kTileDim);                \
+      const TIndex dw = DivUp<TIndex>(W, kTileDim);                \
+      BatchTranspose2DCUDAKernel<TIndex, TData>                    \
+          <<<N * dh * dw,                                          \
+             dim3(kTileDim, kBlockRows),                           \
+             0,                                                    \
+             context->cuda_stream()>>>(N, H, W, dh, dw, X, Y);     \
+    }                                                              \
+  }
+DELEGATE_TRANSPOSE_2D_CUDA_IMPL(std::int32_t, float, cublasSgeam)
+DELEGATE_TRANSPOSE_2D_CUDA_IMPL(std::int64_t, float, cublasSgeam)
+DELEGATE_TRANSPOSE_2D_CUDA_IMPL(std::int32_t, double, cublasDgeam)
+DELEGATE_TRANSPOSE_2D_CUDA_IMPL(std::int64_t, double, cublasDgeam)
+#undef DELEGATE_TRANSPOSE_2D_CUDA_IMPL
+
+template <typename TIndex, typename TData, int D>
+__global__ void TransposeCUDAKernel(
+    const TIndex size,
+    const SimpleArray<TIndex, D> X_strides,
+    const SimpleArray<TIndex, D> Y_dims,
+    const TData* X,
+    TData* Y) {
+  const int Y_index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
+  if (Y_index < size) {
+    TIndex X_index = 0;
+    TIndex v = Y_index;
+#pragma unroll
+    for (int i = D - 1; i >= 0; --i) {
+      X_index += v % Y_dims.data[i] * X_strides.data[i];
+      v /= Y_dims.data[i];
+    }
+#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
+    Y[Y_index] = __ldg(X + X_index);
+#else
+    Y[Y_index] = X[X_index];
+#endif
+  }
+}
+
+template <typename TIndex, typename TData, int D>
+void TransposeCUDAImpl(
+    const TIndex* dims,
+    const int* axes,
+    const TData* X,
+    TData* Y,
+    CUDAContext* context) {
+  SimpleArray<TIndex, D> X_strides;
+  SimpleArray<TIndex, D> Y_dims;
+  utils::ComputeTransposedStrides<TIndex>(D, dims, axes, X_strides.data);
+  TIndex size = 1;
+  for (int i = 0; i < D; ++i) {
+    Y_dims.data[i] = dims[axes[i]];
+    size *= dims[i];
+  }
+  const TIndex M = DivUp<TIndex>(size, CAFFE_CUDA_NUM_THREADS);
+  TransposeCUDAKernel<TIndex, TData, D>
+      <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
+          size, X_strides, Y_dims, X, Y);
+}
+
+} // namespace
+
+#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(TIndex, TData)                    \
+  template <>                                                               \
+  CAFFE2_CUDA_EXPORT void Transpose<TIndex, TData, CUDAContext>(            \
+      const int ndim,                                                       \
+      const TIndex* dims,                                                   \
+      const int* axes,                                                      \
+      const TData* X,                                                       \
+      TData* Y,                                                             \
+      CUDAContext* context) {                                               \
+    const TIndex size = std::accumulate(                                    \
+        dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());           \
+    if (size == 0) {                                                        \
+      return;                                                               \
+    }                                                                       \
+    if (utils::IsIdentityPermutation(ndim, axes)) {                         \
+      context->template CopySameDevice<TData>(size, X, Y);                  \
+      return;                                                               \
+    }                                                                       \
+    if (utils::IsBatchTranspose2D(ndim, axes)) {                            \
+      const int H = dims[ndim - 2];                                         \
+      const int W = dims[ndim - 1];                                         \
+      const int N = size / (H * W);                                         \
+      BatchTranspose2DCUDAImpl<TIndex, TData>(N, H, W, X, Y, context);      \
+      return;                                                               \
+    }                                                                       \
+    DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(                                 \
+        ndim, TransposeCUDAImpl, TIndex, TData, dims, axes, X, Y, context); \
+  }
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int32_t, float)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int64_t, float)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int32_t, double)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int64_t, double)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int32_t, std::int32_t)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int64_t, std::int32_t)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int32_t, std::int64_t)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int64_t, std::int64_t)
+#undef CAFFE2_SPECIALIZED_CUDA_TRANSPOSE
+
+#define CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC(T)                    \
+  template <>                                                   \
+  CAFFE2_CUDA_EXPORT void NCHW2NHWC<T, CUDAContext>(            \
+      const int N,                                              \
+      const int C,                                              \
+      const int HxW,                                            \
+      const T* X,                                               \
+      T* Y,                                                     \
+      CUDAContext* context) {                                   \
+    BatchTranspose2DCUDAImpl<int, T>(N, C, HxW, X, Y, context); \
+  }
+CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC(float) // NCHW->NHWC = batched [C, HxW] transpose; int indexing here
+#undef CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC
+
+#define CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW(T)                    \
+  template <>                                                   \
+  CAFFE2_CUDA_EXPORT void NHWC2NCHW<T, CUDAContext>(            \
+      const int N,                                              \
+      const int C,                                              \
+      const int HxW,                                            \
+      const T* X,                                               \
+      T* Y,                                                     \
+      CUDAContext* context) {                                   \
+    BatchTranspose2DCUDAImpl<int, T>(N, HxW, C, X, Y, context); \
+  }
+CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW(float) // NHWC->NCHW = batched [HxW, C] transpose; int indexing here
+#undef CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW
+
+} // namespace math
+} // namespace caffe2
diff --git a/caffe2/utils/math/transpose.h b/caffe2/utils/math/transpose.h
new file mode 100644 (file)
index 0000000..a01caa2
--- /dev/null
@@ -0,0 +1,31 @@
+#ifndef CAFFE2_UTILS_MATH_TRANSPOSE_H_
+#define CAFFE2_UTILS_MATH_TRANSPOSE_H_
+
+#include "caffe2/core/common.h"
+#include "caffe2/core/types.h"
+
+namespace caffe2 {
+namespace math {
+
+// Transpose tensor X with dims by axes and write the result to tensor Y.
+template <typename TIndex, typename TData, class Context>
+CAFFE2_API void Transpose(
+    int ndim,
+    const TIndex* dims, // TIndex (e.g. int64_t) so sizes > 2G are expressible
+    const int* axes,
+    const TData* X,
+    TData* Y,
+    Context* context);
+
+template <typename T, class Context>
+CAFFE2_API void
+NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y, Context* context); // [N, C, HxW] -> [N, HxW, C]
+
+template <typename T, class Context>
+CAFFE2_API void
+NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y, Context* context); // [N, HxW, C] -> [N, C, HxW]
+
+} // namespace math
+} // namespace caffe2
+
+#endif // CAFFE2_UTILS_MATH_TRANSPOSE_H_
index 3b75ced..fdbb479 100644 (file)
@@ -11,16 +11,22 @@ namespace caffe2 {
 namespace math {
 namespace utils {
 
-void IncreaseIndexInDims(const int n, const int* dims, int* index) {
-  for (int i = n - 1; i >= 0; --i) {
-    ++index[i];
-    if (index[i] >= dims[i]) {
-      index[i] -= dims[i];
-    } else {
-      break;
-    }
+#define CAFFE2_SPECIALIZED_INCREASE_INDEX_IN_DIMS(TIndex)  \
+  template <>                                              \
+  C10_EXPORT void IncreaseIndexInDims<TIndex>(             \
+      const int ndim, const TIndex* dims, TIndex* index) { \
+    for (int i = ndim - 1; i >= 0; --i) {                  \
+      ++index[i];                                          \
+      if (index[i] >= dims[i]) {                           \
+        index[i] -= dims[i];                               \
+      } else {                                             \
+        break;                                             \
+      }                                                    \
+    }                                                      \
   }
-}
+CAFFE2_SPECIALIZED_INCREASE_INDEX_IN_DIMS(std::int32_t)
+CAFFE2_SPECIALIZED_INCREASE_INDEX_IN_DIMS(std::int64_t)
+#undef CAFFE2_SPECIALIZED_INCREASE_INDEX_IN_DIMS
 
 int GetIndexFromDims(const int n, const int* dims, const int* index) {
   int sum = 0;
@@ -326,21 +332,23 @@ void ComputeTransposeAxesForReduceOp(
   }
 }
 
-void ComputeTransposedStrides(
-    const int ndim,
-    const int* dims,
-    const int* axes,
-    int* strides) {
-  std::vector<int> buff(ndim);
-  int cur_stride = 1;
-  for (int i = ndim - 1; i >= 0; --i) {
-    buff[i] = cur_stride;
-    cur_stride *= dims[i];
-  }
-  for (int i = 0; i < ndim; ++i) {
-    strides[i] = buff[axes[i]];
+#define CAFFE2_SPECIALIZED_COMPUTE_TRANSPOSED_STRIDES(TIndex)                 \
+  template <>                                                                 \
+  C10_EXPORT void ComputeTransposedStrides<TIndex>(                           \
+      const int ndim, const TIndex* dims, const int* axes, TIndex* strides) { \
+    std::vector<TIndex> buff(ndim);                                           \
+    TIndex cur_stride = 1;                                                    \
+    for (int i = ndim - 1; i >= 0; --i) {                                     \
+      buff[i] = cur_stride;                                                   \
+      cur_stride *= dims[i];                                                  \
+    }                                                                         \
+    for (int i = 0; i < ndim; ++i) {                                          \
+      strides[i] = buff[axes[i]];                                             \
+    }                                                                         \
   }
-}
+CAFFE2_SPECIALIZED_COMPUTE_TRANSPOSED_STRIDES(std::int32_t)
+CAFFE2_SPECIALIZED_COMPUTE_TRANSPOSED_STRIDES(std::int64_t)
+#undef CAFFE2_SPECIALIZED_COMPUTE_TRANSPOSED_STRIDES
 
 } // namespace utils
 } // namespace math
index b704adb..af239cd 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef CAFFE2_UTILS_MATH_UTILS_H_
 #define CAFFE2_UTILS_MATH_UTILS_H_
 
+#include <vector>
+
 #include "caffe2/core/common.h"
 
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \
@@ -58,7 +60,9 @@ MATH_UTILS_DECL bool IsAGeZeroAndALtB(const int a, const int b) {
 }
 
 // Increase the index digits by one based on dims.
-CAFFE2_API void IncreaseIndexInDims(const int n, const int* dims, int* index);
+template <typename TIndex>
+CAFFE2_API void
+IncreaseIndexInDims(int ndim, const TIndex* dims, TIndex* index);
 
 // Get index value from dims and index digits.
 CAFFE2_API int GetIndexFromDims(const int n, const int* dims, const int* index);
@@ -137,11 +141,12 @@ CAFFE2_API void ComputeTransposeAxesForReduceOp(
 CAFFE2_API void
 ComputeTransposeAxesForReduceOp(const int ndim, const int* dims, int* axes);
 
+template <typename TIndex>
 CAFFE2_API void ComputeTransposedStrides(
-    const int ndim,
-    const int* dims,
+    int ndim,
+    const TIndex* dims,
     const int* axes,
-    int* strides);
+    TIndex* strides);
 
 } // namespace utils
 
index 895b561..f512d9e 100644 (file)
@@ -2795,269 +2795,5 @@ C10_EXPORT void BiasCHW<float, CPUContext>(
 CAFFE2_SPECIALIZED_COPYVECTOR(float)
 #undef CAFFE2_SPECIALIZED_COPYVECTOR
 
-namespace {
-
-#ifdef CAFFE2_USE_HPTT
-
-bool TransposeWithHPTT(
-    const int ndim,
-    const int* dims,
-    const int* axes,
-    const float* X,
-    float* Y) {
-  std::vector<int> axes_cm(ndim);
-  std::vector<int> dims_cm(ndim);
-  // Convert row-major index to column-major.
-  const auto cm_fn = [ndim](const int i) { return ndim - i - 1; };
-  for (int i = 0; i < ndim; ++i) {
-    axes_cm[i] = cm_fn(axes[cm_fn(i)]);
-    dims_cm[i] = dims[cm_fn(i)];
-  }
-
-  // HPTT doesn't handle 0 sized inputs.
-  for (auto dim : dims_cm) {
-    if (dim <= 0) {
-      return false;
-    }
-  }
-  auto plan = hptt::create_plan(
-      axes_cm.data(),
-      ndim,
-      1.0,
-      X,
-      dims_cm.data(),
-      nullptr,
-      0.0,
-      Y,
-      nullptr,
-      hptt::ESTIMATE,
-      1);
-  if (plan == nullptr) {
-    return false;
-  }
-  plan->execute();
-  return true;
-}
-
-#endif // CAFFE2_USE_HPTT
-
-template <typename T>
-void Transpose2D(const int rows, const int cols, const T* X, T* Y);
-
-#ifdef CAFFE2_USE_MKL
-
-#define DELEGATE_TRANSPOSE_2D_FUNCTION(T, Func)                           \
-  template <>                                                             \
-  void Transpose2D<T>(const int rows, const int cols, const T* X, T* Y) { \
-    Func('R', 'T', rows, cols, T(1), X, cols, Y, rows);                   \
-  }
-DELEGATE_TRANSPOSE_2D_FUNCTION(float, mkl_somatcopy);
-DELEGATE_TRANSPOSE_2D_FUNCTION(double, mkl_domatcopy);
-#undef DELEGATE_TRANSPOSE_2D_FUNCTION
-
-#endif // CAFFE2_USE_MKL
-
-#define CAFFE2_SPECIALIZED_TRANSPOSE_2D(T)                                \
-  template <>                                                             \
-  void Transpose2D<T>(const int rows, const int cols, const T* X, T* Y) { \
-    EigenMatrixMap<T>(Y, rows, cols) =                                    \
-        ConstEigenMatrixMap<T>(X, cols, rows).transpose();                \
-  }
-
-#ifndef CAFFE2_USE_MKL
-
-template <>
-void Transpose2D<float>(
-    const int rows,
-    const int cols,
-    const float* X,
-    float* Y) {
-#ifdef CAFFE2_USE_HPTT
-  const std::array<int, 2> dims = {rows, cols};
-  const std::array<int, 2> axes = {1, 0};
-  if (TransposeWithHPTT(2, dims.data(), axes.data(), X, Y)) {
-    return;
-  }
-#endif // CAFFE2_USE_HPTT
-  EigenMatrixMap<float>(Y, rows, cols) =
-      ConstEigenMatrixMap<float>(X, cols, rows).transpose();
-}
-
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(double)
-
-#endif // CAFFE2_USE_MKL
-
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(int)
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(int64_t)
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint8_t)
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint16_t)
-
-#undef CAFFE2_SPECIALIZED_TRANSPOSE_2D
-
-std::vector<int>
-ComputeXStrides(const int ndim, const int* dims, const int* axes) {
-  std::vector<int> x_strides(ndim);
-  std::vector<int> buff(ndim);
-  int cur_stride = 1;
-  for (int i = ndim - 1; i >= 0; --i) {
-    buff[i] = cur_stride;
-    cur_stride *= dims[i];
-  }
-  for (int i = 0; i < ndim; ++i) {
-    x_strides[i] = buff[axes[i]];
-  }
-  return x_strides;
-}
-
-template <typename T>
-void TransposeND(
-    const int ndim,
-    const int* dims,
-    const int* axes,
-    const T* X,
-    T* Y) {
-  std::vector<int> Y_dims(ndim);
-  for (int i = 0; i < ndim; ++i) {
-    Y_dims[i] = dims[axes[i]];
-  }
-  // Measure amount of contiguous data we can copy at once
-  int block_size = 1;
-  int num_shared_idx = 0;
-  for (int i = ndim - 1; i >= 0 && axes[i] == i; --i) {
-    block_size *= Y_dims[i];
-    ++num_shared_idx;
-  }
-  const int itr_axes = ndim - num_shared_idx;
-  const int num_blocks = std::accumulate(
-      Y_dims.cbegin(), Y_dims.cbegin() + itr_axes, 1, std::multiplies<int>());
-  const std::vector<int> X_strides = ComputeXStrides(itr_axes, dims, axes);
-  std::vector<int> index(itr_axes, 0);
-  for (int Y_index = 0; Y_index < num_blocks; ++Y_index) {
-    const int X_index = std::inner_product(
-        X_strides.cbegin(), X_strides.cend(), index.cbegin(), 0);
-    if (block_size == 1) {
-      Y[Y_index] = X[X_index];
-    } else {
-      std::memcpy(
-          Y + block_size * Y_index,
-          X + block_size * X_index,
-          block_size * sizeof(T));
-    }
-    utils::IncreaseIndexInDims(itr_axes, Y_dims.data(), index.data());
-  }
-}
-
-template <typename T>
-void TransposeCPUImpl(
-    const int ndim,
-    const int* dims,
-    const int* axes,
-    const T* X,
-    T* Y) {
-  if (utils::IsIdentityPermutation(ndim, axes)) {
-    const int size =
-        std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());
-    std::memcpy(Y, X, size * sizeof(T));
-    return;
-  }
-  if (utils::IsBatchTranspose2D(ndim, axes)) {
-    const int N =
-        std::accumulate(dims, dims + ndim - 2, 1, std::multiplies<int>());
-    const int H = dims[ndim - 2];
-    const int W = dims[ndim - 1];
-    for (int i = 0; i < N; ++i) {
-      Transpose2D<T>(H, W, X + i * H * W, Y + i * H * W);
-    }
-    return;
-  }
-  TransposeND<T>(ndim, dims, axes, X, Y);
-}
-
-template <>
-void TransposeCPUImpl(
-    const int ndim,
-    const int* dims,
-    const int* axes,
-    const float* X,
-    float* Y) {
-  if (utils::IsIdentityPermutation(ndim, axes)) {
-    const int size =
-        std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());
-    std::memcpy(Y, X, size * sizeof(float));
-    return;
-  }
-  if (utils::IsBatchTranspose2D(ndim, axes)) {
-    const int N =
-        std::accumulate(dims, dims + ndim - 2, 1, std::multiplies<int>());
-    const int H = dims[ndim - 2];
-    const int W = dims[ndim - 1];
-    for (int i = 0; i < N; ++i) {
-      Transpose2D<float>(H, W, X + i * H * W, Y + i * H * W);
-    }
-    return;
-  }
-#ifdef CAFFE2_USE_HPTT
-  if (TransposeWithHPTT(ndim, dims, axes, X, Y)) {
-    return;
-  }
-#endif
-  TransposeND<float>(ndim, dims, axes, X, Y);
-}
-
-} // namespace
-
-#define CAFFE2_SPECIALIZED_TRANSPOSE(T)       \
-  template <>                                 \
-  C10_EXPORT void Transpose<T, CPUContext>(   \
-      const int ndim,                         \
-      const int* dims,                        \
-      const int* axes,                        \
-      const T* X,                             \
-      T* Y,                                   \
-      CPUContext* /* context */) {            \
-    TransposeCPUImpl(ndim, dims, axes, X, Y); \
-  }
-CAFFE2_SPECIALIZED_TRANSPOSE(float)
-CAFFE2_SPECIALIZED_TRANSPOSE(double)
-CAFFE2_SPECIALIZED_TRANSPOSE(int)
-CAFFE2_SPECIALIZED_TRANSPOSE(int64_t)
-CAFFE2_SPECIALIZED_TRANSPOSE(std::uint8_t)
-CAFFE2_SPECIALIZED_TRANSPOSE(std::uint16_t)
-#undef CAFFE2_SPECIALIZED_TRANSPOSE
-
-#define CAFFE2_SPECIALIZED_NCHW2NHWC(T)                       \
-  template <>                                                 \
-  C10_EXPORT void NCHW2NHWC<T, CPUContext>(                   \
-      const int N,                                            \
-      const int C,                                            \
-      const int HxW,                                          \
-      const T* X,                                             \
-      T* Y,                                                   \
-      CPUContext* /* context */) {                            \
-    const int stride = C * HxW;                               \
-    for (int i = 0; i < N; ++i) {                             \
-      Transpose2D<T>(C, HxW, X + i * stride, Y + i * stride); \
-    }                                                         \
-  }
-CAFFE2_SPECIALIZED_NCHW2NHWC(float)
-#undef CAFFE2_SPECIALIZED_NCHW2NHWC
-
-#define CAFFE2_SPECIALIZED_NHWC2NCHW(T)                       \
-  template <>                                                 \
-  C10_EXPORT void NHWC2NCHW<T, CPUContext>(                   \
-      const int N,                                            \
-      const int C,                                            \
-      const int HxW,                                          \
-      const T* X,                                             \
-      T* Y,                                                   \
-      CPUContext* /* context */) {                            \
-    const int stride = HxW * C;                               \
-    for (int i = 0; i < N; ++i) {                             \
-      Transpose2D<T>(HxW, C, X + i * stride, Y + i * stride); \
-    }                                                         \
-  }
-CAFFE2_SPECIALIZED_NHWC2NCHW(float)
-#undef CAFFE2_SPECIALIZED_NHWC2NCHW
-
 } // namespace math
 } // namespace caffe2
index 819edc6..71f05ab 100644 (file)
@@ -3022,186 +3022,5 @@ DELEGATE_INV_STD_KERNEL_FUNCTION(float, rsqrtf)
 CAFFE2_SPECIALIZED_CUDA_INV_STD(float)
 #undef CAFFE2_SPECIALIZED_CUDA_INV_STD
 
-namespace {
-
-constexpr int kTileDim = 32;
-constexpr int kBlockRows = 8;
-
-// Splits the original matrix into submatrices with size 32 * 32.
-// Each block transposes one submatrix by loading it into shared memory.
-// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
-template <typename T>
-__global__ void BatchTranspose2DCUDAKernel(
-    const int N,
-    const int H,
-    const int W,
-    const T* X,
-    T* Y) {
-  __shared__ T tile[kTileDim][kTileDim + 1];
-  const int h = (H + kTileDim - 1) / kTileDim;
-  const int w = (W + kTileDim - 1) / kTileDim;
-  const int outer_size = N * h * w;
-  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
-    const int n = i / (h * w);
-    const int k = i % (h * w);
-    const int r = k / w;
-    const int c = k % w;
-    const int offset = n * H * W;
-    int x = c * kTileDim + threadIdx.x;
-    int y = r * kTileDim + threadIdx.y;
-    if (x < W) {
-      for (int j = 0; j < kTileDim && y + j < H; j += kBlockRows) {
-#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
-        tile[threadIdx.y + j][threadIdx.x] =
-            __ldg(X + offset + (y + j) * W + x);
-#else
-        tile[threadIdx.y + j][threadIdx.x] = X[offset + (y + j) * W + x];
-#endif
-      }
-    }
-    __syncthreads();
-    x = r * kTileDim + threadIdx.x;
-    y = c * kTileDim + threadIdx.y;
-    if (x < H) {
-      for (int j = 0; j < kTileDim && y + j < W; j += kBlockRows) {
-        Y[offset + (y + j) * H + x] = tile[threadIdx.x][threadIdx.y + j];
-      }
-    }
-    __syncthreads();
-  }
-}
-
-template <typename T, int D>
-__global__ void TransposeCUDAKernel(
-    const int size,
-    const SimpleArray<int, D> X_strides,
-    const SimpleArray<FIXED_DIVISOR, D> Y_dims,
-    const T* X,
-    T* Y) {
-  CUDA_1D_KERNEL_LOOP(Y_index, size) {
-    int X_index = 0;
-    int Y_index_val = Y_index;
-#pragma unroll
-    for (int i = D - 1; i >= 0; --i) {
-      int d;
-      FIXED_DIVISOR_DIV_MOD(Y_dims.data[i], Y_index_val, &Y_index_val, &d);
-      X_index += d * X_strides.data[i];
-    }
-#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
-    Y[Y_index] = __ldg(X + X_index);
-#else
-    Y[Y_index] = X[X_index];
-#endif
-  }
-}
-
-template <typename T, int D>
-CAFFE2_CUDA_EXPORT void TransposeCUDAImpl(
-    const int* dims,
-    const int* axes,
-    const T* X,
-    T* Y,
-    CUDAContext* context) {
-  SimpleArray<int, D> X_strides;
-  SimpleArray<FIXED_DIVISOR, D> Y_dims;
-  utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
-  int size = 1;
-  for (int i = 0; i < D; ++i) {
-    Y_dims.data[i] = FIXED_DIVISOR(dims[axes[i]]);
-    size *= dims[i];
-  }
-  TransposeCUDAKernel<T, D>
-      <<<CAFFE_GET_BLOCKS(size),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context->cuda_stream()>>>(size, X_strides, Y_dims, X, Y);
-}
-
-} // namespace
-
-#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T)                                 \
-  template <>                                                                \
-  CAFFE2_CUDA_EXPORT void Transpose<T, CUDAContext>(                         \
-      const int ndim,                                                        \
-      const int* dims,                                                       \
-      const int* axes,                                                       \
-      const T* X,                                                            \
-      T* Y,                                                                  \
-      CUDAContext* context) {                                                \
-    if (utils::IsIdentityPermutation(ndim, axes)) {                          \
-      const int size =                                                       \
-          std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());     \
-      context->template CopySameDevice<T>(size, X, Y);                       \
-      return;                                                                \
-    }                                                                        \
-    if (utils::IsBatchTranspose2D(ndim, axes)) {                             \
-      const int N =                                                          \
-          std::accumulate(dims, dims + ndim - 2, 1, std::multiplies<int>()); \
-      const int H = dims[ndim - 2];                                          \
-      const int W = dims[ndim - 1];                                          \
-      const int h = (H + kTileDim - 1) / kTileDim;                           \
-      const int w = (W + kTileDim - 1) / kTileDim;                           \
-      const int outer_size = N * h * w;                                      \
-      const dim3 dim_block(kTileDim, kBlockRows, 1);                         \
-      BatchTranspose2DCUDAKernel<T>                                          \
-          <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),                 \
-             dim_block,                                                      \
-             0,                                                              \
-             context->cuda_stream()>>>(N, H, W, X, Y);                       \
-      return;                                                                \
-    }                                                                        \
-    DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(                                  \
-        ndim, TransposeCUDAImpl, T, dims, axes, X, Y, context);              \
-  }
-CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(float)
-CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(double)
-CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int)
-CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int64_t)
-#undef CAFFE2_SPECIALIZED_CUDA_TRANSPOSE
-
-#define CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC(T)               \
-  template <>                                              \
-  CAFFE2_CUDA_EXPORT void NCHW2NHWC<T, CUDAContext>(       \
-      const int N,                                         \
-      const int C,                                         \
-      const int HxW,                                       \
-      const T* X,                                          \
-      T* Y,                                                \
-      CUDAContext* context) {                              \
-    const int h = (C + kTileDim - 1) / kTileDim;           \
-    const int w = (HxW + kTileDim - 1) / kTileDim;         \
-    const int outer_size = N * h * w;                      \
-    const dim3 dim_block(kTileDim, kBlockRows, 1);         \
-    BatchTranspose2DCUDAKernel<T>                          \
-        <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), \
-           dim_block,                                      \
-           0,                                              \
-           context->cuda_stream()>>>(N, C, HxW, X, Y);     \
-  }
-CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC(float)
-#undef CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC
-
-#define CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW(T)               \
-  template <>                                              \
-  CAFFE2_CUDA_EXPORT void NHWC2NCHW<T, CUDAContext>(       \
-      const int N,                                         \
-      const int C,                                         \
-      const int HxW,                                       \
-      const T* X,                                          \
-      T* Y,                                                \
-      CUDAContext* context) {                              \
-    const int h = (HxW + kTileDim - 1) / kTileDim;         \
-    const int w = (C + kTileDim - 1) / kTileDim;           \
-    const int outer_size = N * h * w;                      \
-    const dim3 dim_block(kTileDim, kBlockRows, 1);         \
-    BatchTranspose2DCUDAKernel<T>                          \
-        <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), \
-           dim_block,                                      \
-           0,                                              \
-           context->cuda_stream()>>>(N, HxW, C, X, Y);     \
-  }
-CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW(float)
-#undef CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW
-
 } // namespace math
 } // namespace caffe2
index ccaaf89..5e08915 100644 (file)
@@ -426,97 +426,6 @@ TEST_F(BroadcastGPUTest, BroadcastGPUFloatTest) {
       {1.0f, 1.0f, 2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f});
 }
 
-class TransposeGPUTest : public testing::Test {
- protected:
-  void SetUp() override {
-    if (!HasCudaGPU()) {
-      return;
-    }
-    option_.set_device_type(PROTO_CUDA);
-    cuda_context_ = make_unique<CUDAContext>(option_);
-    Blob* blob_x = ws_.CreateBlob("X");
-    Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = BlobGetMutableTensor(blob_x, CUDA);
-    Y_ = BlobGetMutableTensor(blob_y, CUDA);
-  }
-
-  void SetUpData(
-      const std::vector<int>& X_dims,
-      const std::vector<int>& axes,
-      const std::vector<float>& X_data) {
-    const int ndim = X_dims.size();
-    std::vector<int> Y_dims(ndim);
-    for (int i = 0; i < ndim; ++i) {
-      Y_dims[i] = X_dims[axes[i]];
-    }
-    X_->Resize(X_dims);
-    Y_->Resize(Y_dims);
-    ASSERT_EQ(X_data.size(), X_->numel());
-    cuda_context_->CopyFromCPU<float>(
-        X_data.size(), X_data.data(), X_->mutable_data<float>());
-  }
-
-  void VerifyResult(const std::vector<float>& expected_output) {
-    Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_);
-    ASSERT_EQ(expected_output.size(), Y_host->numel());
-    for (std::size_t i = 0; i < expected_output.size(); ++i) {
-      EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
-    }
-  }
-
-  void RunTransposeTest(
-      const std::vector<int>& X_dims,
-      const std::vector<int>& axes,
-      const std::vector<float>& X_data,
-      const std::vector<float>& Y_data) {
-    SetUpData(X_dims, axes, X_data);
-    math::Transpose<float, CUDAContext>(
-        X_dims.size(),
-        X_dims.data(),
-        axes.data(),
-        X_->data<float>(),
-        Y_->mutable_data<float>(),
-        cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
-    VerifyResult(Y_data);
-  }
-
-  Workspace ws_;
-  DeviceOption option_;
-  std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor* X_ = nullptr;
-  Tensor* Y_ = nullptr;
-};
-
-TEST_F(TransposeGPUTest, TransposeGPUFloatTest) {
-  if (!HasCudaGPU()) {
-    return;
-  }
-  // Test for 1D transpose.
-  RunTransposeTest({3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f, 2.0f, 3.0f});
-
-  // Test for 2D transpose.
-  RunTransposeTest(
-      {2, 3},
-      {1, 0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
-      {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f});
-
-  // Test for 3D transpose.
-  RunTransposeTest(
-      {2, 2, 2},
-      {1, 2, 0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
-      {1.0f, 5.0f, 2.0f, 6.0f, 3.0f, 7.0f, 4.0f, 8.0f});
-  RunTransposeTest(
-      {2, 2, 2},
-      {1, 0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
-      {1.0f, 2.0f, 5.0f, 6.0f, 3.0f, 4.0f, 7.0f, 8.0f});
-}
-
 } // namespace
 
 } // namespace caffe2
index 0e30fda..205a6f2 100644 (file)
@@ -495,75 +495,6 @@ TEST_F(RandFixedSumTest, UpperBound) {
       20, 1, 1000, 1000, l.data(), cpu_context_.get());
 }
 
-class TransposeTest : public testing::Test {
- protected:
-  void SetUp() override {
-    cpu_context_ = make_unique<CPUContext>(option_);
-  }
-
-  void RunTransposeTest(
-      const std::vector<int>& X_dims,
-      const std::vector<int>& axes,
-      const std::vector<float>& X_data,
-      const std::vector<float>& Y_data) {
-    const int ndim = X_dims.size();
-    std::vector<int> Y_dims(ndim);
-    for (int i = 0; i < ndim; ++i) {
-      Y_dims[i] = X_dims[axes[i]];
-    }
-    std::vector<int64_t> X_dims_64;
-    std::vector<int64_t> Y_dims_64;
-    std::copy(X_dims.cbegin(), X_dims.cend(), std::back_inserter(X_dims_64));
-    std::copy(Y_dims.cbegin(), Y_dims.cend(), std::back_inserter(Y_dims_64));
-    ReinitializeTensor(&X_, X_dims_64, at::dtype<float>().device(CPU));
-    ReinitializeTensor(&Y_, Y_dims_64, at::dtype<float>().device(CPU));
-    ASSERT_EQ(X_data.size(), X_.numel());
-    cpu_context_->CopyFromCPU<float>(
-        X_data.size(), X_data.data(), X_.mutable_data<float>());
-    math::Transpose<float, CPUContext>(
-        X_dims.size(),
-        X_dims.data(),
-        axes.data(),
-        X_.data<float>(),
-        Y_.mutable_data<float>(),
-        cpu_context_.get());
-    ASSERT_EQ(Y_data.size(), Y_.numel());
-    for (int i = 0; i < Y_.numel(); ++i) {
-      EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
-    }
-  }
-
-  DeviceOption option_;
-  std::unique_ptr<CPUContext> cpu_context_;
-
-  Tensor X_;
-  Tensor Y_;
-};
-
-TEST_F(TransposeTest, TransposeFloatTest) {
-  // Test for 1D transpose.
-  RunTransposeTest({3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f, 2.0f, 3.0f});
-
-  // Test for 2D transpose.
-  RunTransposeTest(
-      {2, 3},
-      {1, 0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
-      {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f});
-
-  // Test for 3D transpose.
-  RunTransposeTest(
-      {2, 2, 2},
-      {1, 2, 0},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
-      {1.0f, 5.0f, 2.0f, 6.0f, 3.0f, 7.0f, 4.0f, 8.0f});
-  RunTransposeTest(
-      {2, 2, 2},
-      {1, 0, 2},
-      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
-      {1.0f, 2.0f, 5.0f, 6.0f, 3.0f, 4.0f, 7.0f, 8.0f});
-}
-
 } // namespace
 
 } // namespace caffe2