const T* X,
T* Y,
CPUContext* context) {
- const std::array<int, 2> dims = {G, K};
- const std::array<int, 2> axes = {1, 0};
+ const std::array<std::int64_t, 2> dims = {G, K};
+ const std::array<std::int32_t, 2> axes = {1, 0};
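+ // math::Transpose now takes the index type of dims as its first template
+ // argument; axes stay 32-bit ints to match its signature.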
const int M = N * HxW;
const int C = G * K;
for (int i = 0; i < M; ++i) {
- math::Transpose<T, CPUContext>(2, dims.data(), axes.data(), X, Y, context);
+ math::Transpose<std::int64_t, T, CPUContext>(
+ 2, dims.data(), axes.data(), X, Y, context);
X += C;
Y += C;
}
<<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
G, K, X_data, Y_data);
} else {
- const std::array<int, 3> dims = {N * HxW, G, K};
- const std::array<int, 3> axes = {0, 2, 1};
- math::Transpose<float, CUDAContext>(
+ const std::array<std::int64_t, 3> dims = {N * HxW, G, K};
+ const std::array<std::int32_t, 3> axes = {0, 2, 1};
+ math::Transpose<std::int64_t, float, CUDAContext>(
3, dims.data(), axes.data(), X_data, Y_data, &context_);
}
return true;
<<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
K, G, dY_data, dX_data);
} else {
- const std::array<int, 3> dims = {N * HxW, K, G};
- const std::array<int, 3> axes = {0, 2, 1};
- math::Transpose<float, CUDAContext>(
+ const std::array<std::int64_t, 3> dims = {N * HxW, K, G};
+ const std::array<std::int32_t, 3> axes = {0, 2, 1};
+ math::Transpose<std::int64_t, float, CUDAContext>(
3, dims.data(), axes.data(), dY_data, dX_data, &context_);
}
return true;
std::vector<int> axes(ndim);
std::iota(axes.begin(), axes.end(), 0);
std::swap(axes[dim0], axes[dim1]);
- std::vector<int> Y_dims(ndim);
- std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
+ const std::vector<std::int64_t> X_dims = X.sizes().vec();
+ std::vector<std::int64_t> Y_dims(ndim);
for (int i = 0; i < ndim; ++i) {
Y_dims[i] = X_dims[axes[i]];
}
Tensor Y(Y_dims, CPU);
- math::Transpose<float, CPUContext>(
+ math::Transpose<std::int64_t, float, CPUContext>(
ndim,
X_dims.data(),
axes.data(),
-#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/transpose_op.h"
+#include "caffe2/core/context_gpu.h"
+
namespace caffe2 {
REGISTER_CUDA_OPERATOR(Transpose, TransposeOp<CUDAContext>);
}
}
- ~TransposeOp() = default;
-
bool RunOnDevice() override {
// Do the actual transpose, which is implemented in DoRunWithType().
return DispatchHelper<TensorTypes<float, double, int, int64_t>>::call(
} else {
CAFFE_ENFORCE_EQ(ndim, axes_.size());
}
- const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
- std::vector<int64_t> Y_dims(ndim);
+ const std::vector<std::int64_t> X_dims = X.sizes().vec();
+ std::vector<std::int64_t> Y_dims(ndim);
for (int i = 0; i < ndim; ++i) {
Y_dims[i] = X_dims[axes_[i]];
}
auto* Y = Output(0, Y_dims, at::dtype<T>());
- math::Transpose<T, Context>(
+ math::Transpose<std::int64_t, T, Context>(
X_dims.size(),
X_dims.data(),
axes_.data(),
#include "caffe2/operators/transpose_op.h"
#include <algorithm>
+#include <limits>
#include <vector>
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
-#define MAX_DIMS 8
+namespace {
class CuDNNTransposeOp final : public Operator<CUDAContext> {
public:
USE_OPERATOR_FUNCTIONS(CUDAContext);
- USE_DISPATCH_HELPER;
template <class... Args>
explicit CuDNNTransposeOp(Args&&... args)
: Operator<CUDAContext>(std::forward<Args>(args)...),
cudnn_wrapper_(&context_),
axes_(OperatorBase::GetRepeatedArgument<int>("axes")) {
- // We will check the legality of axes_: it should be from 0 to axes_.size().
+ // Checks the legality of axes_: it should be from 0 to axes_.size().
std::vector<int> axes_sorted(axes_);
std::sort(axes_sorted.begin(), axes_sorted.end());
- for (int i = 0; i < axes_sorted.size(); ++i) {
+ for (std::size_t i = 0; i < axes_sorted.size(); ++i) {
if (axes_sorted[i] != i) {
CAFFE_THROW("Axes should be a permutation of 0 to ndim.");
}
}
- CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&xDesc_));
- CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&yDesc_));
+ CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_));
+ CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_));
}
~CuDNNTransposeOp() override {
- CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(xDesc_));
- CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(yDesc_));
+ CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_));
+ CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_));
}
bool RunOnDevice() override {
+ return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
+ }
+
+ template <typename T>
+ bool DoRunWithType() {
const auto& X = Input(0);
- auto* Y = Output(0);
const int ndim = X.dim();
- X_dims_.assign(X.sizes().cbegin(), X.sizes().cend());
if (axes_.empty()) {
axes_.resize(ndim);
std::iota(axes_.rbegin(), axes_.rend(), 0);
} else {
- CAFFE_ENFORCE_EQ(X.dim(), axes_.size());
+ CAFFE_ENFORCE_EQ(axes_.size(), ndim);
}
- std::vector<int> Y_dims(ndim);
+ std::vector<std::int64_t> X_dims = X.sizes().vec();
+ std::vector<std::int64_t> Y_dims(ndim);
for (int i = 0; i < ndim; ++i) {
- Y_dims[i] = X_dims_[axes_[i]];
+ Y_dims[i] = X_dims[axes_[i]];
}
- Y->Resize(Y_dims);
- // Do the actual transpose, which is implemented in DoRunWithType().
-#if CUDNN_VERSION_MIN(6, 0, 0)
- return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
-#else
- // CUDNN 5.1 does not have int support yet.
- return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
-#endif
- }
-
- protected:
- template <typename T>
- bool DoRunWithType() {
- const auto& input = Input(0);
- auto* output = Output(0);
- int ndim = input.dim();
-
- if (ndim == 0) {
+ auto* Y = Output(0, Y_dims, at::dtype<T>());
+ const T* X_data = X.template data<T>();
+ T* Y_data = Y->template mutable_data<T>();
+ if (X.numel() == 0) {
return true;
}
- if (ndim == 1) {
- output->CopyFrom(input);
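+    // Only use the cuDNN path for 3 to CUDNN_DIM_MAX dimensions with
+    // 32-bit-indexable sizes; otherwise fall back to math::Transpose.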
+ if (ndim < 3 || ndim > CUDNN_DIM_MAX ||
+ X.numel() > std::numeric_limits<std::int32_t>::max()) {
+ math::Transpose<std::int64_t, T, CUDAContext>(
+ ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
return true;
}
-
- cudnnDataType_t typedesc = cudnnTypeWrapper<T>::type;
-#if CUDNN_VERSION_MIN(6, 0, 0)
- if (typedesc == CUDNN_DATA_INT32) {
- // CUDNN Transpose only support float for now
- math::Transpose<int, CUDAContext>(
- X_dims_.size(),
- X_dims_.data(),
- axes_.data(),
- input.template data<int>(),
- output->template mutable_data<int>(),
- &context_);
- return true;
- }
-#endif
-
- CAFFE_ENFORCE(ndim < MAX_DIMS, "Input ndim exceeds compile time max.");
-
- stride_y[ndim - 1] = 1;
- for (int i = ndim - 2; i >= 0; i--) {
- stride_y[i] = stride_y[i + 1] * output->dim32(i + 1);
- }
-
- CHECK(axes_.size() >= ndim);
-
- stride_x[ndim] = 1;
- for (int i = 0; i < ndim; i++) {
- stride_x[i] = 1;
- for (int j = axes_[i] + 1; j < ndim; j++) {
- stride_x[i] *= input.dim32(j);
- }
- dim_y_int[i] = output->dim32(i);
- }
-
- // CuDNN requires at least 3-dim tensors
- for (int i = ndim; i < MAX_DIMS; i++) {
- stride_x[i] = 1;
- stride_y[i] = 1;
- dim_y_int[i] = 1;
+ if (X_dims != cached_X_dims_) {
+ SetTensorDescriptor(cudnnTypeWrapper<T>::type, X_dims, Y_dims);
+ cached_X_dims_ = X_dims;
}
-
- CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
- xDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_x));
-
- CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
- yDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_y));
-
CUDNN_ENFORCE(cudnnTransformTensor(
cudnn_wrapper_.inline_cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
- xDesc_,
- static_cast<const void*>(input.template data<T>()),
+ X_desc_,
+ X_data,
cudnnTypeWrapper<T>::kZero(),
- yDesc_,
- static_cast<void*>(output->template mutable_data<T>())));
+ Y_desc_,
+ Y_data));
return true;
}
- int stride_x[MAX_DIMS];
- int stride_y[MAX_DIMS];
- int dim_y_int[MAX_DIMS];
+ private:
+ void SetTensorDescriptor(
+ const cudnnDataType_t data_type,
+ const std::vector<std::int64_t>& X_dims,
+ const std::vector<std::int64_t>& Y_dims) {
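+    // Both descriptors use Y's dimensions; X_desc_ pairs them with X's
+    // strides permuted by axes_, so cudnnTransformTensor writes the
+    // transposed data densely into Y.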
+ const int ndim = X_dims.size();
+ std::vector<int> dims(Y_dims.cbegin(), Y_dims.cend());
+ std::vector<int> X_strides(ndim);
+ std::vector<int> X_buff(ndim);
+ std::vector<int> Y_strides(ndim);
+ X_buff.back() = 1;
+ Y_strides.back() = 1;
+ for (int i = ndim - 1; i > 0; --i) {
+ X_buff[i - 1] = X_buff[i] * X_dims[i];
+ Y_strides[i - 1] = Y_strides[i] * Y_dims[i];
+ }
+ for (int i = 0; i < ndim; ++i) {
+ X_strides[i] = X_buff[axes_[i]];
+ }
+ CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
+ X_desc_, data_type, ndim, dims.data(), X_strides.data()));
+ CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
+ Y_desc_, data_type, ndim, dims.data(), Y_strides.data()));
+ }
- cudnnTensorDescriptor_t xDesc_;
- cudnnTensorDescriptor_t yDesc_;
CuDNNWrapper cudnn_wrapper_;
+ cudnnTensorDescriptor_t X_desc_;
+ cudnnTensorDescriptor_t Y_desc_;
- std::vector<int> axes_;
- std::vector<int> X_dims_;
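+  // Cached input shape; the tensor descriptors are rebuilt only when it
+  // changes.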
+ std::vector<std::int64_t> cached_X_dims_;
+ std::vector<std::int32_t> axes_;
};
+#if !CUDNN_VERSION_MIN(6, 0, 0)
+
+// CuDNN 5.1 does not have int support yet.
+template <>
+bool CuDNNTransposeOp::DoRunWithType<int>() {
+ const auto& X = Input(0);
+ const int ndim = X.dim();
+ if (axes_.empty()) {
+ axes_.resize(ndim);
+ std::iota(axes_.rbegin(), axes_.rend(), 0);
+ } else {
+ CAFFE_ENFORCE_EQ(axes_.size(), ndim);
+ }
+ std::vector<std::int64_t> X_dims = X.sizes().vec();
+ std::vector<std::int64_t> Y_dims(ndim);
+ for (int i = 0; i < ndim; ++i) {
+ Y_dims[i] = X_dims[axes_[i]];
+ }
+  auto* Y = Output(0, Y_dims, at::dtype<int>());
+  const int* X_data = X.data<int>();
+  int* Y_data = Y->mutable_data<int>();
+  math::Transpose<std::int64_t, int, CUDAContext>(
+      ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
+ return true;
+}
+
+#endif // !CUDNN_VERSION_MIN(6, 0, 0)
+
+} // namespace
+
REGISTER_CUDNN_OPERATOR(Transpose, CuDNNTransposeOp);
} // namespace caffe2
utils/math/broadcast.cc
utils/math/elementwise.cc
utils/math/reduce.cc
+ utils/math/transpose.cc
utils/math/utils.cc
utils/math_cpu.cc
utils/murmur_hash3.cc
utils/math/broadcast.cu
utils/math/elementwise.cu
utils/math/reduce.cu
+ utils/math/transpose.cu
utils/math_gpu.cu
)
utils/math/hip/broadcast.hip
utils/math/hip/elementwise.hip
utils/math/hip/reduce.hip
+ utils/math/hip/transpose.hip
utils/hip/math_gpu.hip
)
#include "caffe2/utils/math/broadcast.h"
#include "caffe2/utils/math/elementwise.h"
#include "caffe2/utils/math/reduce.h"
+#include "caffe2/utils/math/transpose.h"
#include "caffe2/utils/math/utils.h"
namespace caffe2 {
CAFFE2_API void
Maximum(const int N, const float alpha, const T* x, T* y, Context* context);
-// Transpose tensor X with dims by axes and write the result to tensor Y.
-template <typename T, class Context>
-CAFFE2_API void Transpose(
- const int ndim,
- const int* dims,
- const int* axes,
- const T* X,
- T* Y,
- Context* context);
-
// Decaf gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
template <typename T, class Context, class Engine = DefaultEngine>
template <typename T, class Context>
CAFFE2_API void CopyVector(const int N, const T* A, T* B, Context* context);
-template <typename T, class Context>
-CAFFE2_API void NCHW2NHWC(
- const int N,
- const int C,
- const int HxW,
- const T* X,
- T* Y,
- Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void NHWC2NCHW(
- const int N,
- const int C,
- const int HxW,
- const T* X,
- T* Y,
- Context* context);
} // namespace math
} // namespace caffe2
--- /dev/null
+#include "caffe2/utils/math/transpose.h"
+
+#include <algorithm>
+#include <cstring>
+#include <functional>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+#ifdef CAFFE2_USE_MKL
+#include <mkl.h>
+#endif // CAFFE2_USE_MKL
+
+#ifdef CAFFE2_USE_HPTT
+#include <hptt.h>
+#endif // CAFFE2_USE_HPTT
+
+#include "caffe2/core/context.h"
+#include "caffe2/utils/eigen_utils.h"
+#include "caffe2/utils/math/utils.h"
+
+namespace caffe2 {
+namespace math {
+
+namespace {
+
+template <typename TIndex, typename TData>
+void Transpose2D(
+ const TIndex rows,
+ const TIndex cols,
+ const TData* X,
+ TData* Y) {
+ EigenMatrixMap<TData>(Y, rows, cols) =
+ ConstEigenMatrixMap<TData>(X, cols, rows).transpose();
+}
+
+#ifdef CAFFE2_USE_MKL
+
+#define DELEGATE_TRANSPOSE_2D(TIndex, TData, MKLFunc) \
+ template <> \
+ void Transpose2D<TIndex, TData>( \
+ const TIndex rows, const TIndex cols, const TData* X, TData* Y) { \
+ MKLFunc('R', 'T', rows, cols, TData(1), X, cols, Y, rows); \
+ }
+DELEGATE_TRANSPOSE_2D(std::int32_t, float, mkl_somatcopy);
+DELEGATE_TRANSPOSE_2D(std::int64_t, float, mkl_somatcopy);
+DELEGATE_TRANSPOSE_2D(std::int32_t, double, mkl_domatcopy);
+DELEGATE_TRANSPOSE_2D(std::int64_t, double, mkl_domatcopy);
+#undef DELEGATE_TRANSPOSE_2D
+
+#endif // CAFFE2_USE_MKL
+
+#ifdef CAFFE2_USE_HPTT
+
+template <typename TIndex, typename TData>
+bool TransposeByHPTT(
+ const int ndim,
+ const TIndex* dims,
+ const int* axes,
+ const TData* X,
+ TData* Y) {
+ for (int i = 0; i < ndim; ++i) {
+ if (dims[i] <= 0 || dims[i] > std::numeric_limits<int>::max()) {
+ return false;
+ }
+ }
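+  // HPTT takes int extents, so bail out on empty or very large dimensions
+  // and let the caller fall back to the generic implementation.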
+
+ std::vector<int> axes_cm(ndim);
+ std::vector<int> dims_cm(ndim);
+ // Convert row-major index to column-major.
+ const auto cm_fn = [ndim](const int i) { return ndim - i - 1; };
+ for (int i = 0; i < ndim; ++i) {
+ axes_cm[i] = cm_fn(axes[cm_fn(i)]);
+ dims_cm[i] = dims[cm_fn(i)];
+ }
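+  // e.g. with ndim = 3, row-major axes {0, 2, 1} become column-major
+  // axes {1, 0, 2}, and dims are simply reversed.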
+ auto plan = hptt::create_plan(
+ axes_cm.data(),
+ ndim,
+ TData(1),
+ X,
+ dims_cm.data(),
+ nullptr,
+ TData(0),
+ Y,
+ nullptr,
+ hptt::ESTIMATE,
+ 1 /* num_threads */);
+ if (plan == nullptr) {
+ return false;
+ }
+ plan->execute();
+ return true;
+}
+
+#endif // CAFFE2_USE_HPTT
+
+template <typename TIndex, typename TData>
+void TransposeND(
+ const int ndim,
+ const TIndex* dims,
+ const int* axes,
+ const TData* X,
+ TData* Y) {
+ std::vector<TIndex> Y_dims(ndim);
+ for (int i = 0; i < ndim; ++i) {
+ Y_dims[i] = dims[axes[i]];
+ }
+  // Measure the amount of contiguous data we can copy at once.
+ int pivot = ndim - 1;
+ TIndex block_size = 1;
+ for (; pivot >= 0 && axes[pivot] == pivot; --pivot) {
+ block_size *= Y_dims[pivot];
+ }
+ ++pivot;
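+  // e.g. dims = {2, 3, 4} with axes = {1, 0, 2}: axes[2] == 2, so
+  // block_size = 4 and pivot = 2, giving 3 * 2 = 6 blocks of 4 contiguous
+  // elements each, copied with std::memcpy below.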
+ const TIndex num_blocks = std::accumulate(
+ Y_dims.cbegin(),
+ Y_dims.cbegin() + pivot,
+ TIndex(1),
+ std::multiplies<TIndex>());
+ std::vector<TIndex> X_strides(pivot);
+ utils::ComputeTransposedStrides<TIndex>(pivot, dims, axes, X_strides.data());
+ std::vector<TIndex> index(pivot, 0);
+ for (TIndex Y_index = 0; Y_index < num_blocks; ++Y_index) {
+ const TIndex X_index = std::inner_product(
+ X_strides.cbegin(), X_strides.cend(), index.cbegin(), TIndex(0));
+ if (block_size == 1) {
+ Y[Y_index] = X[X_index];
+ } else {
+ std::memcpy(
+ Y + block_size * Y_index,
+ X + block_size * X_index,
+ block_size * sizeof(TData));
+ }
+ utils::IncreaseIndexInDims<TIndex>(pivot, Y_dims.data(), index.data());
+ }
+}
+
+template <typename TIndex, typename TData>
+void TransposeImpl(
+ const int ndim,
+ const TIndex* dims,
+ const int* axes,
+ const TData* X,
+ TData* Y) {
+ const TIndex size =
+ std::accumulate(dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());
+ if (size == 0) {
+ return;
+ }
+ if (utils::IsIdentityPermutation(ndim, axes)) {
+ std::memcpy(Y, X, size * sizeof(TData));
+ return;
+ }
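+  // A permutation that only swaps the last two axes reduces to N independent
+  // 2D transposes over contiguous H x W slices.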
+ if (utils::IsBatchTranspose2D(ndim, axes)) {
+ const TIndex H = dims[ndim - 2];
+ const TIndex W = dims[ndim - 1];
+ const TIndex N = size / (H * W);
+ for (TIndex i = 0; i < N; ++i) {
+ Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W);
+ }
+ return;
+ }
+ TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);
+}
+
+#ifdef CAFFE2_USE_HPTT
+
+#define CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(TIndex, TData) \
+ template <> \
+ void TransposeImpl<TIndex, TData>( \
+ const int ndim, \
+ const TIndex* dims, \
+ const int* axes, \
+ const TData* X, \
+      TData* Y) {                                                    \
+ const TIndex size = std::accumulate( \
+ dims, dims + ndim, TIndex(1), std::multiplies<TIndex>()); \
+ if (size == 0) { \
+ return; \
+ } \
+ if (utils::IsIdentityPermutation(ndim, axes)) { \
+ std::memcpy(Y, X, size * sizeof(TData)); \
+ return; \
+ } \
+ if (TransposeByHPTT(ndim, dims, axes, X, Y)) { \
+ return; \
+ } \
+ if (utils::IsBatchTranspose2D(ndim, axes)) { \
+ const TIndex H = dims[ndim - 2]; \
+ const TIndex W = dims[ndim - 1]; \
+ const TIndex N = size / (H * W); \
+ for (TIndex i = 0; i < N; ++i) { \
+ Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W); \
+ } \
+ return; \
+ } \
+ TransposeND<TIndex, TData>(ndim, dims, axes, X, Y); \
+ }
+CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, float)
+CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, float)
+CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, double)
+CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, double)
+#undef CAFFE2_SPECIALIZED_TRANSPOSE_IMPL
+
+#endif // CAFFE2_USE_HPTT
+
+} // namespace
+
+#define CAFFE2_SPECIALIZED_TRANSPOSE(TIndex, TData) \
+ template <> \
+ C10_EXPORT void Transpose<TIndex, TData, CPUContext>( \
+ const int ndim, \
+ const TIndex* dims, \
+ const int* axes, \
+ const TData* X, \
+ TData* Y, \
+ CPUContext* /* context */) { \
+ TransposeImpl<TIndex, TData>(ndim, dims, axes, X, Y); \
+ }
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, float)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, float)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, double)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, double)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int32_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int32_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int64_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int64_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint8_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint8_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint16_t)
+CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint16_t)
+#undef CAFFE2_SPECIALIZED_TRANSPOSE
+
+#define CAFFE2_SPECIALIZED_NCHW2NHWC(T) \
+ template <> \
+ C10_EXPORT void NCHW2NHWC<T, CPUContext>( \
+ const int N, \
+ const int C, \
+ const int HxW, \
+ const T* X, \
+ T* Y, \
+ CPUContext* /* context */) { \
+ const int stride = C * HxW; \
+ for (int i = 0; i < N; ++i) { \
+      Transpose2D<int, T>(C, HxW, X + i * stride, Y + i * stride);   \
+ } \
+ }
+CAFFE2_SPECIALIZED_NCHW2NHWC(float)
+#undef CAFFE2_SPECIALIZED_NCHW2NHWC
+
+#define CAFFE2_SPECIALIZED_NHWC2NCHW(T) \
+ template <> \
+ C10_EXPORT void NHWC2NCHW<T, CPUContext>( \
+ const int N, \
+ const int C, \
+ const int HxW, \
+ const T* X, \
+ T* Y, \
+ CPUContext* /* context */) { \
+ const int stride = HxW * C; \
+ for (int i = 0; i < N; ++i) { \
+      Transpose2D<int, T>(HxW, C, X + i * stride, Y + i * stride);   \
+ } \
+ }
+CAFFE2_SPECIALIZED_NHWC2NCHW(float)
+#undef CAFFE2_SPECIALIZED_NHWC2NCHW
+
+} // namespace math
+} // namespace caffe2
--- /dev/null
+#include "caffe2/utils/math/transpose.h"
+
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+#include "caffe2/core/common_gpu.h"
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/utils/math/utils.h"
+
+namespace caffe2 {
+namespace math {
+
+namespace {
+
+constexpr int kTileDim = 32;
+constexpr int kBlockRows = 8;
+
+// Splits the original matrix into submatrices with size 32 * 32.
+// Each block transposes one submatrix by loading it into shared memory.
+// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
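+// The grid is launched with N * dh * dw blocks, one per tile.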
+template <typename TIndex, typename TData>
+__global__ void BatchTranspose2DCUDAKernel(
+ const TIndex N,
+ const TIndex H,
+ const TIndex W,
+ const TIndex dh,
+ const TIndex dw,
+ const TData* X,
+ TData* Y) {
+ __shared__ TData tile[kTileDim][kTileDim + 1];
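+  // The extra column keeps threads of a warp on distinct shared-memory banks
+  // when the tile is read column-wise below.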
+ const TIndex n = blockIdx.x / (dh * dw);
+ const TIndex k = blockIdx.x % (dh * dw);
+ const TIndex r = k / dw;
+ const TIndex c = k % dw;
+ const TIndex offset = n * H * W;
+ int x = c * kTileDim + threadIdx.x;
+ int y = r * kTileDim + threadIdx.y;
+ if (x < W) {
+ for (int i = 0; i < kTileDim && y + i < H; i += kBlockRows) {
+#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
+ tile[threadIdx.y + i][threadIdx.x] = __ldg(X + offset + (y + i) * W + x);
+#else
+ tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];
+#endif
+ }
+ }
+ __syncthreads();
+ x = r * kTileDim + threadIdx.x;
+ y = c * kTileDim + threadIdx.y;
+ if (x < H) {
+ for (int i = 0; i < kTileDim && y + i < W; i += kBlockRows) {
+ Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
+ }
+ }
+}
+
+template <typename TIndex, typename TData>
+void BatchTranspose2DCUDAImpl(
+ const TIndex N,
+ const TIndex H,
+ const TIndex W,
+ const TData* X,
+ TData* Y,
+ CUDAContext* context) {
+ const TIndex dh = DivUp<TIndex>(H, kTileDim);
+ const TIndex dw = DivUp<TIndex>(W, kTileDim);
+ BatchTranspose2DCUDAKernel<TIndex, TData>
+ <<<N * dh * dw, dim3(kTileDim, kBlockRows), 0, context->cuda_stream()>>>(
+ N, H, W, dh, dw, X, Y);
+}
+
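+// For a single matrix, cuBLAS geam (Y = alpha * op(A) + beta * op(B)) with
+// alpha = 1 and beta = 0 performs the transpose in one library call; batched
+// inputs keep using the tiled kernel above.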
+#define DELEGATE_TRANSPOSE_2D_CUDA_IMPL(TIndex, TData, CuBLASFunc) \
+ template <> \
+ void BatchTranspose2DCUDAImpl<TIndex, TData>( \
+ const TIndex N, \
+ const TIndex H, \
+ const TIndex W, \
+ const TData* X, \
+ TData* Y, \
+ CUDAContext* context) { \
+ if (N == 1) { \
+ const TData kAlpha = TData(1); \
+ const TData kBeta = TData(0); \
+ CUBLAS_ENFORCE(cublasSetPointerMode( \
+ context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
+ CUBLAS_ENFORCE(CuBLASFunc( \
+ context->cublas_handle(), \
+ CUBLAS_OP_T, \
+ CUBLAS_OP_N, \
+ H, \
+ W, \
+ &kAlpha, \
+ X, \
+ W, \
+ &kBeta, \
+ Y, \
+ H, \
+ Y, \
+ H)); \
+ } else { \
+ const TIndex dh = DivUp<TIndex>(H, kTileDim); \
+ const TIndex dw = DivUp<TIndex>(W, kTileDim); \
+ BatchTranspose2DCUDAKernel<TIndex, TData> \
+ <<<N * dh * dw, \
+ dim3(kTileDim, kBlockRows), \
+ 0, \
+ context->cuda_stream()>>>(N, H, W, dh, dw, X, Y); \
+ } \
+ }
+DELEGATE_TRANSPOSE_2D_CUDA_IMPL(std::int32_t, float, cublasSgeam)
+DELEGATE_TRANSPOSE_2D_CUDA_IMPL(std::int64_t, float, cublasSgeam)
+DELEGATE_TRANSPOSE_2D_CUDA_IMPL(std::int32_t, double, cublasDgeam)
+DELEGATE_TRANSPOSE_2D_CUDA_IMPL(std::int64_t, double, cublasDgeam)
+#undef DELEGATE_TRANSPOSE_2D_CUDA_IMPL
+
+template <typename TIndex, typename TData, int D>
+__global__ void TransposeCUDAKernel(
+ const TIndex size,
+ const SimpleArray<TIndex, D> X_strides,
+ const SimpleArray<TIndex, D> Y_dims,
+ const TData* X,
+ TData* Y) {
+ const int Y_index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
+ if (Y_index < size) {
+ TIndex X_index = 0;
+ TIndex v = Y_index;
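+    // Peel off Y's coordinates dimension by dimension and accumulate the
+    // permuted X strides to locate the source element.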
+#pragma unroll
+ for (int i = D - 1; i >= 0; --i) {
+ X_index += v % Y_dims.data[i] * X_strides.data[i];
+ v /= Y_dims.data[i];
+ }
+#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
+ Y[Y_index] = __ldg(X + X_index);
+#else
+ Y[Y_index] = X[X_index];
+#endif
+ }
+}
+
+template <typename TIndex, typename TData, int D>
+void TransposeCUDAImpl(
+ const TIndex* dims,
+ const int* axes,
+ const TData* X,
+ TData* Y,
+ CUDAContext* context) {
+ SimpleArray<TIndex, D> X_strides;
+ SimpleArray<TIndex, D> Y_dims;
+ utils::ComputeTransposedStrides<TIndex>(D, dims, axes, X_strides.data);
+ TIndex size = 1;
+ for (int i = 0; i < D; ++i) {
+ Y_dims.data[i] = dims[axes[i]];
+ size *= dims[i];
+ }
+ const TIndex M = DivUp<TIndex>(size, CAFFE_CUDA_NUM_THREADS);
+ TransposeCUDAKernel<TIndex, TData, D>
+ <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
+ size, X_strides, Y_dims, X, Y);
+}
+
+} // namespace
+
+#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(TIndex, TData) \
+ template <> \
+ CAFFE2_CUDA_EXPORT void Transpose<TIndex, TData, CUDAContext>( \
+ const int ndim, \
+ const TIndex* dims, \
+ const int* axes, \
+ const TData* X, \
+ TData* Y, \
+ CUDAContext* context) { \
+ const TIndex size = std::accumulate( \
+ dims, dims + ndim, TIndex(1), std::multiplies<TIndex>()); \
+ if (size == 0) { \
+ return; \
+ } \
+ if (utils::IsIdentityPermutation(ndim, axes)) { \
+ context->template CopySameDevice<TData>(size, X, Y); \
+ return; \
+ } \
+ if (utils::IsBatchTranspose2D(ndim, axes)) { \
+      const TIndex H = dims[ndim - 2];                                  \
+      const TIndex W = dims[ndim - 1];                                  \
+      const TIndex N = size / (H * W);                                  \
+ BatchTranspose2DCUDAImpl<TIndex, TData>(N, H, W, X, Y, context); \
+ return; \
+ } \
+ DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( \
+ ndim, TransposeCUDAImpl, TIndex, TData, dims, axes, X, Y, context); \
+ }
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int32_t, float)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int64_t, float)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int32_t, double)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int64_t, double)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int32_t, std::int32_t)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int64_t, std::int32_t)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int32_t, std::int64_t)
+CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(std::int64_t, std::int64_t)
+#undef CAFFE2_SPECIALIZED_CUDA_TRANSPOSE
+
+#define CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC(T) \
+ template <> \
+ CAFFE2_CUDA_EXPORT void NCHW2NHWC<T, CUDAContext>( \
+ const int N, \
+ const int C, \
+ const int HxW, \
+ const T* X, \
+ T* Y, \
+ CUDAContext* context) { \
+ BatchTranspose2DCUDAImpl<int, T>(N, C, HxW, X, Y, context); \
+ }
+CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC(float)
+#undef CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC
+
+#define CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW(T) \
+ template <> \
+ CAFFE2_CUDA_EXPORT void NHWC2NCHW<T, CUDAContext>( \
+ const int N, \
+ const int C, \
+ const int HxW, \
+ const T* X, \
+ T* Y, \
+ CUDAContext* context) { \
+ BatchTranspose2DCUDAImpl<int, T>(N, HxW, C, X, Y, context); \
+ }
+CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW(float)
+#undef CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW
+
+} // namespace math
+} // namespace caffe2
--- /dev/null
+#ifndef CAFFE2_UTILS_MATH_TRANSPOSE_H_
+#define CAFFE2_UTILS_MATH_TRANSPOSE_H_
+
+#include "caffe2/core/common.h"
+#include "caffe2/core/types.h"
+
+namespace caffe2 {
+namespace math {
+
+// Transposes tensor X, whose shape is given by dims, according to the
+// permutation in axes, and writes the result to tensor Y.
+template <typename TIndex, typename TData, class Context>
+CAFFE2_API void Transpose(
+ int ndim,
+ const TIndex* dims,
+ const int* axes,
+ const TData* X,
+ TData* Y,
+ Context* context);
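+// e.g. ndim = 2, dims = {2, 3}, axes = {1, 0} writes the 3 x 2 transpose of
+// a row-major 2 x 3 matrix X into Y.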
+
+template <typename T, class Context>
+CAFFE2_API void
+NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y, Context* context);
+
+template <typename T, class Context>
+CAFFE2_API void
+NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y, Context* context);
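+// NCHW2NHWC / NHWC2NCHW transpose the C x HxW (resp. HxW x C) matrix of each
+// of the N images, converting between the two memory layouts.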
+
+} // namespace math
+} // namespace caffe2
+
+#endif // CAFFE2_UTILS_MATH_TRANSPOSE_H_
namespace math {
namespace utils {
-void IncreaseIndexInDims(const int n, const int* dims, int* index) {
- for (int i = n - 1; i >= 0; --i) {
- ++index[i];
- if (index[i] >= dims[i]) {
- index[i] -= dims[i];
- } else {
- break;
- }
+#define CAFFE2_SPECIALIZED_INCREASE_INDEX_IN_DIMS(TIndex) \
+ template <> \
+ C10_EXPORT void IncreaseIndexInDims<TIndex>( \
+ const int ndim, const TIndex* dims, TIndex* index) { \
+ for (int i = ndim - 1; i >= 0; --i) { \
+ ++index[i]; \
+ if (index[i] >= dims[i]) { \
+ index[i] -= dims[i]; \
+ } else { \
+ break; \
+ } \
+ } \
}
-}
+CAFFE2_SPECIALIZED_INCREASE_INDEX_IN_DIMS(std::int32_t)
+CAFFE2_SPECIALIZED_INCREASE_INDEX_IN_DIMS(std::int64_t)
+#undef CAFFE2_SPECIALIZED_INCREASE_INDEX_IN_DIMS
int GetIndexFromDims(const int n, const int* dims, const int* index) {
int sum = 0;
}
}
-void ComputeTransposedStrides(
- const int ndim,
- const int* dims,
- const int* axes,
- int* strides) {
- std::vector<int> buff(ndim);
- int cur_stride = 1;
- for (int i = ndim - 1; i >= 0; --i) {
- buff[i] = cur_stride;
- cur_stride *= dims[i];
- }
- for (int i = 0; i < ndim; ++i) {
- strides[i] = buff[axes[i]];
+#define CAFFE2_SPECIALIZED_COMPUTE_TRANSPOSED_STRIDES(TIndex) \
+ template <> \
+ C10_EXPORT void ComputeTransposedStrides<TIndex>( \
+ const int ndim, const TIndex* dims, const int* axes, TIndex* strides) { \
+ std::vector<TIndex> buff(ndim); \
+ TIndex cur_stride = 1; \
+ for (int i = ndim - 1; i >= 0; --i) { \
+ buff[i] = cur_stride; \
+ cur_stride *= dims[i]; \
+ } \
+ for (int i = 0; i < ndim; ++i) { \
+ strides[i] = buff[axes[i]]; \
+ } \
}
-}
+CAFFE2_SPECIALIZED_COMPUTE_TRANSPOSED_STRIDES(std::int32_t)
+CAFFE2_SPECIALIZED_COMPUTE_TRANSPOSED_STRIDES(std::int64_t)
+#undef CAFFE2_SPECIALIZED_COMPUTE_TRANSPOSED_STRIDES
} // namespace utils
} // namespace math
#ifndef CAFFE2_UTILS_MATH_UTILS_H_
#define CAFFE2_UTILS_MATH_UTILS_H_
+#include <vector>
+
#include "caffe2/core/common.h"
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \
}
// Increase the index digits by one based on dims.
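+// e.g. with dims = {2, 3}, index {0, 2} advances to {1, 0}, like a
+// mixed-radix counter.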
-CAFFE2_API void IncreaseIndexInDims(const int n, const int* dims, int* index);
+template <typename TIndex>
+CAFFE2_API void
+IncreaseIndexInDims(int ndim, const TIndex* dims, TIndex* index);
// Get index value from dims and index digits.
CAFFE2_API int GetIndexFromDims(const int n, const int* dims, const int* index);
CAFFE2_API void
ComputeTransposeAxesForReduceOp(const int ndim, const int* dims, int* axes);
+template <typename TIndex>
CAFFE2_API void ComputeTransposedStrides(
- const int ndim,
- const int* dims,
+ int ndim,
+ const TIndex* dims,
const int* axes,
- int* strides);
+ TIndex* strides);
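+// e.g. dims = {2, 3, 4} (contiguous strides {12, 4, 1}) with axes = {2, 0, 1}
+// yields strides = {1, 12, 4}.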
} // namespace utils
CAFFE2_SPECIALIZED_COPYVECTOR(float)
#undef CAFFE2_SPECIALIZED_COPYVECTOR
-namespace {
-
-#ifdef CAFFE2_USE_HPTT
-
-bool TransposeWithHPTT(
- const int ndim,
- const int* dims,
- const int* axes,
- const float* X,
- float* Y) {
- std::vector<int> axes_cm(ndim);
- std::vector<int> dims_cm(ndim);
- // Convert row-major index to column-major.
- const auto cm_fn = [ndim](const int i) { return ndim - i - 1; };
- for (int i = 0; i < ndim; ++i) {
- axes_cm[i] = cm_fn(axes[cm_fn(i)]);
- dims_cm[i] = dims[cm_fn(i)];
- }
-
- // HPTT doesn't handle 0 sized inputs.
- for (auto dim : dims_cm) {
- if (dim <= 0) {
- return false;
- }
- }
- auto plan = hptt::create_plan(
- axes_cm.data(),
- ndim,
- 1.0,
- X,
- dims_cm.data(),
- nullptr,
- 0.0,
- Y,
- nullptr,
- hptt::ESTIMATE,
- 1);
- if (plan == nullptr) {
- return false;
- }
- plan->execute();
- return true;
-}
-
-#endif // CAFFE2_USE_HPTT
-
-template <typename T>
-void Transpose2D(const int rows, const int cols, const T* X, T* Y);
-
-#ifdef CAFFE2_USE_MKL
-
-#define DELEGATE_TRANSPOSE_2D_FUNCTION(T, Func) \
- template <> \
- void Transpose2D<T>(const int rows, const int cols, const T* X, T* Y) { \
- Func('R', 'T', rows, cols, T(1), X, cols, Y, rows); \
- }
-DELEGATE_TRANSPOSE_2D_FUNCTION(float, mkl_somatcopy);
-DELEGATE_TRANSPOSE_2D_FUNCTION(double, mkl_domatcopy);
-#undef DELEGATE_TRANSPOSE_2D_FUNCTION
-
-#endif // CAFFE2_USE_MKL
-
-#define CAFFE2_SPECIALIZED_TRANSPOSE_2D(T) \
- template <> \
- void Transpose2D<T>(const int rows, const int cols, const T* X, T* Y) { \
- EigenMatrixMap<T>(Y, rows, cols) = \
- ConstEigenMatrixMap<T>(X, cols, rows).transpose(); \
- }
-
-#ifndef CAFFE2_USE_MKL
-
-template <>
-void Transpose2D<float>(
- const int rows,
- const int cols,
- const float* X,
- float* Y) {
-#ifdef CAFFE2_USE_HPTT
- const std::array<int, 2> dims = {rows, cols};
- const std::array<int, 2> axes = {1, 0};
- if (TransposeWithHPTT(2, dims.data(), axes.data(), X, Y)) {
- return;
- }
-#endif // CAFFE2_USE_HPTT
- EigenMatrixMap<float>(Y, rows, cols) =
- ConstEigenMatrixMap<float>(X, cols, rows).transpose();
-}
-
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(double)
-
-#endif // CAFFE2_USE_MKL
-
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(int)
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(int64_t)
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint8_t)
-CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint16_t)
-
-#undef CAFFE2_SPECIALIZED_TRANSPOSE_2D
-
-std::vector<int>
-ComputeXStrides(const int ndim, const int* dims, const int* axes) {
- std::vector<int> x_strides(ndim);
- std::vector<int> buff(ndim);
- int cur_stride = 1;
- for (int i = ndim - 1; i >= 0; --i) {
- buff[i] = cur_stride;
- cur_stride *= dims[i];
- }
- for (int i = 0; i < ndim; ++i) {
- x_strides[i] = buff[axes[i]];
- }
- return x_strides;
-}
-
-template <typename T>
-void TransposeND(
- const int ndim,
- const int* dims,
- const int* axes,
- const T* X,
- T* Y) {
- std::vector<int> Y_dims(ndim);
- for (int i = 0; i < ndim; ++i) {
- Y_dims[i] = dims[axes[i]];
- }
- // Measure amount of contiguous data we can copy at once
- int block_size = 1;
- int num_shared_idx = 0;
- for (int i = ndim - 1; i >= 0 && axes[i] == i; --i) {
- block_size *= Y_dims[i];
- ++num_shared_idx;
- }
- const int itr_axes = ndim - num_shared_idx;
- const int num_blocks = std::accumulate(
- Y_dims.cbegin(), Y_dims.cbegin() + itr_axes, 1, std::multiplies<int>());
- const std::vector<int> X_strides = ComputeXStrides(itr_axes, dims, axes);
- std::vector<int> index(itr_axes, 0);
- for (int Y_index = 0; Y_index < num_blocks; ++Y_index) {
- const int X_index = std::inner_product(
- X_strides.cbegin(), X_strides.cend(), index.cbegin(), 0);
- if (block_size == 1) {
- Y[Y_index] = X[X_index];
- } else {
- std::memcpy(
- Y + block_size * Y_index,
- X + block_size * X_index,
- block_size * sizeof(T));
- }
- utils::IncreaseIndexInDims(itr_axes, Y_dims.data(), index.data());
- }
-}
-
-template <typename T>
-void TransposeCPUImpl(
- const int ndim,
- const int* dims,
- const int* axes,
- const T* X,
- T* Y) {
- if (utils::IsIdentityPermutation(ndim, axes)) {
- const int size =
- std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());
- std::memcpy(Y, X, size * sizeof(T));
- return;
- }
- if (utils::IsBatchTranspose2D(ndim, axes)) {
- const int N =
- std::accumulate(dims, dims + ndim - 2, 1, std::multiplies<int>());
- const int H = dims[ndim - 2];
- const int W = dims[ndim - 1];
- for (int i = 0; i < N; ++i) {
- Transpose2D<T>(H, W, X + i * H * W, Y + i * H * W);
- }
- return;
- }
- TransposeND<T>(ndim, dims, axes, X, Y);
-}
-
-template <>
-void TransposeCPUImpl(
- const int ndim,
- const int* dims,
- const int* axes,
- const float* X,
- float* Y) {
- if (utils::IsIdentityPermutation(ndim, axes)) {
- const int size =
- std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());
- std::memcpy(Y, X, size * sizeof(float));
- return;
- }
- if (utils::IsBatchTranspose2D(ndim, axes)) {
- const int N =
- std::accumulate(dims, dims + ndim - 2, 1, std::multiplies<int>());
- const int H = dims[ndim - 2];
- const int W = dims[ndim - 1];
- for (int i = 0; i < N; ++i) {
- Transpose2D<float>(H, W, X + i * H * W, Y + i * H * W);
- }
- return;
- }
-#ifdef CAFFE2_USE_HPTT
- if (TransposeWithHPTT(ndim, dims, axes, X, Y)) {
- return;
- }
-#endif
- TransposeND<float>(ndim, dims, axes, X, Y);
-}
-
-} // namespace
-
-#define CAFFE2_SPECIALIZED_TRANSPOSE(T) \
- template <> \
- C10_EXPORT void Transpose<T, CPUContext>( \
- const int ndim, \
- const int* dims, \
- const int* axes, \
- const T* X, \
- T* Y, \
- CPUContext* /* context */) { \
- TransposeCPUImpl(ndim, dims, axes, X, Y); \
- }
-CAFFE2_SPECIALIZED_TRANSPOSE(float)
-CAFFE2_SPECIALIZED_TRANSPOSE(double)
-CAFFE2_SPECIALIZED_TRANSPOSE(int)
-CAFFE2_SPECIALIZED_TRANSPOSE(int64_t)
-CAFFE2_SPECIALIZED_TRANSPOSE(std::uint8_t)
-CAFFE2_SPECIALIZED_TRANSPOSE(std::uint16_t)
-#undef CAFFE2_SPECIALIZED_TRANSPOSE
-
-#define CAFFE2_SPECIALIZED_NCHW2NHWC(T) \
- template <> \
- C10_EXPORT void NCHW2NHWC<T, CPUContext>( \
- const int N, \
- const int C, \
- const int HxW, \
- const T* X, \
- T* Y, \
- CPUContext* /* context */) { \
- const int stride = C * HxW; \
- for (int i = 0; i < N; ++i) { \
- Transpose2D<T>(C, HxW, X + i * stride, Y + i * stride); \
- } \
- }
-CAFFE2_SPECIALIZED_NCHW2NHWC(float)
-#undef CAFFE2_SPECIALIZED_NCHW2NHWC
-
-#define CAFFE2_SPECIALIZED_NHWC2NCHW(T) \
- template <> \
- C10_EXPORT void NHWC2NCHW<T, CPUContext>( \
- const int N, \
- const int C, \
- const int HxW, \
- const T* X, \
- T* Y, \
- CPUContext* /* context */) { \
- const int stride = HxW * C; \
- for (int i = 0; i < N; ++i) { \
- Transpose2D<T>(HxW, C, X + i * stride, Y + i * stride); \
- } \
- }
-CAFFE2_SPECIALIZED_NHWC2NCHW(float)
-#undef CAFFE2_SPECIALIZED_NHWC2NCHW
-
} // namespace math
} // namespace caffe2
CAFFE2_SPECIALIZED_CUDA_INV_STD(float)
#undef CAFFE2_SPECIALIZED_CUDA_INV_STD
-namespace {
-
-constexpr int kTileDim = 32;
-constexpr int kBlockRows = 8;
-
-// Splits the original matrix into submatrices with size 32 * 32.
-// Each block transposes one submatrix by loading it into shared memory.
-// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
-template <typename T>
-__global__ void BatchTranspose2DCUDAKernel(
- const int N,
- const int H,
- const int W,
- const T* X,
- T* Y) {
- __shared__ T tile[kTileDim][kTileDim + 1];
- const int h = (H + kTileDim - 1) / kTileDim;
- const int w = (W + kTileDim - 1) / kTileDim;
- const int outer_size = N * h * w;
- for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
- const int n = i / (h * w);
- const int k = i % (h * w);
- const int r = k / w;
- const int c = k % w;
- const int offset = n * H * W;
- int x = c * kTileDim + threadIdx.x;
- int y = r * kTileDim + threadIdx.y;
- if (x < W) {
- for (int j = 0; j < kTileDim && y + j < H; j += kBlockRows) {
-#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
- tile[threadIdx.y + j][threadIdx.x] =
- __ldg(X + offset + (y + j) * W + x);
-#else
- tile[threadIdx.y + j][threadIdx.x] = X[offset + (y + j) * W + x];
-#endif
- }
- }
- __syncthreads();
- x = r * kTileDim + threadIdx.x;
- y = c * kTileDim + threadIdx.y;
- if (x < H) {
- for (int j = 0; j < kTileDim && y + j < W; j += kBlockRows) {
- Y[offset + (y + j) * H + x] = tile[threadIdx.x][threadIdx.y + j];
- }
- }
- __syncthreads();
- }
-}
-
-template <typename T, int D>
-__global__ void TransposeCUDAKernel(
- const int size,
- const SimpleArray<int, D> X_strides,
- const SimpleArray<FIXED_DIVISOR, D> Y_dims,
- const T* X,
- T* Y) {
- CUDA_1D_KERNEL_LOOP(Y_index, size) {
- int X_index = 0;
- int Y_index_val = Y_index;
-#pragma unroll
- for (int i = D - 1; i >= 0; --i) {
- int d;
- FIXED_DIVISOR_DIV_MOD(Y_dims.data[i], Y_index_val, &Y_index_val, &d);
- X_index += d * X_strides.data[i];
- }
-#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
- Y[Y_index] = __ldg(X + X_index);
-#else
- Y[Y_index] = X[X_index];
-#endif
- }
-}
-
-template <typename T, int D>
-CAFFE2_CUDA_EXPORT void TransposeCUDAImpl(
- const int* dims,
- const int* axes,
- const T* X,
- T* Y,
- CUDAContext* context) {
- SimpleArray<int, D> X_strides;
- SimpleArray<FIXED_DIVISOR, D> Y_dims;
- utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
- int size = 1;
- for (int i = 0; i < D; ++i) {
- Y_dims.data[i] = FIXED_DIVISOR(dims[axes[i]]);
- size *= dims[i];
- }
- TransposeCUDAKernel<T, D>
- <<<CAFFE_GET_BLOCKS(size),
- CAFFE_CUDA_NUM_THREADS,
- 0,
- context->cuda_stream()>>>(size, X_strides, Y_dims, X, Y);
-}
-
-} // namespace
-
-#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \
- template <> \
- CAFFE2_CUDA_EXPORT void Transpose<T, CUDAContext>( \
- const int ndim, \
- const int* dims, \
- const int* axes, \
- const T* X, \
- T* Y, \
- CUDAContext* context) { \
- if (utils::IsIdentityPermutation(ndim, axes)) { \
- const int size = \
- std::accumulate(dims, dims + ndim, 1, std::multiplies<int>()); \
- context->template CopySameDevice<T>(size, X, Y); \
- return; \
- } \
- if (utils::IsBatchTranspose2D(ndim, axes)) { \
- const int N = \
- std::accumulate(dims, dims + ndim - 2, 1, std::multiplies<int>()); \
- const int H = dims[ndim - 2]; \
- const int W = dims[ndim - 1]; \
- const int h = (H + kTileDim - 1) / kTileDim; \
- const int w = (W + kTileDim - 1) / kTileDim; \
- const int outer_size = N * h * w; \
- const dim3 dim_block(kTileDim, kBlockRows, 1); \
- BatchTranspose2DCUDAKernel<T> \
- <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), \
- dim_block, \
- 0, \
- context->cuda_stream()>>>(N, H, W, X, Y); \
- return; \
- } \
- DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \
- ndim, TransposeCUDAImpl, T, dims, axes, X, Y, context); \
- }
-CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(float)
-CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(double)
-CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int)
-CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int64_t)
-#undef CAFFE2_SPECIALIZED_CUDA_TRANSPOSE
-
-#define CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC(T) \
- template <> \
- CAFFE2_CUDA_EXPORT void NCHW2NHWC<T, CUDAContext>( \
- const int N, \
- const int C, \
- const int HxW, \
- const T* X, \
- T* Y, \
- CUDAContext* context) { \
- const int h = (C + kTileDim - 1) / kTileDim; \
- const int w = (HxW + kTileDim - 1) / kTileDim; \
- const int outer_size = N * h * w; \
- const dim3 dim_block(kTileDim, kBlockRows, 1); \
- BatchTranspose2DCUDAKernel<T> \
- <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), \
- dim_block, \
- 0, \
- context->cuda_stream()>>>(N, C, HxW, X, Y); \
- }
-CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC(float)
-#undef CAFFE2_SPECIALIZED_CUDA_NCHW2NHWC
-
-#define CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW(T) \
- template <> \
- CAFFE2_CUDA_EXPORT void NHWC2NCHW<T, CUDAContext>( \
- const int N, \
- const int C, \
- const int HxW, \
- const T* X, \
- T* Y, \
- CUDAContext* context) { \
- const int h = (HxW + kTileDim - 1) / kTileDim; \
- const int w = (C + kTileDim - 1) / kTileDim; \
- const int outer_size = N * h * w; \
- const dim3 dim_block(kTileDim, kBlockRows, 1); \
- BatchTranspose2DCUDAKernel<T> \
- <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), \
- dim_block, \
- 0, \
- context->cuda_stream()>>>(N, HxW, C, X, Y); \
- }
-CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW(float)
-#undef CAFFE2_SPECIALIZED_CUDA_NHWC2NCHW
-
} // namespace math
} // namespace caffe2
{1.0f, 1.0f, 2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f});
}
-class TransposeGPUTest : public testing::Test {
- protected:
- void SetUp() override {
- if (!HasCudaGPU()) {
- return;
- }
- option_.set_device_type(PROTO_CUDA);
- cuda_context_ = make_unique<CUDAContext>(option_);
- Blob* blob_x = ws_.CreateBlob("X");
- Blob* blob_y = ws_.CreateBlob("Y");
- X_ = BlobGetMutableTensor(blob_x, CUDA);
- Y_ = BlobGetMutableTensor(blob_y, CUDA);
- }
-
- void SetUpData(
- const std::vector<int>& X_dims,
- const std::vector<int>& axes,
- const std::vector<float>& X_data) {
- const int ndim = X_dims.size();
- std::vector<int> Y_dims(ndim);
- for (int i = 0; i < ndim; ++i) {
- Y_dims[i] = X_dims[axes[i]];
- }
- X_->Resize(X_dims);
- Y_->Resize(Y_dims);
- ASSERT_EQ(X_data.size(), X_->numel());
- cuda_context_->CopyFromCPU<float>(
- X_data.size(), X_data.data(), X_->mutable_data<float>());
- }
-
- void VerifyResult(const std::vector<float>& expected_output) {
- Blob* blob_y_host = ws_.CreateBlob("Y_host");
- auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
- Y_host->CopyFrom(*Y_);
- ASSERT_EQ(expected_output.size(), Y_host->numel());
- for (std::size_t i = 0; i < expected_output.size(); ++i) {
- EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
- }
- }
-
- void RunTransposeTest(
- const std::vector<int>& X_dims,
- const std::vector<int>& axes,
- const std::vector<float>& X_data,
- const std::vector<float>& Y_data) {
- SetUpData(X_dims, axes, X_data);
- math::Transpose<float, CUDAContext>(
- X_dims.size(),
- X_dims.data(),
- axes.data(),
- X_->data<float>(),
- Y_->mutable_data<float>(),
- cuda_context_.get());
- cuda_context_->FinishDeviceComputation();
- VerifyResult(Y_data);
- }
-
- Workspace ws_;
- DeviceOption option_;
- std::unique_ptr<CUDAContext> cuda_context_;
- Tensor* X_ = nullptr;
- Tensor* Y_ = nullptr;
-};
-
-TEST_F(TransposeGPUTest, TransposeGPUFloatTest) {
- if (!HasCudaGPU()) {
- return;
- }
- // Test for 1D transpose.
- RunTransposeTest({3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f, 2.0f, 3.0f});
-
- // Test for 2D transpose.
- RunTransposeTest(
- {2, 3},
- {1, 0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f});
-
- // Test for 3D transpose.
- RunTransposeTest(
- {2, 2, 2},
- {1, 2, 0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 5.0f, 2.0f, 6.0f, 3.0f, 7.0f, 4.0f, 8.0f});
- RunTransposeTest(
- {2, 2, 2},
- {1, 0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 2.0f, 5.0f, 6.0f, 3.0f, 4.0f, 7.0f, 8.0f});
-}
-
} // namespace
} // namespace caffe2
20, 1, 1000, 1000, l.data(), cpu_context_.get());
}
-class TransposeTest : public testing::Test {
- protected:
- void SetUp() override {
- cpu_context_ = make_unique<CPUContext>(option_);
- }
-
- void RunTransposeTest(
- const std::vector<int>& X_dims,
- const std::vector<int>& axes,
- const std::vector<float>& X_data,
- const std::vector<float>& Y_data) {
- const int ndim = X_dims.size();
- std::vector<int> Y_dims(ndim);
- for (int i = 0; i < ndim; ++i) {
- Y_dims[i] = X_dims[axes[i]];
- }
- std::vector<int64_t> X_dims_64;
- std::vector<int64_t> Y_dims_64;
- std::copy(X_dims.cbegin(), X_dims.cend(), std::back_inserter(X_dims_64));
- std::copy(Y_dims.cbegin(), Y_dims.cend(), std::back_inserter(Y_dims_64));
- ReinitializeTensor(&X_, X_dims_64, at::dtype<float>().device(CPU));
- ReinitializeTensor(&Y_, Y_dims_64, at::dtype<float>().device(CPU));
- ASSERT_EQ(X_data.size(), X_.numel());
- cpu_context_->CopyFromCPU<float>(
- X_data.size(), X_data.data(), X_.mutable_data<float>());
- math::Transpose<float, CPUContext>(
- X_dims.size(),
- X_dims.data(),
- axes.data(),
- X_.data<float>(),
- Y_.mutable_data<float>(),
- cpu_context_.get());
- ASSERT_EQ(Y_data.size(), Y_.numel());
- for (int i = 0; i < Y_.numel(); ++i) {
- EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
- }
- }
-
- DeviceOption option_;
- std::unique_ptr<CPUContext> cpu_context_;
-
- Tensor X_;
- Tensor Y_;
-};
-
-TEST_F(TransposeTest, TransposeFloatTest) {
- // Test for 1D transpose.
- RunTransposeTest({3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f, 2.0f, 3.0f});
-
- // Test for 2D transpose.
- RunTransposeTest(
- {2, 3},
- {1, 0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
- {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f});
-
- // Test for 3D transpose.
- RunTransposeTest(
- {2, 2, 2},
- {1, 2, 0},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 5.0f, 2.0f, 6.0f, 3.0f, 7.0f, 4.0f, 8.0f});
- RunTransposeTest(
- {2, 2, 2},
- {1, 0, 2},
- {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- {1.0f, 2.0f, 5.0f, 6.0f, 3.0f, 4.0f, 7.0f, 8.0f});
-}
-
} // namespace
} // namespace caffe2