From 7536887cb7fbcbaf964fcef979e28b693ec70106 Mon Sep 17 00:00:00 2001 From: Xiaomeng Yang Date: Thu, 17 Jan 2019 02:07:04 -0800 Subject: [PATCH] Add count_include_pad for avg_pool on CuDNN (#16100) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/16100 Add count_include_pad for avg_pool on CuDNN Reviewed By: houseroad Differential Revision: D13707959 fbshipit-source-id: 261f5d116066fef75cf9a5787dfbc5d12b5b9f9b --- caffe2/operators/pool_op.cc | 26 +- caffe2/operators/pool_op.cu | 113 ++++-- caffe2/operators/pool_op.h | 1 + caffe2/operators/pool_op_cudnn.cc | 594 ++++++++++++++++++++++++++++ caffe2/operators/pool_op_cudnn.cu | 534 ------------------------- caffe2/python/operator_test/pooling_test.py | 55 +++ 6 files changed, 744 insertions(+), 579 deletions(-) create mode 100644 caffe2/operators/pool_op_cudnn.cc delete mode 100644 caffe2/operators/pool_op_cudnn.cu diff --git a/caffe2/operators/pool_op.cc b/caffe2/operators/pool_op.cc index 8f5d522..314e7be 100644 --- a/caffe2/operators/pool_op.cc +++ b/caffe2/operators/pool_op.cc @@ -997,14 +997,24 @@ std::function AveragePoolDocGenerator(const char* dim) { "X", "*(type: Tensor``)* Input data tensor of shape NCHW or NHWC."); schema.Output(0, "Y", "*(type: Tensor``)* Output data tensor."); - /* - schema.Arg("kernel", "*(type: int)* Size of the window to take an average - over."); schema.Arg("stride", "*(type: int)* Stride of the window."); - schema.Arg("pad", "*(type: int)* Implicit zero padding to be added on both - sides."); schema.Arg("dilation", "*(type: int)* Parameter that controls - the stride of elements in the window."); schema.Arg("order", "*(type: - string; default: 'NCHW')* Order of the blob dimensions."); - */ + // schema.Arg( + // "kernel", "*(type: int)* Size of the window to take an average + // over."); + // schema.Arg("stride", "*(type: int)* Stride of the window."); + // schema.Arg( + // "pad", + // "*(type: int)* Implicit zero padding to be added on both sides."); + // schema.Arg( + // "dilation", + // "*(type: int)* Parameter that controls the stride of elements in the + // " "window."); + // schema.Arg( + // "order", + // "*(type: string; default: 'NCHW')* Order of the blob dimensions."); + // schema.Arg( + // "count_include_pad", + // "*(type: bool; default: False)* When True, will include the " + // "zero-padding in the averaging."); }; } diff --git a/caffe2/operators/pool_op.cu b/caffe2/operators/pool_op.cu index 83f2759..b9f6050 100644 --- a/caffe2/operators/pool_op.cu +++ b/caffe2/operators/pool_op.cu @@ -689,25 +689,53 @@ __global__ void AveragePool3DBackwardNHWCCUDAKernel( } // namespace template <> -template -bool AveragePoolFunctor::GlobalPoolingForward( - const int N, - const int C, - const int HxW, - const T* X, - T* Y, - CUDAContext* context) const { - if (kOrder == StorageOrder::NCHW) { - const std::array dims = {N * C, HxW}; - const int axis = 1; - math::ReduceMean( - 2, dims.data(), 1, &axis, 1.0f, X, Y, context); - } else { - const std::array dims = {N, HxW, C}; - const int axis = 1; - math::ReduceMean( - 3, dims.data(), 1, &axis, 1.0f, X, Y, context); +template <> +bool AveragePoolFunctor:: + GlobalPoolingForward( + const int N, + const int C, + const int HxW, + const float* X, + float* Y, + CUDAContext* context) const { + const std::array dims = {N * C, HxW}; + const int axis = 1; + math::ReduceMean( + 2, dims.data(), 1, &axis, 1.0f, X, Y, context); + return true; +} + +template <> +template <> +bool AveragePoolFunctor:: + GlobalPoolingForward( + const int N, + const int C, + 
const int HxW, + const float* X, + float* Y, + CUDAContext* context) const { + if (ones.numel() != HxW) { + ones.Resize(HxW); + math::Set( + HxW, 1.0f, ones.mutable_data(), context); } + math::GemmStridedBatched( + CblasTrans, + CblasNoTrans, + N, + C, + 1, + HxW, + 1.0f / static_cast(HxW), + X, + HxW * C, + ones.data(), + 0, + 0.0f, + Y, + C, + context); return true; } @@ -1719,25 +1747,36 @@ __global__ void MaxPool3DBackwardNHWCCUDAKernel( } // namespace template <> -template -bool MaxPoolFunctor::GlobalPoolingForward( - const int N, - const int C, - const int HxW, - const T* X, - T* Y, - CUDAContext* context) const { - if (kOrder == StorageOrder::NCHW) { - const std::array dims = {N * C, HxW}; - const int axis = 1; - math::ReduceMax( - 2, dims.data(), 1, &axis, 1.0f, X, Y, context); - } else { - const std::array dims = {N, HxW, C}; - const int axis = 1; - math::ReduceMax( - 3, dims.data(), 1, &axis, 1.0f, X, Y, context); - } +template <> +bool MaxPoolFunctor:: + GlobalPoolingForward( + const int N, + const int C, + const int HxW, + const float* X, + float* Y, + CUDAContext* context) const { + const std::array dims = {N * C, HxW}; + const int axis = 1; + math::ReduceMax( + 2, dims.data(), 1, &axis, 1.0f, X, Y, context); + return true; +} + +template <> +template <> +bool MaxPoolFunctor:: + GlobalPoolingForward( + const int N, + const int C, + const int HxW, + const float* X, + float* Y, + CUDAContext* context) const { + const std::array dims = {N, HxW, C}; + const int axis = 1; + math::ReduceMax( + 3, dims.data(), 1, &axis, 1.0f, X, Y, context); return true; } diff --git a/caffe2/operators/pool_op.h b/caffe2/operators/pool_op.h index 8c9db86..909fe12 100644 --- a/caffe2/operators/pool_op.h +++ b/caffe2/operators/pool_op.h @@ -238,6 +238,7 @@ struct AveragePoolFunctor { Context* context) const; const bool count_include_pad; + Tensor ones{Context::GetDeviceType()}; }; template diff --git a/caffe2/operators/pool_op_cudnn.cc b/caffe2/operators/pool_op_cudnn.cc new file mode 100644 index 0000000..1ed723c --- /dev/null +++ b/caffe2/operators/pool_op_cudnn.cc @@ -0,0 +1,594 @@ +#include "caffe2/operators/pool_op.h" + +#include +#include +#include +#include + +#include "caffe2/core/context_gpu.h" +#include "caffe2/core/cudnn_wrappers.h" + +namespace caffe2 { + +namespace { + +void SetTensorDescriptor( + const cudnnDataType_t data_type, + const StorageOrder order, + const std::vector& dims, + cudnnTensorDescriptor_t* desc) { + const int ndim = dims.size(); + const int N = dims[0]; + const int C = order == StorageOrder::NCHW ? dims[1] : dims[ndim - 1]; + switch (ndim) { + case 4: { + const int H = order == StorageOrder::NCHW ? dims[2] : dims[1]; + const int W = order == StorageOrder::NCHW ? dims[3] : dims[2]; + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + *desc, GetCudnnTensorFormat(order), data_type, N, C, H, W)); + break; + } + case 5: { + const int D = order == StorageOrder::NCHW ? dims[2] : dims[1]; + const int H = order == StorageOrder::NCHW ? dims[3] : dims[2]; + const int W = order == StorageOrder::NCHW ? dims[4] : dims[3]; + const std::array dims_arr = {N, C, D, H, W}; + const std::array strides_arr = order == StorageOrder::NCHW + ? 
std::array{C * D * H * W, D * H * W, H * W, W, 1} + : std::array{D * H * W * C, 1, H * W * C, W * C, C}; + CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( + *desc, data_type, 5, dims_arr.data(), strides_arr.data())); + break; + } + default: { + CAFFE_THROW("Unsupported tensor dim: ", ndim); + break; + } + } +} + +template +class CuDNNPoolOp final : public ConvPoolOpBase { + public: + CuDNNPoolOp(const OperatorDef& operator_def, Workspace* ws) + : ConvPoolOpBase(operator_def, ws), + cudnn_wrapper_(&context_), + functor_(*this), + equal_padding_(std::equal( + pads_.cbegin(), + pads_.cbegin() + kernel_.size(), + pads_.cbegin() + kernel_.size())) { + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_)); + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_)); + CUDNN_ENFORCE(cudnnCreatePoolingDescriptor(&pooling_desc_)); + if (!global_pooling_ && equal_padding_) { + if (kernel_.size() == 2) { + CUDNN_ENFORCE(cudnnSetPooling2dDescriptor( + pooling_desc_, + functor_.GetPoolingMode(), + CUDNN_NOT_PROPAGATE_NAN, + kernel_h(), + kernel_w(), + pad_t(), + pad_l(), + stride_h(), + stride_w())); + } else if (kernel_.size() == 3) { + CUDNN_ENFORCE(cudnnSetPoolingNdDescriptor( + pooling_desc_, + functor_.GetPoolingMode(), + CUDNN_NOT_PROPAGATE_NAN, + kernel_.size(), + kernel_.data(), + pads_.data(), + stride_.data())); + } + } + } + + ~CuDNNPoolOp() { + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_)); + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_)); + CUDNN_ENFORCE(cudnnDestroyPoolingDescriptor(pooling_desc_)); + } + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1); + ConvPoolOpBase::SetOutputSize(X, Y, C); + const T* X_data = X.template data(); + T* Y_data = Y->template mutable_data(); + + if (global_pooling_) { + const int HxW = X.numel() / (N * C); + if (order_ == StorageOrder::NCHW) { + return functor_.template GlobalPoolingForward( + N, C, HxW, X_data, Y_data, &context_); + } else { + return functor_.template GlobalPoolingForward( + N, C, HxW, X_data, Y_data, &context_); + } + } + + const std::vector X_HW_dims = GetDims(X); + const std::vector Y_HW_dims = GetDims(*Y); + if (order_ == StorageOrder::NHWC) { + // CuDNN Pooling on NHWC order is very slow, fallback to CUDA + // implementation. 
+ return functor_.template Forward( + N, + C, + X_HW_dims, + Y_HW_dims, + kernel_, + dilation_, + stride_, + pads_, + X.template data(), + Y->template mutable_data(), + &context_); + } else if (!equal_padding_ || ndim == 3) { + return functor_.template Forward( + N, + C, + X_HW_dims, + Y_HW_dims, + kernel_, + dilation_, + stride_, + pads_, + X.template data(), + Y->template mutable_data(), + &context_); + } + + const std::vector X_dims = X.sizes().vec(); + const std::vector Y_dims = Y->sizes().vec(); + if (cached_X_dims_ != X_dims) { + constexpr cudnnDataType_t data_type = cudnnTypeWrapper::type; + SetTensorDescriptor(data_type, order_, X_dims, &X_desc_); + SetTensorDescriptor(data_type, order_, Y_dims, &Y_desc_); + cached_X_dims_ = X_dims; + } + CUDNN_ENFORCE(cudnnPoolingForward( + cudnn_wrapper_.inline_cudnn_handle(), + pooling_desc_, + cudnnTypeWrapper::kOne(), + X_desc_, + X_data, + cudnnTypeWrapper::kZero(), + Y_desc_, + Y_data)); + + return true; + } + + private: + CuDNNWrapper cudnn_wrapper_; + cudnnTensorDescriptor_t X_desc_; + cudnnTensorDescriptor_t Y_desc_; + cudnnPoolingDescriptor_t pooling_desc_; + + const Functor functor_; + + const bool equal_padding_; + std::vector cached_X_dims_; +}; + +template +class CuDNNPoolGradientOp final : public ConvPoolOpBase { + public: + CuDNNPoolGradientOp(const OperatorDef& operator_def, Workspace* ws) + : ConvPoolOpBase(operator_def, ws), + cudnn_wrapper_(&context_), + functor_(*this), + equal_padding_(std::equal( + pads_.cbegin(), + pads_.cbegin() + kernel_.size(), + pads_.cbegin() + kernel_.size())) { + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_)); + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_)); + CUDNN_ENFORCE(cudnnCreatePoolingDescriptor(&pooling_desc_)); + if (!global_pooling_ && equal_padding_) { + if (kernel_.size() == 2) { + CUDNN_ENFORCE(cudnnSetPooling2dDescriptor( + pooling_desc_, + functor_.GetPoolingMode(), + CUDNN_NOT_PROPAGATE_NAN, + kernel_h(), + kernel_w(), + pad_t(), + pad_l(), + stride_h(), + stride_w())); + } else if (kernel_.size() == 3) { + CUDNN_ENFORCE(cudnnSetPoolingNdDescriptor( + pooling_desc_, + functor_.GetPoolingMode(), + CUDNN_NOT_PROPAGATE_NAN, + kernel_.size(), + kernel_.data(), + pads_.data(), + stride_.data())); + } + } + } + + ~CuDNNPoolGradientOp() { + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_)); + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_)); + CUDNN_ENFORCE(cudnnDestroyPoolingDescriptor(pooling_desc_)); + } + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + const auto& Y = Input(1); + const auto& dY = Input(2); + auto* dX = Output(0, X.sizes(), at::dtype()); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = order_ == StorageOrder::NCHW ? 
X.dim32(1) : X.dim32(ndim - 1); + const std::vector X_HW_dims = GetDims(X); + const std::vector Y_HW_dims = GetDims(Y); + ConvPoolOpBase::ComputePads(X_HW_dims); + const T* dY_data = dY.template data(); + const T* X_data = X.template data(); + const T* Y_data = Y.template data(); + T* dX_data = dX->template mutable_data(); + + if (global_pooling_) { + const int HxW = X.numel() / (N * C); + if (order_ == StorageOrder::NCHW) { + return functor_.template GlobalPoolingBackward( + N, C, HxW, dY_data, X_data, Y_data, dX_data, &context_); + } else { + return functor_.template GlobalPoolingBackward( + N, C, HxW, dY_data, X_data, Y_data, dX_data, &context_); + } + } + + if (order_ == StorageOrder::NHWC) { + // CuDNN Pooling on NHWC order is very slow, fallback to CUDA + // implementation. + return functor_.template Backward( + N, + C, + X_HW_dims, + Y_HW_dims, + kernel_, + dilation_, + stride_, + pads_, + dY_data, + X_data, + Y_data, + dX_data, + &context_); + } else if (!equal_padding_ || ndim == 3) { + return functor_.template Backward( + N, + C, + X_HW_dims, + Y_HW_dims, + kernel_, + dilation_, + stride_, + pads_, + dY_data, + X_data, + Y_data, + dX_data, + &context_); + } + + const std::vector X_dims = X.sizes().vec(); + const std::vector Y_dims = Y.sizes().vec(); + if (cached_X_dims_ != X_dims) { + constexpr cudnnDataType_t data_type = cudnnTypeWrapper::type; + SetTensorDescriptor(data_type, order_, X_dims, &X_desc_); + SetTensorDescriptor(data_type, order_, Y_dims, &Y_desc_); + cached_X_dims_ = X_dims; + } + CUDNN_ENFORCE(cudnnPoolingBackward( + cudnn_wrapper_.inline_cudnn_handle(), + pooling_desc_, + cudnnTypeWrapper::kOne(), + Y_desc_, + Y_data, + Y_desc_, + dY_data, + X_desc_, + X_data, + cudnnTypeWrapper::kZero(), + X_desc_, + dX_data)); + + return true; + } + + private: + CuDNNWrapper cudnn_wrapper_; + cudnnTensorDescriptor_t X_desc_; + cudnnTensorDescriptor_t Y_desc_; + cudnnPoolingDescriptor_t pooling_desc_; + + const Functor functor_; + + const bool equal_padding_; + std::vector cached_X_dims_; +}; + +struct CuDNNAveragePoolFunctor { + explicit CuDNNAveragePoolFunctor(const OperatorBase& op) + : avg_pool_functor(op) {} + + cudnnPoolingMode_t GetPoolingMode() const { + return avg_pool_functor.count_include_pad + ? 
CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING + : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + } + + template + bool GlobalPoolingForward( + const int N, + const int C, + const int HxW, + const T* X, + T* Y, + CUDAContext* context) const { + if (std::is_same::value) { + CAFFE_THROW("Float16 is not supported for average_pooling."); + return false; + } else { + return avg_pool_functor.GlobalPoolingForward( + N, C, HxW, X, Y, context); + } + } + + template + bool Forward( + const int N, + const int C, + const std::vector& X_dims, + const std::vector& Y_dims, + const std::vector& kernel, + const std::vector& dilation, + const std::vector& stride, + const std::vector& pads, + const T* X, + T* Y, + CUDAContext* context) const { + if (std::is_same::value) { + CAFFE_THROW("Float16 is not supported for average_pooling."); + return false; + } else { + return avg_pool_functor.Forward( + N, C, X_dims, Y_dims, kernel, dilation, stride, pads, X, Y, context); + } + } + + template + bool GlobalPoolingBackward( + const int N, + const int C, + const int HxW, + const T* dY, + const T* X, + const T* Y, + T* dX, + CUDAContext* context) const { + if (std::is_same::value) { + CAFFE_THROW("Float16 is not supported for average_pooling."); + return false; + } else { + return avg_pool_functor.GlobalPoolingBackward( + N, C, HxW, dY, X, Y, dX, context); + } + } + + template + bool Backward( + const int N, + const int C, + const std::vector& X_dims, + const std::vector& Y_dims, + const std::vector& kernel, + const std::vector& dilation, + const std::vector& stride, + const std::vector& pads, + const T* dY, + const T* X, + const T* Y, + T* dX, + CUDAContext* context) const { + if (std::is_same::value) { + CAFFE_THROW("Float16 is not supported for average_pooling."); + return false; + } else { + return avg_pool_functor.Backward( + N, + C, + X_dims, + Y_dims, + kernel, + dilation, + stride, + pads, + dY, + X, + Y, + dX, + context); + } + } + + const AveragePoolFunctor avg_pool_functor; +}; + +struct CuDNNMaxPoolFunctor { + explicit CuDNNMaxPoolFunctor(const OperatorBase& op) + : max_pool_functor(op), + deterministic(op.GetSingleArgument("deterministic", false)) {} + + cudnnPoolingMode_t GetPoolingMode() const { +#if CUDNN_VERSION_MIN(6, 0, 0) + return deterministic ? 
CUDNN_POOLING_MAX_DETERMINISTIC : CUDNN_POOLING_MAX; +#else + return CUDNN_POOLING_MAX; +#endif + } + + template + bool GlobalPoolingForward( + const int N, + const int C, + const int HxW, + const T* X, + T* Y, + CUDAContext* context) const { + if (std::is_same::value) { + CAFFE_THROW("Float16 is not supported for max_pooling."); + return false; + } else { + return max_pool_functor.GlobalPoolingForward( + N, C, HxW, X, Y, context); + } + } + + template + bool Forward( + const int N, + const int C, + const std::vector& X_dims, + const std::vector& Y_dims, + const std::vector& kernel, + const std::vector& dilation, + const std::vector& stride, + const std::vector& pads, + const T* X, + T* Y, + CUDAContext* context) const { + if (std::is_same::value) { + CAFFE_THROW("Float16 is not supported for max_pooling."); + return false; + } else { + return max_pool_functor.Forward( + N, C, X_dims, Y_dims, kernel, dilation, stride, pads, X, Y, context); + } + } + + template + bool GlobalPoolingBackward( + const int N, + const int C, + const int HxW, + const T* dY, + const T* X, + const T* Y, + T* dX, + CUDAContext* context) const { + if (std::is_same::value) { + CAFFE_THROW("Float16 is not supported for max_pooling."); + return false; + } else { + return max_pool_functor.GlobalPoolingBackward( + N, C, HxW, dY, X, Y, dX, context); + } + } + + template + bool Backward( + const int N, + const int C, + const std::vector& X_dims, + const std::vector& Y_dims, + const std::vector& kernel, + const std::vector& dilation, + const std::vector& stride, + const std::vector& pads, + const T* dY, + const T* X, + const T* Y, + T* dX, + CUDAContext* context) const { + if (std::is_same::value) { + CAFFE_THROW("Float16 is not supported for max_pooling."); + return false; + } else { + return max_pool_functor.Backward( + N, + C, + X_dims, + Y_dims, + kernel, + dilation, + stride, + pads, + dY, + X, + Y, + dX, + context); + } + } + + const MaxPoolFunctor max_pool_functor; + const bool deterministic; +}; + +} // namespace + +REGISTER_CUDNN_OPERATOR(AveragePool, CuDNNPoolOp); +REGISTER_CUDNN_OPERATOR( + AveragePoolGradient, + CuDNNPoolGradientOp); + +REGISTER_CUDNN_OPERATOR(AveragePool1D, CuDNNPoolOp); +REGISTER_CUDNN_OPERATOR( + AveragePool1DGradient, + CuDNNPoolGradientOp); + +REGISTER_CUDNN_OPERATOR(AveragePool2D, CuDNNPoolOp); +REGISTER_CUDNN_OPERATOR( + AveragePool2DGradient, + CuDNNPoolGradientOp); + +REGISTER_CUDNN_OPERATOR(AveragePool3D, CuDNNPoolOp); +REGISTER_CUDNN_OPERATOR( + AveragePool3DGradient, + CuDNNPoolGradientOp); + +REGISTER_CUDNN_OPERATOR(MaxPool, CuDNNPoolOp); +REGISTER_CUDNN_OPERATOR( + MaxPoolGradient, + CuDNNPoolGradientOp); + +REGISTER_CUDNN_OPERATOR(MaxPool1D, CuDNNPoolOp); +REGISTER_CUDNN_OPERATOR( + MaxPool1DGradient, + CuDNNPoolGradientOp); + +REGISTER_CUDNN_OPERATOR(MaxPool2D, CuDNNPoolOp); +REGISTER_CUDNN_OPERATOR( + MaxPool2DGradient, + CuDNNPoolGradientOp); + +REGISTER_CUDNN_OPERATOR(MaxPool3D, CuDNNPoolOp); +REGISTER_CUDNN_OPERATOR( + MaxPool3DGradient, + CuDNNPoolGradientOp); + +} // namespace caffe2 diff --git a/caffe2/operators/pool_op_cudnn.cu b/caffe2/operators/pool_op_cudnn.cu deleted file mode 100644 index b521d34..0000000 --- a/caffe2/operators/pool_op_cudnn.cu +++ /dev/null @@ -1,534 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/cudnn_wrappers.h" -#include "caffe2/operators/conv_pool_op_base.h" - -#include - -namespace caffe2 { - -namespace { - -// Explicit fast paths for avg and max global pooling due to CuDNN global -// pooling performance bug which makes 
pooling extremely slow. -template -__global__ void -global_avgpool_kernel_NCHW(const int NC, const int sz, const T* data, T* out) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - for (int j = blockIdx.x; j < NC; j += gridDim.x) { - T sum(0); - for (int k = threadIdx.x; k < sz; k += blockDim.x) { - sum += data[j * sz + k]; - } - float totalsum = BlockReduce(temp_storage).Sum(sum); - if (threadIdx.x == 0) { - out[j] = totalsum / sz; - } - __syncthreads(); - } -} - -template -__global__ void -global_avgpool_backward_NCHW(const int NC, const int sz, const T* dx, T* out) { - CUDA_1D_KERNEL_LOOP(i, NC * sz) { - out[i] = dx[i / sz] / sz; - } -} - -template -__global__ void -global_maxpool_kernel_NCHW(const int NC, const int sz, const T* data, T* out) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - for (int j = blockIdx.x; j < NC; j += gridDim.x) { - T max(-FLT_MAX); - for (int k = threadIdx.x; k < sz; k += blockDim.x) { - max = data[j * sz + k] > max ? data[j * sz + k] : max; - } - float totalmax = BlockReduce(temp_storage).Reduce(max, cub::Max()); - if (threadIdx.x == 0) { - out[j] = totalmax; - } - __syncthreads(); - } -} - -template -__global__ void global_maxpool_backward_NCHW( - const int NC, - const int sz, - const T* dx, - T* out, - const T* x, - const T* in) { - CUDA_1D_KERNEL_LOOP(i, NC * sz) { - if (in[i] == x[i / sz]) { - out[i] = dx[i / sz]; - } else { - out[i] = 0.0; - } - } -} - -template -void setTensorDescriptor( - const int size, - const StorageOrder order, - const int N, - const int C, - const int H, - const int W, - const int D, - cudnnTensorDescriptor_t& desc) { - if (size == 4) { - CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - desc, - GetCudnnTensorFormat(order), - cudnnTypeWrapper::type, - N, - C, - H, - W)); - } else { - vector dims = {N, C, H, W, D}; - vector strides; - order == NCHW - ? strides.insert(strides.end(), {C * H * W * D, H * W * D, W * D, D, 1}) - : strides.insert( - strides.end(), {H * W * D * C, 1, W * D * C, D * C, C}); - CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( - desc, - cudnnTypeWrapper::type, - size > 3 ? size : 4, - dims.data(), - strides.data())); - } -} - -} // namespace - -class CuDNNPoolOp : public ConvPoolOpBase { - public: - CuDNNPoolOp(const OperatorDef& operator_def, Workspace* ws) - : ConvPoolOpBase(operator_def, ws), - cudnn_wrapper_(&context_) { - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_)); - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_)); - CUDNN_ENFORCE(cudnnCreatePoolingDescriptor(&pooling_desc_)); - OPERATOR_NEEDS_FEATURE(kernel_.size() >=2 && kernel_.size() <=3, - "Cudnn pooling only supports 4d and 5d tensor"); - if (legacy_pad_ != LegacyPadding::CAFFE_LEGACY_POOLING) { - for (int i = 0; i < kernel_.size(); ++i) { - OPERATOR_NEEDS_FEATURE( - pads_[i] == pads_[kernel_.size() + i], - "The current padding scheme leads to unequal padding on the left " - "and right, which is not supported by cudnn."); - } - } - // Figure out the pooling descriptor. - if (operator_def.type().substr(0, 7) == "MaxPool") { - bool deterministic = - OperatorBase::GetSingleArgument("deterministic", false); -#if CUDNN_VERSION_MIN(6, 0, 0) - mode_ = - deterministic ? 
CUDNN_POOLING_MAX_DETERMINISTIC : CUDNN_POOLING_MAX; -#else - mode_ = CUDNN_POOLING_MAX; -#endif - } else if (operator_def.type().substr(0, 11) == "AveragePool") { - mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - } else { - LOG(FATAL) << "Unsupported pooling method: " << operator_def.type(); - } - } - - ~CuDNNPoolOp() { - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_)); - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_)); - CUDNN_ENFORCE(cudnnDestroyPoolingDescriptor(pooling_desc_)); - } - - template - bool DoRunWithType() { - auto& X = Input(0); - auto* Y = Output(0); - int N = 0, C = 0, H = 0, W = 0, D = 0; - int H_out = 0, W_out = 0, D_out = 0; - - // cuDNN pooling support only 2 and 3 spatial dimensions. - CAFFE_ENFORCE(X.ndim() >= 4 && X.ndim() <= 5); - - switch (order_) { - case StorageOrder::NHWC: - N = X.dim32(0); - H = X.dim32(1); - W = X.ndim() > 3 ? X.dim32(2) : 1; - D = X.ndim() > 4 ? X.dim32(3) : 1; - C = X.dim32(X.ndim() - 1); - ConvPoolOpBase::SetOutputSize(X, Y, C); - H_out = Y->dim32(1); - W_out = Y->ndim() > 3 ? Y->dim32(2) : 1; - D_out = Y->ndim() > 4 ? Y->dim32(3) : 1; - break; - case StorageOrder::NCHW: - N = X.dim32(0); - C = X.dim32(1); - H = X.dim32(2); - W = X.ndim() > 3 ? X.dim32(3) : 1; - D = X.ndim() > 4 ? X.dim32(4) : 1; - ConvPoolOpBase::SetOutputSize(X, Y, C); - H_out = Y->dim32(2); - W_out = Y->ndim() > 3 ? Y->dim32(3) : 1; - D_out = Y->ndim() > 4 ? Y->dim32(4) : 1; - break; - default: - LOG(FATAL) << "Unknown storage order: " << order_; - } - - // Fast path for global pooling, as cudnn is slow. But only - // on float, because fp16 not supported for CUB. - if (std::is_same::value) { - if (order_ == StorageOrder::NCHW && global_pooling_) { - if (mode_ == CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING) { - global_avgpool_kernel_NCHW - <<>>( - N * C, - H * W * D, - X.data(), - Y->template mutable_data()); - return true; - } - if (mode_ == CUDNN_POOLING_MAX) { - global_maxpool_kernel_NCHW - <<>>( - N * C, - H * W * D, - X.data(), - Y->template mutable_data()); - return true; - } - } - } - - if (cudnn_input_dims_ != X.sizes()) { - // Dimensions changed; we will need to re-initialize things. - VLOG(1) << "Changing the cudnn descriptor configurations."; - cudnn_input_dims_ = X.sizes().vec(); - setTensorDescriptor(X.ndim(), order_, N, C, H, W, D, bottom_desc_); - setTensorDescriptor( - Y->ndim(), order_, N, C, H_out, W_out, D_out, top_desc_); - for (int i = 0; i < kernel_.size(); ++i) { - if (pads_[i] != pads_[kernel_.size() + i]) { - CAFFE_ENFORCE( - legacy_pad_ == LegacyPadding::CAFFE_LEGACY_POOLING, - "Cudnn pooling only supports even padding on both sides, with " - "the only exception of the caffe legacy pooling case where we " - "try to preserve backward compatibility with Caffe."); - } - } - if (kernel_.size() == 2) { - CUDNN_ENFORCE(cudnnSetPooling2dDescriptor( - pooling_desc_, - mode_, - CUDNN_NOT_PROPAGATE_NAN, - kernel_h(), - kernel_w(), - pad_t(), - pad_l(), - stride_h(), - stride_w())); - } else { - CUDNN_ENFORCE(cudnnSetPoolingNdDescriptor( - pooling_desc_, - mode_, - CUDNN_NOT_PROPAGATE_NAN, - kernel_.size(), - kernel_.data(), - pads_.data(), - stride_.data())); - } - } - // Carry out the pooling computation. 
- const T* Xdata = X.template data(); - T* Ydata = Y->template mutable_data(); - CUDNN_ENFORCE(cudnnPoolingForward( - cudnn_wrapper_.inline_cudnn_handle(), - pooling_desc_, - cudnnTypeWrapper::kOne(), - bottom_desc_, - Xdata, - cudnnTypeWrapper::kZero(), - top_desc_, - Ydata)); - return true; - } - - bool RunOnDevice() final { - auto& X = Input(0); - auto* Y = Output(0); - - if (X.IsType()) { - return DoRunWithType(); - } else if (X.IsType()) { - return DoRunWithType(); - } else { - LOG(FATAL) << "Unsupported input types"; - } - return true; - } - - protected: - vector cudnn_input_dims_; - - CuDNNWrapper cudnn_wrapper_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; - cudnnPoolingDescriptor_t pooling_desc_; - cudnnPoolingMode_t mode_; - - private: -}; - -class CuDNNPoolGradientOp : public ConvPoolOpBase { - public: - CuDNNPoolGradientOp(const OperatorDef& operator_def, Workspace* ws) - : ConvPoolOpBase(operator_def, ws), - cudnn_wrapper_(&context_) { - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_)); - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_)); - CUDNN_ENFORCE(cudnnCreatePoolingDescriptor(&pooling_desc_)); - // Figure out the pooling descriptor. - if (operator_def.type() == "MaxPoolGradient" || - operator_def.type() == "MaxPool1DGradient" || - operator_def.type() == "MaxPool2DGradient" || - operator_def.type() == "MaxPool3DGradient") { - bool deterministic = - OperatorBase::GetSingleArgument("deterministic", false); -#if CUDNN_VERSION_MIN(6, 0, 0) - mode_ = - deterministic ? CUDNN_POOLING_MAX_DETERMINISTIC : CUDNN_POOLING_MAX; -#else - mode_ = CUDNN_POOLING_MAX; -#endif - } else if ( - operator_def.type() == "AveragePoolGradient" || - operator_def.type() == "AveragePool1DGradient" || - operator_def.type() == "AveragePool2DGradient" || - operator_def.type() == "AveragePool3DGradient") { - mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - } else { - LOG(FATAL) << "Unsupported pooling method: " << operator_def.type(); - } - } - - ~CuDNNPoolGradientOp() { - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_)); - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_)); - CUDNN_ENFORCE(cudnnDestroyPoolingDescriptor(pooling_desc_)); - } - - template - bool DoRunWithType() { - auto& X = Input(0); - auto& Y = Input(1); - auto& dY = Input(2); - - // cuDNN pooling support only 2 and 3 spatial dimensions. - CAFFE_ENFORCE(X.ndim() >= 4 && X.ndim() <= 5); - - auto* dX = Output(0, X.sizes(), at::dtype()); - int N = 0, C = 0, H = 0, W = 0, D = 0; - int H_out = 0, W_out = 0, D_out = 0; - switch (order_) { - case StorageOrder::NHWC: - N = X.dim32(0); - H = X.dim32(1); - W = X.ndim() > 3 ? X.dim32(2) : 1; - D = X.ndim() > 4 ? X.dim32(3) : 1; - C = X.dim32(X.ndim() - 1); - H_out = Y.dim32(1); - W_out = Y.ndim() > 3 ? Y.dim32(2) : 1; - D_out = Y.ndim() > 4 ? Y.dim32(3) : 1; - break; - case StorageOrder::NCHW: - N = X.dim32(0); - C = X.dim32(1); - H = X.dim32(2); - W = X.ndim() > 3 ? X.dim32(3) : 1; - D = X.ndim() > 4 ? X.dim32(4) : 1; - H_out = Y.dim32(2); - W_out = Y.ndim() > 3 ? Y.dim32(3) : 1; - D_out = Y.ndim() > 4 ? Y.dim32(4) : 1; - break; - default: - LOG(FATAL) << "Unknown storage order: " << order_; - } - - // Fast path for global pooling, as cudnn is slow. But only - // on float, because fp16 not supported for CUB. 
- if (std::is_same::value) { - if (order_ == StorageOrder::NCHW && global_pooling_) { - if (mode_ == CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING) { - global_avgpool_backward_NCHW - <<size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N * C, - H * W * D, - dY.data(), - dX->template mutable_data()); - return true; - } -#if CUDNN_VERSION_MIN(6, 0, 0) - if (mode_ == CUDNN_POOLING_MAX || - mode_ == CUDNN_POOLING_MAX_DETERMINISTIC) { -#else - if (mode_ == CUDNN_POOLING_MAX) { -#endif - global_maxpool_backward_NCHW - <<size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N * C, - H * W * D, - dY.data(), - dX->template mutable_data(), - Y.data(), - X.data()); - return true; - } - } - } - - if (kernel_.size() == 1) { - ConvPoolOpBase::ComputePads({H}); - } else if (kernel_.size() == 2) { - ConvPoolOpBase::ComputePads({H, W}); - } else if (kernel_.size() == 3) { - ConvPoolOpBase::ComputePads({H, W, D}); - } else { - CAFFE_THROW("Unsupported kernel size :", kernel_.size()); - } - - if (cudnn_input_dims_ != X.sizes()) { - // Dimensions changed; we will need to re-initialize things. - VLOG(1) << "Changing the cudnn descriptor configurations."; - cudnn_input_dims_ = X.sizes().vec(); - setTensorDescriptor(X.ndim(), order_, N, C, H, W, D, bottom_desc_); - setTensorDescriptor( - Y.ndim(), order_, N, C, H_out, W_out, D_out, top_desc_); - for (int i = 0; i < kernel_.size(); ++i) { - if (pads_[i] != pads_[kernel_.size() + i]) { - CAFFE_ENFORCE( - legacy_pad_ == LegacyPadding::CAFFE_LEGACY_POOLING, - "Cudnn pooling only supports even padding on both sides, with " - "the only exception of the caffe legacy pooling case where we " - "try to preserve backward compatibility with Caffe."); - } - } - if (kernel_.size() == 2) { - CUDNN_ENFORCE(cudnnSetPooling2dDescriptor( - pooling_desc_, - mode_, - CUDNN_NOT_PROPAGATE_NAN, - kernel_h(), - kernel_w(), - pad_t(), - pad_l(), - stride_h(), - stride_w())); - } else { - CUDNN_ENFORCE(cudnnSetPoolingNdDescriptor( - pooling_desc_, - mode_, - CUDNN_NOT_PROPAGATE_NAN, - kernel_.size(), - kernel_.data(), - pads_.data(), - stride_.data())); - } - } - // Carry out the pooling computation. 
- const T* Xdata = X.template data(); - const T* Ydata = Y.template data(); - const T* dYdata = dY.template data(); - T* dXdata = dX->template mutable_data(); - - CUDNN_ENFORCE(cudnnPoolingBackward( - cudnn_wrapper_.inline_cudnn_handle(), - pooling_desc_, - cudnnTypeWrapper::kOne(), - top_desc_, - Ydata, - top_desc_, - dYdata, - bottom_desc_, - Xdata, - cudnnTypeWrapper::kZero(), - bottom_desc_, - dXdata)); - return true; - } - - bool RunOnDevice() final { - auto& X = Input(0); - auto& Y = Input(1); - auto& dY = Input(2); - auto* dX = Output(0); - dX->ResizeLike(X); - - if (X.IsType()) { - return DoRunWithType(); - } else if (X.IsType()) { - return DoRunWithType(); - } else { - LOG(FATAL) << "Unsupported input types"; - } - return true; - } - - protected: - vector cudnn_input_dims_; - - CuDNNWrapper cudnn_wrapper_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; - cudnnPoolingDescriptor_t pooling_desc_; - cudnnPoolingMode_t mode_; -}; - -namespace { -REGISTER_CUDNN_OPERATOR(AveragePool, CuDNNPoolOp); -REGISTER_CUDNN_OPERATOR(AveragePoolGradient, CuDNNPoolGradientOp); - -REGISTER_CUDNN_OPERATOR(AveragePool1D, CuDNNPoolOp); -REGISTER_CUDNN_OPERATOR(AveragePool1DGradient, CuDNNPoolGradientOp); - -REGISTER_CUDNN_OPERATOR(AveragePool2D, CuDNNPoolOp); -REGISTER_CUDNN_OPERATOR(AveragePool2DGradient, CuDNNPoolGradientOp); - -REGISTER_CUDNN_OPERATOR(AveragePool3D, CuDNNPoolOp); -REGISTER_CUDNN_OPERATOR(AveragePool3DGradient, CuDNNPoolGradientOp); - -REGISTER_CUDNN_OPERATOR(MaxPool, CuDNNPoolOp); -REGISTER_CUDNN_OPERATOR(MaxPoolGradient, CuDNNPoolGradientOp); - -REGISTER_CUDNN_OPERATOR(MaxPool1D, CuDNNPoolOp); -REGISTER_CUDNN_OPERATOR(MaxPool1DGradient, CuDNNPoolGradientOp); - -REGISTER_CUDNN_OPERATOR(MaxPool2D, CuDNNPoolOp); -REGISTER_CUDNN_OPERATOR(MaxPool2DGradient, CuDNNPoolGradientOp); - -REGISTER_CUDNN_OPERATOR(MaxPool3D, CuDNNPoolOp); -REGISTER_CUDNN_OPERATOR(MaxPool3DGradient, CuDNNPoolGradientOp); -} // namespace -} // namespace caffe2 diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py index 4740d7d..3caaf24 100644 --- a/caffe2/python/operator_test/pooling_test.py +++ b/caffe2/python/operator_test/pooling_test.py @@ -398,6 +398,61 @@ class TestPooling(hu.HypothesisTestCase): self.assertGradientChecks( gc, op, [X], 0, [0], threshold=0.05, stepsize=0.005) + @given(op_type=st.sampled_from(["AveragePool", "AveragePoolND"]), + dim=st.integers(1, 3), + N=st.integers(1, 3), + C=st.integers(1, 3), + D=st.integers(3, 5), + H=st.integers(3, 5), + W=st.integers(3, 5), + kernel=st.integers(1, 3), + stride=st.integers(1, 3), + pad=st.integers(0, 2), + count_include_pad=st.booleans(), + order=st.sampled_from(["NCHW", "NHWC"]), + engine=st.sampled_from(["", "CUDNN"]), + **hu.gcs) + def test_avg_pool_count_include_pad( + self, op_type, dim, N, C, D, H, W, kernel, stride, pad, + count_include_pad, order, engine, gc, dc): + assume(pad < kernel) + if hiputl.run_in_hip(gc, dc): + if dim != 2: + assume(engine != "CUDNN") + elif engine == "CUDNN": + assume(order == "NCHW") + + if op_type.endswith("ND"): + op_type = op_type.replace("N", str(dim)) + + op = core.CreateOperator( + op_type, + ["X"], + ["Y"], + kernels=[kernel] * dim, + strides=[stride] * dim, + pads=[pad] * dim * 2, + count_include_pad=count_include_pad, + order=order, + engine=engine, + ) + + if dim == 1: + dims = [N, C, W] + axes = [0, 2, 1] + elif dim == 2: + dims = [N, C, H, W] + axes = [0, 2, 3, 1] + else: + dims = [N, C, D, H, W] + axes = [0, 2, 3, 4, 1] + X = 
np.random.randn(*dims).astype(np.float32) + if order == "NHWC": + X = np.transpose(X, axes) + + self.assertDeviceChecks(dc, op, [X], [0]) + self.assertGradientChecks(gc, op, [X], 0, [0]) + if __name__ == "__main__": import unittest -- 2.7.4
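
Editor's note: a minimal usage sketch (not part of the patch) showing how the new count_include_pad argument added by this commit is exercised from the Caffe2 Python frontend, using the same operator arguments as the test_avg_pool_count_include_pad test above. The shapes and blob names are illustrative, and it assumes a CUDA-enabled Caffe2 build so that the CUDNN engine actually takes effect; with count_include_pad=True the CuDNN path selects CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, otherwise the exclude-padding mode is kept.

    # Usage sketch (illustration only): AveragePool with count_include_pad via the CuDNN engine.
    import numpy as np

    from caffe2.proto import caffe2_pb2
    from caffe2.python import core, workspace

    assert workspace.has_gpu_support, "the CUDNN engine only applies on a CUDA device"

    device_opts = core.DeviceOption(caffe2_pb2.CUDA, 0)
    op = core.CreateOperator(
        "AveragePool",
        ["X"],
        ["Y"],
        kernels=[3, 3],
        strides=[1, 1],
        pads=[1, 1, 1, 1],
        count_include_pad=True,  # new: zero-padding counts toward the averaging denominator
        order="NCHW",
        engine="CUDNN",
        device_option=device_opts,
    )

    X = np.random.randn(1, 3, 5, 5).astype(np.float32)  # NCHW input
    workspace.FeedBlob("X", X, device_option=device_opts)
    workspace.RunOperatorOnce(op)
    Y = workspace.FetchBlob("Y")
    print(Y.shape)  # (1, 3, 5, 5): kernel 3, stride 1, pad 1 keeps the spatial size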