From 91c50aeec6eccb9e23b8c08b161dbae63de9a0b0 Mon Sep 17 00:00:00 2001 From: ngimel Date: Thu, 14 Feb 2019 21:11:30 -0800 Subject: [PATCH] Speed-up adaptive average pooling for the common case of size=1 output (#17011) Summary: When adaptive pooling has to produce a single pixel feature map, it is faster to do so by calling .mean(). Backward calls a pretty inefficient cuda kernel with atomics, which becomes ridiculously slow for halfs. For half this PR provides approx 30x speed-up for adaptive average pooling, which results in 30% end-to-end speed-up on senet. Improvements are smaller for float, but still significant (approx 5x). Also this PR unifies handling of 3d (no batch dimension) and 4d tensors, using negative dimension indices. cc ezyang for review. Pull Request resolved: https://github.com/pytorch/pytorch/pull/17011 Reviewed By: ailzhang Differential Revision: D14078747 Pulled By: soumith fbshipit-source-id: 0eb9255da2351190a6bcaf68c30e2ae2402a2dd9 --- aten/src/ATen/native/AdaptiveAveragePooling.cpp | 83 +++++-------- .../src/ATen/native/cuda/AdaptiveAveragePooling.cu | 131 ++++++--------------- aten/src/ATen/native/native_functions.yaml | 10 +- test/common_nn.py | 12 ++ tools/autograd/derivatives.yaml | 8 +- torch/csrc/jit/symbolic_script.cpp | 6 +- 6 files changed, 84 insertions(+), 166 deletions(-) diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 0c13f2e..a31211b 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -75,19 +75,6 @@ namespace { at::Tensor const& input, IntArrayRef output_size) { - int dimD = 0; - int dimH = 1; - int dimW = 2; - int64_t sizeB = 1; - int64_t sizeD = 0; - int64_t isizeH = 0; - int64_t isizeW = 0; - - int64_t istrideB = 0; - int64_t istrideD = 0; - int64_t istrideH = 0; - int64_t istrideW = 0; - for (int64_t i = 0; i < input.ndimension(); i++) { AT_CHECK(input.size(i) > 0, "adaptive_avg_pooling2d(): expected input to have non-empty spatial dimensions, " @@ -98,23 +85,14 @@ namespace { AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); - if (input.ndimension() == 4) - { - istrideB = input.stride(0); - sizeB = input.size(0); - dimD++; - dimH++; - dimW++; - } - /* sizes */ - sizeD = input.size(dimD); - isizeH = input.size(dimH); - isizeW = input.size(dimW); + int64_t sizeD = input.size(-3); + int64_t isizeH = input.size(-2); + int64_t isizeW = input.size(-1); /* strides */ - istrideD = input.stride(dimD); - istrideH = input.stride(dimH); - istrideW = input.stride(dimW); + int64_t istrideD = input.stride(-3); + int64_t istrideH = input.stride(-2); + int64_t istrideW = input.stride(-1); auto osizeH = output_size[0]; auto osizeW = output_size[1]; @@ -138,16 +116,15 @@ namespace { } else { - output.resize_({sizeB, sizeD, osizeH, osizeW}); - + output.resize_({input.size(-4), sizeD, osizeH, osizeW}); int64_t b; #pragma omp parallel for private(b) - for (b = 0; b < sizeB; b++) + for (b = 0; b < input.size(0); b++) { AT_DISPATCH_FLOATING_TYPES(input.type(), "adaptive_avg_pool2d", [&] { auto input_data = input.data(); auto output_data = output.data(); - adaptive_avg_pool2d_out_frame(input_data+b*istrideB, output_data+b*sizeD*osizeH*osizeW, + adaptive_avg_pool2d_out_frame(input_data+b*input.stride(0), output_data+b*sizeD*osizeH*osizeW, sizeD, isizeH, isizeW, osizeH, osizeW, @@ -212,29 +189,12 @@ namespace { const Tensor& gradOutput_, const Tensor& input) { - 
int dimD = 0; - int dimH = 1; - int dimW = 2; - int64_t sizeB = 1; - int sizeD; - int isizeH; - int isizeW; - int osizeH; - int osizeW; - - if (input.ndimension() == 4) { - sizeB = input.size(0); - dimD++; - dimH++; - dimW++; - } - /* sizes */ - sizeD = input.size(dimD); - isizeH = input.size(dimH); - isizeW = input.size(dimW); - osizeH = gradOutput_.size(dimH); - osizeW = gradOutput_.size(dimW); + int sizeD = input.size(-3); + int isizeH = input.size(-2); + int isizeW = input.size(-1); + int osizeH = gradOutput_.size(-2); + int osizeW = gradOutput_.size(-1); /* get contiguous gradOutput */ auto gradOutput = gradOutput_.contiguous(); @@ -260,7 +220,7 @@ namespace { { int64_t b; #pragma omp parallel for private(b) - for (b = 0; b < sizeB; b++) + for (b = 0; b < input.size(0); b++) { AT_DISPATCH_FLOATING_TYPES( input.type(), "adaptive_avg_pool2d_backward", [&] { @@ -302,6 +262,19 @@ namespace { return output; } + Tensor adaptive_avg_pool2d( + at::Tensor const& input, + IntArrayRef output_size){ + if (output_size[0] == 1 && output_size[1] == 1) { +//in this case, adaptive pooling is just computing mean over hw dimensions, which can be done more efficiently + int64_t mean_size = input.size(-1) * input.size(-2); + Tensor out = input.contiguous().view({-1, mean_size}).mean(-1); + return input.ndimension() == 3 ? out.view({input.size(0), 1, 1}) : out.view({input.size(0), input.size(1), 1, 1}); + } else { + return _adaptive_avg_pool2d(input, output_size); + } + } + Tensor& adaptive_avg_pool2d_backward_out_cpu( Tensor& gradInput, const Tensor& gradOutput, diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index 7060d09..5828248 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -222,62 +222,35 @@ namespace { AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); - - if (input.ndimension() == 3) { - int64_t sizeD = input.size(0); - int64_t isizeH = input.size(1); - int64_t isizeW = input.size(2); - - int64_t istrideD = input.stride(0); - int64_t istrideH = input.stride(1); - int64_t istrideW = input.stride(2); - - int64_t osizeH = output_size[0]; - int64_t osizeW = output_size[1]; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.type(), "adaptive_avg_pool2d", [&] { - scalar_t *input_data = input.data(); - - output.resize_({sizeD, osizeH, osizeW}); - - scalar_t *output_data = output.data(); - - // cuda blocks & threads: - int blocksH = std::max((int)(16L / sizeD), 1); - dim3 blocks(sizeD, blocksH); - dim3 threads(32, 8); - - // run averagepool kernel - adaptiveaveragepool <<>> ( - input_data, output_data, - isizeH, isizeW, osizeH, osizeW, - istrideD, istrideH, istrideW); - } - ); + Tensor input_ = input; + int64_t grid_x = input.size(-3); + if (input.ndimension() == 4) { + input_ = input.contiguous(); + grid_x *= input_.size(-4); + } + int64_t sizeD = input_.size(-3); + int64_t isizeH = input_.size(-2); + int64_t isizeW = input_.size(-1); + + int64_t istrideD = input_.stride(-3); + int64_t istrideH = input_.stride(-2); + int64_t istrideW = input_.stride(-1); + + int64_t osizeH = output_size[0]; + int64_t osizeW = output_size[1]; + if (input.ndimension() == 4) { + output.resize_({input_.size(-4), sizeD, osizeH, osizeW}); } else { - Tensor input_ = input.contiguous(); - int64_t sizeB = input_.size(0); - int64_t sizeD = input_.size(1); - int64_t isizeH = input_.size(2); - int64_t isizeW = 
input.size(3); - - int64_t istrideD = input_.stride(1); - int64_t istrideH = input_.stride(2); - int64_t istrideW = input_.stride(3); - - int64_t osizeH = output_size[0]; - int64_t osizeW = output_size[1]; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.type(), "adaptive_avg_pool2d", [&] { + output.resize_({sizeD, osizeH, osizeW}); + } + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input_.type(), "adaptive_avg_pool2d", [&] { scalar_t *input_data = input_.data(); - - output.resize_({sizeB, sizeD, osizeH, osizeW}); - scalar_t *output_data = output.data(); // cuda blocks & threads: int blocksH = std::max((int)(16L / sizeD), 1); - dim3 blocks(sizeB * sizeD, blocksH); + dim3 blocks(grid_x, blocksH); dim3 threads(32, 8); // run averagepool kernel @@ -285,9 +258,8 @@ namespace { input_data, output_data, isizeH, isizeW, osizeH, osizeW, istrideD, istrideH, istrideW); - } + } ); - } THCudaCheck(cudaGetLastError()); } @@ -306,23 +278,25 @@ namespace { Tensor gradOutput = gradOutput_.contiguous(); - if (input.ndimension() == 3) { - int64_t sizeD = input.size(0); - int64_t isizeH = input.size(1); - int64_t isizeW = input.size(2); + int64_t sizeD = input.size(-3); + int64_t isizeH = input.size(-2); + int64_t isizeW = input.size(-1); - int64_t osizeH = gradOutput.size(1); - int64_t osizeW = gradOutput.size(2); + int64_t osizeH = gradOutput.size(-2); + int64_t osizeW = gradOutput.size(-1); + + int64_t grid_x = sizeD; + if (input.ndimension() == 4) grid_x *= input.size(-4); //bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "adaptive_avg_pool2d_backward", [&] { scalar_t *gradOutput_data = gradOutput.data(); scalar_t *gradInput_data = gradInput.data(); // cuda blocks & threads: int blocksH = std::max((int)(16L / sizeD), 1); - dim3 blocks(sizeD, blocksH); + dim3 blocks(grid_x, blocksH); dim3 threads(32, 8); if(atomic) @@ -341,43 +315,6 @@ namespace { } } ); - } else { - int64_t sizeB = input.size(0); - int64_t sizeD = input.size(1); - int64_t isizeH = input.size(2); - int64_t isizeW = input.size(3); - - int64_t osizeH = gradOutput.size(2); - int64_t osizeW = gradOutput.size(3); - - //bool atomic = //(isizeW%osizeW != 0) || (isizeH%osizeH != 0); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.type(), "adaptive_avg_pool2d_backward", [&] { - scalar_t *gradOutput_data = gradOutput.data(); - scalar_t *gradInput_data = gradInput.data(); - - // cuda blocks & threads: - int blocksH = std::max((int)(16L / sizeD), 1); - dim3 blocks(sizeB * sizeD, blocksH); - dim3 threads(32, 8); - - if(atomic) - { - // run updateGradInput kernel, accumulate gradients atomically - atomicadaptiveaveragegradinput <<>> ( - gradInput_data, gradOutput_data, - isizeH, isizeW, osizeH, osizeW); - } - else - { - // run updateGradInput kernel, accumulate gradients atomically - adaptiveaveragegradinput <<>> ( - gradInput_data, gradOutput_data, - isizeH, isizeW, osizeH, osizeW); - } - } - ); - } THCudaCheck(cudaGetLastError()); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 386b608..4125a45 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4166,17 +4166,13 @@ - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor matches_jit_signature: True python_module: nn + +- func: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda -- func: 
adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) - python_module: nn - dispatch: - CPU: adaptive_avg_pool2d_backward_out_cpu - CUDA: adaptive_avg_pool2d_backward_out_cuda - -- func: adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor +- func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor matches_jit_signature: True python_module: nn dispatch: diff --git a/test/common_nn.py b/test/common_nn.py index a614908..97d2e0a 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -1928,6 +1928,12 @@ new_module_tests = [ input_fn=lambda: torch.rand(1, 3, 5), ), dict( + module_name='AdaptiveAvgPool1d', + constructor_args=(1,), + input_fn=lambda: torch.rand(1, 3, 5), + desc='one_output', + ), + dict( module_name='AdaptiveAvgPool2d', constructor_args=(3,), input_fn=lambda: torch.rand(1, 3, 5, 6), @@ -1935,6 +1941,12 @@ new_module_tests = [ ), dict( module_name='AdaptiveAvgPool2d', + constructor_args=(1,), + input_fn=lambda: torch.rand(1, 3, 5, 6), + desc='single_1x1output', + ), + dict( + module_name='AdaptiveAvgPool2d', constructor_args=((3, 4),), input_fn=lambda: torch.rand(1, 3, 5, 6), desc='tuple', diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index be8ac7b..f8c0ca6 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -1061,8 +1061,8 @@ - name: upsample_nearest3d(Tensor self, IntArrayRef output_size) self: upsample_nearest3d_backward(grad, output_size, self.sizes()) -- name: adaptive_avg_pool2d(Tensor self, IntArrayRef output_size) - self: adaptive_avg_pool2d_backward(grad, self) +- name: _adaptive_avg_pool2d(Tensor self, IntArrayRef output_size) + self: _adaptive_avg_pool2d_backward(grad, self) - name: adaptive_avg_pool3d(Tensor self, IntArrayRef output_size) self: adaptive_avg_pool3d_backward(grad, self) @@ -1148,8 +1148,8 @@ # NN double backwards support -- name: adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) - grad_output: adaptive_avg_pool2d(grad, { grad_output.size(-2), grad_output.size(-1) }) +- name: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) + grad_output: _adaptive_avg_pool2d(grad, { grad_output.size(-2), grad_output.size(-1) }) self: zeros_like(self) - name: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) diff --git a/torch/csrc/jit/symbolic_script.cpp b/torch/csrc/jit/symbolic_script.cpp index bfbf4ae..e401c0e 100644 --- a/torch/csrc/jit/symbolic_script.cpp +++ b/torch/csrc/jit/symbolic_script.cpp @@ -303,13 +303,13 @@ const std::vector functions = { return torch.view(self, size), backward - def adaptive_avg_pool2d(self, + def _adaptive_avg_pool2d(self, output_size: List[int]): def backward(grad_output): - grad_self = torch.adaptive_avg_pool2d_backward(grad_output, self) + grad_self = torch._adaptive_avg_pool2d_backward(grad_output, self) return grad_self, None - return torch.adaptive_avg_pool2d(self, output_size), backward + return torch._adaptive_avg_pool2d(self, output_size), backward def embedding(weight, indices, -- 2.7.4
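
For illustration only (not part of the patch): a minimal Python sketch of the identity the new fast path relies on, namely that adaptive average pooling to a 1x1 output is just a mean over the spatial dimensions followed by a reshape back to (N, C, 1, 1). The tensor shape below is arbitrary, and only public torch APIs are used; the reshape-then-mean mirrors the wrapper added in AdaptiveAveragePooling.cpp, but is a standalone demonstration rather than the patched code itself.

    import torch
    import torch.nn.functional as F

    x = torch.randn(8, 64, 7, 7)  # arbitrary (N, C, H, W) input

    # Generic adaptive-pooling path: output_size=1 yields a 1x1 map per channel.
    pooled = F.adaptive_avg_pool2d(x, 1)

    # Equivalent, cheaper computation in the spirit of the new wrapper: flatten
    # the spatial dimensions, take the mean over them, reshape to (N, C, 1, 1).
    mean = x.contiguous().view(x.size(0), x.size(1), -1).mean(-1)
    mean = mean.view(x.size(0), x.size(1), 1, 1)

    print(torch.allclose(pooled, mean))  # expected: True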