From 8a07cbe5e1c2c7b56e9b46ebed0d192dc9551612 Mon Sep 17 00:00:00 2001 From: Brennan Vincent Date: Tue, 8 Jan 2019 19:51:41 -0800 Subject: [PATCH] In loop_wrapper, do not copy the passed-in functor (capture it by reference instead). (#15845) Summary: The overhead of the copy actually makes an appreciable difference when doing a lot of small reductions (i.e., when the reduced dimension is significantly smaller than the non-reduced dimensions). ``` x=torch.randn((1024,10,1024),dtype=torch.float64) torch.set_num_threads(1) %timeit x.std(1) ``` Before: 813.0 ms After: 708.25 ms Pull Request resolved: https://github.com/pytorch/pytorch/pull/15845 Differential Revision: D13603246 Pulled By: umanwizard fbshipit-source-id: 020d224d76fcb8a0b55b75b0f2937e9508891beb --- aten/src/ATen/native/TensorIterator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index c4cf591..c76dfc8 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -332,7 +332,7 @@ int TensorIterator::num_reduce_dims() const { return count; } static loop2d_t loop_wrapper(const loop_t& loop) { - return [loop](int ntensor, char** base, const int64_t* strides, int64_t size0, int64_t size1) { + return [&loop](int ntensor, char** base, const int64_t* strides, int64_t size0, int64_t size1) { auto data = PtrVector(base, base + ntensor); const int64_t* outer_strides = &strides[ntensor]; -- 2.7.4