From 14138f4605bf4ddd24808c9eb4890a11c0b789dd Mon Sep 17 00:00:00 2001
From: Brennan Vincent <btv@fb.com>
Date: Mon, 28 Jan 2019 08:47:35 -0800
Subject: [PATCH] Don't initialize a new `std::vector` in a loop. (#15850)

Summary:
Before this diff, we execute `std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {});` in every iteration of `foreach_reduced_elt`.

Change the code to only execute that line if we need it; i.e., we are actually about to parallelize.

This overhead is quite significant when we are doing a lot of small reductions in single-threaded code.

```
x=torch.randn((1024,10,1024),dtype=torch.float64)
torch.set_num_threads(1)
%timeit x.std(1)
```

Before (with #15845 applied): 708.25 ms

After: 508 ms
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15850

Differential Revision: D13612960

Pulled By: umanwizard

fbshipit-source-id: f5e61abfe0027775c97ed81ac09c997fbee741df
---
 aten/src/ATen/native/cpu/Reduce.h | 44 +++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h
index 3cab114..c0d553e 100644
--- a/aten/src/ATen/native/cpu/Reduce.h
+++ b/aten/src/ATen/native/cpu/Reduce.h
@@ -83,36 +83,42 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) {
     "the accumulate type must be default-constructible"
   );
   iter.foreach_reduced_elt([&](TensorIterator &sub_iter) {
-    auto numel = sub_iter.numel();
-    bool serial = numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region();
-    int max_threads = serial ? 1 : at::get_max_threads();
-    AT_ASSERT(max_threads > 0);
-    std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {});
-    at::parallel_for(0, numel, serial ? (1 + numel) : internal::GRAIN_SIZE,
-      [&](int64_t begin, int64_t end) {
-        auto &acc = buffer[at::get_thread_num()];
+    auto reduction_body = [&](acc_t acc, int64_t begin, int64_t end) -> acc_t {
         sub_iter.serial_for_each([&acc, &ops, &init](int ntensors, char** data, const int64_t* strides, int64_t size) {
           AT_ASSERT(ntensors == 2);
           char *in = data[1];
           int64_t stride = strides[1];
-          if (!acc && size > 0) {
-            //acc = acc_t {};
-            acc = init;
-          }
           for (int64_t i = 0; i < size; ++i) {
-            acc = ops.reduce(*acc, *(data_t*)in);
+            acc = ops.reduce(acc, *(data_t*)in);
             in += stride;
           }
         }, {begin, end});
-      });
-    acc_t acc = init;
-    for (int i = 0; i < max_threads; ++i) {
-      if (buffer[i]) {
-        acc = ops.combine(acc, *buffer[i]);
+      return acc;
+    };
+    acc_t total_acc = init;
+    auto numel = sub_iter.numel();
+    if (numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region()) {
+      total_acc = reduction_body(total_acc, 0, numel);
+    } else {
+      int max_threads = at::get_max_threads();
+      AT_ASSERT(max_threads > 0);
+      static_assert(
+        !std::is_same<acc_t, bool>::value,
+        "Concurrently modifying different references into std::vector<bool> is UB."
+      );
+      std::vector<acc_t> buffer((unsigned)max_threads, init);
+      at::parallel_for(0, numel, internal::GRAIN_SIZE,
+        [&](int64_t begin, int64_t end) {
+          auto& acc = buffer[at::get_thread_num()];
+          acc = reduction_body(acc, begin, end);
+        }
+      );
+      for (int i = 0; i < max_threads; ++i) {
+        total_acc = ops.combine(total_acc, buffer[i]);
       }
     }
     char *out = (char *)sub_iter.data_ptr(0);
-    *(data_t*)out = ops.project(acc);
+    *(data_t*)out = ops.project(total_acc);
   });
 }
 
-- 
2.7.4