From 14138f4605bf4ddd24808c9eb4890a11c0b789dd Mon Sep 17 00:00:00 2001
From: Brennan Vincent <btv@fb.com>
Date: Mon, 28 Jan 2019 08:47:35 -0800
Subject: [PATCH] Don't initialize a new `std::vector` in a loop. (#15850)

Summary:
Before this diff, we execute `std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {});` in every iteration of `foreach_reduced_elt`.

Change the code to only execute that line if we need it; i.e., we are actually about to parallelize.

This overhead is quite significant when we are doing a lot of small reductions in single-threaded code.

```
x=torch.randn((1024,10,1024),dtype=torch.float64)
torch.set_num_threads(1)
%timeit x.std(1)
```

Before (with #15845 applied): 708.25 ms

After: 508 ms
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15850

Differential Revision: D13612960

Pulled By: umanwizard

fbshipit-source-id: f5e61abfe0027775c97ed81ac09c997fbee741df
---
 aten/src/ATen/native/cpu/Reduce.h | 44 +++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h
index 3cab114..c0d553e 100644
--- a/aten/src/ATen/native/cpu/Reduce.h
+++ b/aten/src/ATen/native/cpu/Reduce.h
@@ -83,36 +83,42 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) {
     "the accumulate type must be default-constructible"
   );
   iter.foreach_reduced_elt([&](TensorIterator &sub_iter) {
-    auto numel = sub_iter.numel();
-    bool serial = numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region();
-    int max_threads = serial ? 1 : at::get_max_threads();
-    AT_ASSERT(max_threads > 0);
-    std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {});
-    at::parallel_for(0, numel, serial ? (1 + numel) : internal::GRAIN_SIZE,
-      [&](int64_t begin, int64_t end) {
-        auto &acc = buffer[at::get_thread_num()];
+    auto reduction_body = [&](acc_t acc, int64_t begin, int64_t end) -> acc_t {
         sub_iter.serial_for_each([&acc, &ops, &init](int ntensors, char** data, const int64_t* strides, int64_t size) {
           AT_ASSERT(ntensors == 2);
           char *in = data[1];
           int64_t stride = strides[1];
-          if (!acc && size > 0) {
-            //acc = acc_t {};
-            acc = init;
-          }
           for (int64_t i = 0; i < size; ++i) {
-            acc = ops.reduce(*acc, *(data_t*)in);
+            acc = ops.reduce(acc, *(data_t*)in);
             in += stride;
           }
         }, {begin, end});
-      });
-    acc_t acc = init;
-    for (int i = 0; i < max_threads; ++i) {
-      if (buffer[i]) {
-        acc = ops.combine(acc, *buffer[i]);
+      return acc;
+    };
+    acc_t total_acc = init;
+    auto numel = sub_iter.numel();
+    if (numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region()) {
+      total_acc = reduction_body(total_acc, 0, numel);
+    } else {
+      int max_threads = at::get_max_threads();
+      AT_ASSERT(max_threads > 0);
+      static_assert(
+        !std::is_same<acc_t, bool>::value,
+        "Concurrently modifying different references into std::vector<bool> is UB."
+      );
+      std::vector<acc_t> buffer((unsigned)max_threads, init);
+      at::parallel_for(0, numel, internal::GRAIN_SIZE,
+        [&](int64_t begin, int64_t end) {
+          auto& acc = buffer[at::get_thread_num()];
+          acc = reduction_body(acc, begin, end);
+        }
+      );
+      for (int i = 0; i < max_threads; ++i) {
+        total_acc = ops.combine(total_acc, buffer[i]);
       }
     }
     char *out = (char *)sub_iter.data_ptr(0);
-    *(data_t*)out = ops.project(acc);
+    *(data_t*)out = ops.project(total_acc);
   });
 }
 
-- 
2.7.4