From fb68d813be3ccbab3cbf74008a11071b9a646c75 Mon Sep 17 00:00:00 2001
From: Brennan Vincent
Date: Tue, 15 Jan 2019 19:55:13 -0800
Subject: [PATCH] Fix logic errors when accumulating reductions in output (CUDA) (#16023)

Summary:
The correct logic is as follows:

* If there is an earlier split, we need to combine with its result.
* If there is *not* a later split, we need to project before saving into the output.

(A minimal standalone sketch of this rule is included after the patch.)

This should partially fix #15837. For example:

```
In [7]: a=torch.ones([1838860800], dtype=torch.float, device="cuda:1")

In [8]: a.mean()
Out[8]: tensor(1., device='cuda:1')
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/16023

Differential Revision: D13678449

Pulled By: umanwizard

fbshipit-source-id: ab5078484c88e96bb30121b5cf24a0e8b0a8c2f8
---
 aten/src/ATen/native/TensorIterator.cpp |  1 +
 aten/src/ATen/native/TensorIterator.h   |  6 ++++++
 aten/src/ATen/native/cuda/Reduce.cuh    | 30 ++++++++++++++++++++++++++----
 test/test_cuda.py                       |  9 +++++++++
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp
index c76dfc8..87bcb05 100644
--- a/aten/src/ATen/native/TensorIterator.cpp
+++ b/aten/src/ATen/native/TensorIterator.cpp
@@ -593,6 +593,7 @@ std::unique_ptr<TensorIterator> TensorIterator::split(int dim) {
   auto copy_size = shape_[dim] / 2;
   auto this_size = shape_[dim] - copy_size;
   copy->narrow(dim, 0, copy_size);
+  copy->final_output_ &= !overlaps;
   this->narrow(dim, copy_size, this_size);
   this->accumulate_ |= overlaps;
 
diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h
index d030ff1..a13c509 100644
--- a/aten/src/ATen/native/TensorIterator.h
+++ b/aten/src/ATen/native/TensorIterator.h
@@ -224,6 +224,11 @@ struct CAFFE2_API TensorIterator {
   /// reductions.
   bool should_accumulate() const { return accumulate_; }
 
+  /// Whether this iterator produces the actual output,
+  /// as opposed to something that will be accumulated further. Only relevant for
+  /// CUDA reductions.
+  bool is_final_output() const { return final_output_; }
+
 protected:
   void mark_outputs();
   void compute_shape();
@@ -247,6 +252,7 @@ protected:
   bool compute_common_dtype_ = true;
   bool allow_cpu_scalars_ = false;
   bool promote_gpu_output_dtypes_ = false;
+  bool final_output_ = true;
 };
 
 struct TensorIterator::Builder {
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
index 6409b10..acd45c8 100644
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -237,7 +237,8 @@ struct ReduceOp {
   static constexpr int vt0 = 4;
 
   static constexpr bool can_accumulate_in_output =
-    std::is_convertible<arg_t, out_scalar_t>::value;
+    std::is_convertible<arg_t, out_scalar_t>::value
+    && std::is_convertible<out_scalar_t, arg_t>::value;
 
   ops_t ops;
@@ -250,6 +251,7 @@ struct ReduceOp {
   void* buffer;
   int* semaphores;
   bool accumulate;
+  bool final_output;
 
   ReduceOp(ops_t ops, ReduceConfig config, InputCalculator input_calc, OutputCalculator output_calc,
            const void* src, void* dst, void* buffer, int* semaphores, arg_t ident)
@@ -289,7 +291,7 @@ struct ReduceOp {
       if (accumulate) {
         value = accumulate_in_output<can_accumulate_in_output>(out, value);
       }
-      *out = ops.project(value);
+      *out = project_if_necessary<can_accumulate_in_output>(value);
     }
   }
@@ -374,9 +376,18 @@ struct ReduceOp {
     out_scalar_t* out, arg_t value,
     typename std::enable_if<can_acc>::type* = nullptr
   ) const {
-    return ops.reduce(*out, value);
+    return ops.combine(*out, value);
+  }
+
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t project_if_necessary(
+    arg_t value,
+    typename std::enable_if<can_acc>::type* = nullptr
+  ) const {
+    return final_output ? (out_scalar_t)ops.project(value) : (out_scalar_t)value;
   }
+
   // This function should never be called --
   // it's the version of `accumulate_in_output`
   // when accumulation in the output is not possible.
@@ -389,6 +400,15 @@ struct ReduceOp {
     return arg_t {};
   }
 
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t project_if_necessary(
+    arg_t value,
+    typename std::enable_if<!can_acc>::type* = nullptr
+  ) const {
+    assert(final_output);
+    return ops.project(value);
+  }
+
   C10_DEVICE arg_t global_reduce(arg_t value, out_scalar_t* out) const {
     arg_t* reduce_buffer = (arg_t*)buffer;
@@ -429,7 +449,7 @@ struct ReduceOp {
       if (accumulate) {
         value = accumulate_in_output<can_accumulate_in_output>(out, value);
       }
-      *out = ops.project(value);
+      *out = project_if_necessary<can_accumulate_in_output>(value);
     }
   }
@@ -534,6 +554,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
       (int*)semaphores.get(),
       ident);
     reduce.accumulate = iter.should_accumulate();
+    reduce.final_output = iter.is_final_output();
 
     launch_reduce_kernel(config, reduce);
   } else {
@@ -551,6 +572,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
       ident);
     AT_ASSERT(!iter.should_accumulate());
     reduce.accumulate = false;
+    reduce.final_output = true;
 
     launch_reduce_kernel(config, reduce);
   }
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 5130d23..03bd195 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -107,6 +107,7 @@ def cast_tensor(tensor, t):
 S = 10
 M = 50
+G = 275000000
 
 
 def make_tensor(t, *sizes):
@@ -243,6 +244,10 @@ def large_2d_lapack(t):
     return t(1000, 1000).normal_()
 
 
+def giant_1d_ones(t):
+    return t(G).copy_(torch.ones(G))
+
+
 def long_type(t):
     return torch.cuda.LongTensor if 'cuda' in t.__module__ else torch.LongTensor
@@ -356,6 +361,10 @@ tests = [
     ('mean', small_3d, lambda t: []),
     ('mean', small_3d, lambda t: [-1], 'neg_dim'),
     ('mean', small_3d, lambda t: [1], 'dim'),
+    ('mean', giant_1d_ones, lambda t: [], '64bit_indexing',
+        # Double here because otherwise the CPU result will be
+        # wrong.
+        [torch.DoubleTensor]),
     ('mode', small_3d, lambda t: []),
     ('mode', small_3d, lambda t: [1], 'dim'),
     ('mode', small_3d, lambda t: [-1], 'neg_dim'),
--
2.7.4
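
Not part of the patch above: a minimal, host-side C++ sketch of the accumulate / final_output rule from the summary, using mean as the reduction. The `MeanOps`, `Chunk`, and `reduce_chunk` names are illustrative only and are not PyTorch internals; the real kernel in Reduce.cuh applies the same per-output-element decision on the GPU and dispatches on `can_accumulate_in_output`.

```
// Sketch of the split-reduction bookkeeping: intermediate splits keep the raw
// accumulator; a split preceded by an earlier split combines with its partial
// result; only the final split projects into the output.
#include <cstdint>
#include <cstdio>
#include <vector>

struct MeanOps {
  // Running accumulator for mean: sum and element count.
  struct acc_t { double sum; int64_t n; };
  static acc_t reduce(acc_t a, float x)   { return {a.sum + x, a.n + 1}; }
  static acc_t combine(acc_t a, acc_t b)  { return {a.sum + b.sum, a.n + b.n}; }
  static double project(acc_t a)          { return a.sum / a.n; }
};

// One split of the reduction. `accumulate` is true if an earlier split already
// produced a partial result; `final_output` is true if no later split follows.
struct Chunk {
  const float* data;
  int64_t size;
  bool accumulate;
  bool final_output;
};

// `out_acc` carries the un-projected accumulator between splits; `out` receives
// the projected value once the final split has run.
void reduce_chunk(const Chunk& c, MeanOps::acc_t* out_acc, double* out) {
  MeanOps::acc_t value{0.0, 0};
  for (int64_t i = 0; i < c.size; i++) {
    value = MeanOps::reduce(value, c.data[i]);
  }
  if (c.accumulate) {        // earlier split exists: combine with its result
    value = MeanOps::combine(*out_acc, value);
  }
  if (c.final_output) {      // no later split: project before saving
    *out = MeanOps::project(value);
  } else {
    *out_acc = value;        // keep the raw accumulator for the next split
  }
}

int main() {
  std::vector<float> ones(1 << 20, 1.0f);
  const int64_t half = static_cast<int64_t>(ones.size()) / 2;

  MeanOps::acc_t partial{0.0, 0};
  double result = 0.0;
  // First split: nothing to combine with, and a later split exists.
  reduce_chunk({ones.data(), half, /*accumulate=*/false, /*final_output=*/false},
               &partial, &result);
  // Second split: combine with the earlier partial result, then project.
  reduce_chunk({ones.data() + half, half, /*accumulate=*/true, /*final_output=*/true},
               &partial, &result);

  std::printf("mean = %f\n", result);  // prints 1.000000
  return 0;
}
```

The first split stores the raw (sum, count) pair because a later split exists; the second split combines with that partial result and only then projects to the mean, which is exactly the behavior the two bullets in the summary describe.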