Fix logic errors when accumulating reductions in output (CUDA) (#16023)

author Brennan Vincent <btv@fb.com>

Wed, 16 Jan 2019 03:55:13 +0000 (19:55 -0800)

committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>

Wed, 16 Jan 2019 03:57:57 +0000 (19:57 -0800)
author Brennan Vincent <btv@fb.com>
Wed, 16 Jan 2019 03:55:13 +0000 (19:55 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Wed, 16 Jan 2019 03:57:57 +0000 (19:57 -0800)
diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp

index c76dfc8..87bcb05 100644 (file)
--- a/aten/src/ATen/native/TensorIterator.cpp
+++ b/aten/src/ATen/native/TensorIterator.cpp
@@ -593,6 +593,7 @@ std::unique_ptr<TensorIterator> TensorIterator::split(int dim) {
    auto copy_size = shape_[dim] / 2;
    auto this_size = shape_[dim] - copy_size;
    copy->narrow(dim, 0, copy_size);
+  copy->final_output_ &= !overlaps;
    this->narrow(dim, copy_size, this_size);
    this->accumulate_ |= overlaps;
  
diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h

index d030ff1..a13c509 100644 (file)
--- a/aten/src/ATen/native/TensorIterator.h
+++ b/aten/src/ATen/native/TensorIterator.h
@@ -224,6 +224,11 @@ struct CAFFE2_API TensorIterator {
    /// reductions.
    bool should_accumulate() const { return accumulate_; }
  
+  /// Whether this iterator produces the actual output,
+  /// as opposed to something that will be accumulated further. Only relevant for
+  /// CUDA reductions.
+  bool is_final_output() const { return final_output_; }
+
  protected:
    void mark_outputs();
    void compute_shape();
@@ -247,6 +252,7 @@ protected:
    bool compute_common_dtype_ = true;
    bool allow_cpu_scalars_ = false;
    bool promote_gpu_output_dtypes_ = false;
+  bool final_output_ = true;
  };
  
  struct TensorIterator::Builder {
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh

index 6409b10..acd45c8 100644 (file)
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -237,7 +237,8 @@ struct ReduceOp {
  
    static constexpr int vt0 = 4;
    static constexpr bool can_accumulate_in_output =
-    std::is_convertible<arg_t, out_scalar_t>::value;
+    std::is_convertible<arg_t, out_scalar_t>::value
+    && std::is_convertible<out_scalar_t, arg_t>::value;
  
  
    ops_t ops;
@@ -250,6 +251,7 @@ struct ReduceOp {
    void* buffer;
    int* semaphores;
    bool accumulate;
+  bool final_output;
  
    ReduceOp(ops_t ops, ReduceConfig config, InputCalculator input_calc, OutputCalculator output_calc,
             const void* src, void* dst, void* buffer, int* semaphores, arg_t ident)
@@ -289,7 +291,7 @@ struct ReduceOp {
        if (accumulate) {
          value = accumulate_in_output<can_accumulate_in_output>(out, value);
        }
-      *out = ops.project(value);
+      *out = project_if_necessary<can_accumulate_in_output>(value);
      }
    }
  
@@ -374,9 +376,18 @@ struct ReduceOp {
      out_scalar_t* out, arg_t value,
      typename std::enable_if<can_acc>::type* = nullptr
    ) const {
-    return ops.reduce(*out, value);
+    return ops.combine(*out, value);
+  }
+
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t project_if_necessary(
+    arg_t value,
+    typename std::enable_if<can_acc>::type* = nullptr
+  ) const {
+    return final_output ? (out_scalar_t)ops.project(value) : (out_scalar_t)value;
    }
  
+
    // This function should never be called --
    // it's the version of `accumulate_in_output`
    // when accumulation in the output is not possible.
@@ -389,6 +400,15 @@ struct ReduceOp {
      return arg_t {};
    }
  
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t project_if_necessary(
+    arg_t value,
+    typename std::enable_if<!can_acc>::type* = nullptr
+  ) const {
+    assert(final_output);
+    return ops.project(value);
+  }
+
    C10_DEVICE arg_t global_reduce(arg_t value, out_scalar_t* out) const {
      arg_t* reduce_buffer = (arg_t*)buffer;
  
@@ -429,7 +449,7 @@ struct ReduceOp {
          if (accumulate) {
            value = accumulate_in_output<can_accumulate_in_output>(out, value);
          }
-        *out = ops.project(value);
+        *out = project_if_necessary<can_accumulate_in_output>(value);
        }
      }
  
@@ -534,6 +554,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
          (int*)semaphores.get(),
          ident);
      reduce.accumulate = iter.should_accumulate();
+    reduce.final_output = iter.is_final_output();
  
      launch_reduce_kernel<ReduceConfig::NUM_THREADS>(config, reduce);
    } else {
@@ -551,6 +572,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
          ident);
      AT_ASSERT(!iter.should_accumulate());
      reduce.accumulate = false;
+    reduce.final_output = true;
  
      launch_reduce_kernel<ReduceConfig::NUM_THREADS>(config, reduce);
    }
diff --git a/test/test_cuda.py b/test/test_cuda.py

index 5130d23..03bd195 100644 (file)
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -107,6 +107,7 @@ def cast_tensor(tensor, t):
  
  S = 10
  M = 50
+G = 275000000
  
  
  def make_tensor(t, *sizes):
@@ -243,6 +244,10 @@ def large_2d_lapack(t):
      return t(1000, 1000).normal_()
  
  
+def giant_1d_ones(t):
+    return t(G).copy_(torch.ones(G))
+
+
  def long_type(t):
      return torch.cuda.LongTensor if 'cuda' in t.__module__ else torch.LongTensor
  
@@ -356,6 +361,10 @@ tests = [
      ('mean', small_3d, lambda t: []),
      ('mean', small_3d, lambda t: [-1], 'neg_dim'),
      ('mean', small_3d, lambda t: [1], 'dim'),
+    ('mean', giant_1d_ones, lambda t: [], '64bit_indexing',
+        # Double here because otherwise the CPU result will be
+        # wrong.
+        [torch.DoubleTensor]),
      ('mode', small_3d, lambda t: []),
      ('mode', small_3d, lambda t: [1], 'dim'),
      ('mode', small_3d, lambda t: [-1], 'neg_dim'),
author	Brennan Vincent <btv@fb.com>
	Wed, 16 Jan 2019 03:55:13 +0000 (19:55 -0800)
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
	Wed, 16 Jan 2019 03:57:57 +0000 (19:57 -0800)
aten/src/ATen/native/TensorIterator.cpp		patch \| blob \| history
aten/src/ATen/native/TensorIterator.h		patch \| blob \| history
aten/src/ATen/native/cuda/Reduce.cuh		patch \| blob \| history
test/test_cuda.py		patch \| blob \| history