From fb68d813be3ccbab3cbf74008a11071b9a646c75 Mon Sep 17 00:00:00 2001
From: Brennan Vincent
Date: Tue, 15 Jan 2019 19:55:13 -0800
Subject: [PATCH] Fix logic errors when accumulating reductions in output (CUDA) (#16023)

Summary:
The correct logic is as follows:

* If there is an earlier split, we need to combine with its result.
* If there is *not* a later split, we need to project before saving into the output.

(A minimal standalone sketch of this rule is included after the patch.)

This should partially fix #15837. For example:

```
In [7]: a=torch.ones([1838860800], dtype=torch.float, device="cuda:1")

In [8]: a.mean()
Out[8]: tensor(1., device='cuda:1')
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/16023

Differential Revision: D13678449

Pulled By: umanwizard

fbshipit-source-id: ab5078484c88e96bb30121b5cf24a0e8b0a8c2f8
---
 aten/src/ATen/native/TensorIterator.cpp |  1 +
 aten/src/ATen/native/TensorIterator.h   |  6 ++++++
 aten/src/ATen/native/cuda/Reduce.cuh    | 30 ++++++++++++++++++++++++++----
 test/test_cuda.py                       |  9 +++++++++
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp
index c76dfc8..87bcb05 100644
--- a/aten/src/ATen/native/TensorIterator.cpp
+++ b/aten/src/ATen/native/TensorIterator.cpp
@@ -593,6 +593,7 @@ std::unique_ptr<TensorIterator> TensorIterator::split(int dim) {
   auto copy_size = shape_[dim] / 2;
   auto this_size = shape_[dim] - copy_size;
   copy->narrow(dim, 0, copy_size);
+  copy->final_output_ &= !overlaps;
   this->narrow(dim, copy_size, this_size);
   this->accumulate_ |= overlaps;
 
diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h
index d030ff1..a13c509 100644
--- a/aten/src/ATen/native/TensorIterator.h
+++ b/aten/src/ATen/native/TensorIterator.h
@@ -224,6 +224,11 @@ struct CAFFE2_API TensorIterator {
   /// reductions.
   bool should_accumulate() const { return accumulate_; }
 
+  /// Whether this iterator produces the actual output,
+  /// as opposed to something that will be accumulated further. Only relevant for
+  /// CUDA reductions.
+  bool is_final_output() const { return final_output_; }
+
 protected:
   void mark_outputs();
   void compute_shape();
@@ -247,6 +252,7 @@ protected:
   bool compute_common_dtype_ = true;
   bool allow_cpu_scalars_ = false;
   bool promote_gpu_output_dtypes_ = false;
+  bool final_output_ = true;
 };
 
 struct TensorIterator::Builder {
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
index 6409b10..acd45c8 100644
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -237,7 +237,8 @@ struct ReduceOp {
   static constexpr int vt0 = 4;
 
   static constexpr bool can_accumulate_in_output =
-    std::is_convertible<arg_t, out_scalar_t>::value;
+    std::is_convertible<arg_t, out_scalar_t>::value
+    && std::is_convertible<out_scalar_t, arg_t>::value;
 
   ops_t ops;
@@ -250,6 +251,7 @@ struct ReduceOp {
   void* buffer;
   int* semaphores;
   bool accumulate;
+  bool final_output;
 
   ReduceOp(ops_t ops, ReduceConfig config, InputCalculator input_calc, OutputCalculator output_calc,
            const void* src, void* dst, void* buffer, int* semaphores, arg_t ident)
@@ -289,7 +291,7 @@ struct ReduceOp {
       if (accumulate) {
         value = accumulate_in_output<can_accumulate_in_output>(out, value);
       }
-      *out = ops.project(value);
+      *out = project_if_necessary<can_accumulate_in_output>(value);
     }
   }
@@ -374,9 +376,18 @@ struct ReduceOp {
     out_scalar_t* out, arg_t value,
     typename std::enable_if<can_acc>::type* = nullptr
   ) const {
-    return ops.reduce(*out, value);
+    return ops.combine(*out, value);
+  }
+
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t project_if_necessary(
+    arg_t value,
+    typename std::enable_if<can_acc>::type* = nullptr
+  ) const {
+    return final_output ? (out_scalar_t)ops.project(value) : (out_scalar_t)value;
   }
+
   // This function should never be called --
   // it's the version of `accumulate_in_output`
   // when accumulation in the output is not possible.
@@ -389,6 +400,15 @@ struct ReduceOp {
     return arg_t {};
   }
 
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t project_if_necessary(
+    arg_t value,
+    typename std::enable_if<!can_acc>::type* = nullptr
+  ) const {
+    assert(final_output);
+    return ops.project(value);
+  }
+
   C10_DEVICE arg_t global_reduce(arg_t value, out_scalar_t* out) const {
     arg_t* reduce_buffer = (arg_t*)buffer;
@@ -429,7 +449,7 @@ struct ReduceOp {
       if (accumulate) {
         value = accumulate_in_output<can_accumulate_in_output>(out, value);
       }
-      *out = ops.project(value);
+      *out = project_if_necessary<can_accumulate_in_output>(value);
     }
   }
@@ -534,6 +554,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
       (int*)semaphores.get(),
       ident);
     reduce.accumulate = iter.should_accumulate();
+    reduce.final_output = iter.is_final_output();
 
     launch_reduce_kernel(config, reduce);
   } else {
@@ -551,6 +572,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
       ident);
     AT_ASSERT(!iter.should_accumulate());
     reduce.accumulate = false;
+    reduce.final_output = true;
 
     launch_reduce_kernel(config, reduce);
   }
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 5130d23..03bd195 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -107,6 +107,7 @@ def cast_tensor(tensor, t):
 S = 10
 M = 50
+G = 275000000
 
 
 def make_tensor(t, *sizes):
@@ -243,6 +244,10 @@ def large_2d_lapack(t):
     return t(1000, 1000).normal_()
 
 
+def giant_1d_ones(t):
+    return t(G).copy_(torch.ones(G))
+
+
 def long_type(t):
     return torch.cuda.LongTensor if 'cuda' in t.__module__ else torch.LongTensor
@@ -356,6 +361,10 @@ tests = [
     ('mean', small_3d, lambda t: []),
     ('mean', small_3d, lambda t: [-1], 'neg_dim'),
     ('mean', small_3d, lambda t: [1], 'dim'),
+    ('mean', giant_1d_ones, lambda t: [], '64bit_indexing',
+        # Double here because otherwise the CPU result will be
+        # wrong.
+        [torch.DoubleTensor]),
     ('mode', small_3d, lambda t: []),
     ('mode', small_3d, lambda t: [1], 'dim'),
     ('mode', small_3d, lambda t: [-1], 'neg_dim'),
--
2.7.4
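
Not part of the patch above: a minimal, host-side C++ sketch of the accumulate / final_output rule from the summary, using mean as the reduction. The `MeanOps`, `Chunk`, and `reduce_chunk` names are illustrative only and are not PyTorch internals; the real kernel in Reduce.cuh applies the same per-output-element decision on the GPU and dispatches on `can_accumulate_in_output`.

```
// Sketch of the split-reduction bookkeeping: intermediate splits keep the raw
// accumulator; a split preceded by an earlier split combines with its partial
// result; only the final split projects into the output.
#include <cstdint>
#include <cstdio>
#include <vector>

struct MeanOps {
  // Running accumulator for mean: sum and element count.
  struct acc_t { double sum; int64_t n; };
  static acc_t reduce(acc_t a, float x)   { return {a.sum + x, a.n + 1}; }
  static acc_t combine(acc_t a, acc_t b)  { return {a.sum + b.sum, a.n + b.n}; }
  static double project(acc_t a)          { return a.sum / a.n; }
};

// One split of the reduction. `accumulate` is true if an earlier split already
// produced a partial result; `final_output` is true if no later split follows.
struct Chunk {
  const float* data;
  int64_t size;
  bool accumulate;
  bool final_output;
};

// `out_acc` carries the un-projected accumulator between splits; `out` receives
// the projected value once the final split has run.
void reduce_chunk(const Chunk& c, MeanOps::acc_t* out_acc, double* out) {
  MeanOps::acc_t value{0.0, 0};
  for (int64_t i = 0; i < c.size; i++) {
    value = MeanOps::reduce(value, c.data[i]);
  }
  if (c.accumulate) {        // earlier split exists: combine with its result
    value = MeanOps::combine(*out_acc, value);
  }
  if (c.final_output) {      // no later split: project before saving
    *out = MeanOps::project(value);
  } else {
    *out_acc = value;        // keep the raw accumulator for the next split
  }
}

int main() {
  std::vector<float> ones(1 << 20, 1.0f);
  const int64_t half = static_cast<int64_t>(ones.size()) / 2;

  MeanOps::acc_t partial{0.0, 0};
  double result = 0.0;
  // First split: nothing to combine with, and a later split exists.
  reduce_chunk({ones.data(), half, /*accumulate=*/false, /*final_output=*/false},
               &partial, &result);
  // Second split: combine with the earlier partial result, then project.
  reduce_chunk({ones.data() + half, half, /*accumulate=*/true, /*final_output=*/true},
               &partial, &result);

  std::printf("mean = %f\n", result);  // prints 1.000000
  return 0;
}
```

The first split stores the raw (sum, count) pair because a later split exists; the second split combines with that partial result and only then projects to the mean, which is exactly the behavior the two bullets in the summary describe.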