/// reductions.
bool should_accumulate() const { return accumulate_; }
+ /// Whether this iterator produces the actual output,
+ /// as opposed to something that will be accumulated further. Only relevant for
+ /// CUDA reductions. When false, the raw accumulator value (arg_t) is written
+ /// out instead of the projected result, so a later pass can combine it again.
+ bool is_final_output() const { return final_output_; }
+
protected:
void mark_outputs();
void compute_shape();
bool compute_common_dtype_ = true;
bool allow_cpu_scalars_ = false;
bool promote_gpu_output_dtypes_ = false;
+ // True when the value written to the output is the final (projected) result
+ // rather than a partial accumulator; see is_final_output(). Defaults to true
+ // so ordinary reductions that finish in one kernel pass are unaffected.
+ bool final_output_ = true;
};
struct TensorIterator::Builder {
static constexpr int vt0 = 4;
+ // Accumulating partial results directly in the output buffer requires
+ // round-tripping values between arg_t and out_scalar_t: the stored value is
+ // read back (out_scalar_t -> arg_t) and combined, then written out again
+ // (arg_t -> out_scalar_t). Hence conversion must work in both directions.
static constexpr bool can_accumulate_in_output =
- std::is_convertible<arg_t, out_scalar_t>::value;
+ std::is_convertible<arg_t, out_scalar_t>::value
+ && std::is_convertible<out_scalar_t, arg_t>::value;
ops_t ops;
void* buffer;
int* semaphores;
bool accumulate;
+ bool final_output;
ReduceOp(ops_t ops, ReduceConfig config, InputCalculator input_calc, OutputCalculator output_calc,
const void* src, void* dst, void* buffer, int* semaphores, arg_t ident)
if (accumulate) {
value = accumulate_in_output<can_accumulate_in_output>(out, value);
}
- *out = ops.project(value);
+ *out = project_if_necessary<can_accumulate_in_output>(value);
}
}
out_scalar_t* out, arg_t value,
typename std::enable_if<can_acc>::type* = nullptr
) const {
- return ops.reduce(*out, value);
+ return ops.combine(*out, value);
+ }
+
+ // Enabled when accumulation in the output is possible (can_acc): apply
+ // ops.project() only when this kernel produces the final output; otherwise
+ // store the raw accumulator value so a later pass can keep combining it.
+ template <bool can_acc>
+ C10_DEVICE out_scalar_t project_if_necessary(
+ arg_t value,
+ typename std::enable_if<can_acc>::type* = nullptr
+ ) const {
+ return final_output ? (out_scalar_t)ops.project(value) : (out_scalar_t)value;
+ }
}
+
// This function should never be called --
// it's the version of `accumulate_in_output`
// when accumulation in the output is not possible.
return arg_t {};
}
+ // Enabled when accumulation in the output is NOT possible: such reductions
+ // cannot defer projection to a later pass, so the projection always applies.
+ // The assert guards the invariant that final_output is set in this case.
+ template <bool can_acc>
+ C10_DEVICE out_scalar_t project_if_necessary(
+ arg_t value,
+ typename std::enable_if<!can_acc>::type* = nullptr
+ ) const {
+ assert(final_output);
+ return ops.project(value);
+ }
+
C10_DEVICE arg_t global_reduce(arg_t value, out_scalar_t* out) const {
arg_t* reduce_buffer = (arg_t*)buffer;
if (accumulate) {
value = accumulate_in_output<can_accumulate_in_output>(out, value);
}
- *out = ops.project(value);
+ *out = project_if_necessary<can_accumulate_in_output>(value);
}
}
(int*)semaphores.get(),
ident);
reduce.accumulate = iter.should_accumulate();
+ reduce.final_output = iter.is_final_output();
launch_reduce_kernel<ReduceConfig::NUM_THREADS>(config, reduce);
} else {
ident);
AT_ASSERT(!iter.should_accumulate());
reduce.accumulate = false;
+ reduce.final_output = true;
launch_reduce_kernel<ReduceConfig::NUM_THREADS>(config, reduce);
}
S = 10
M = 50
+# 275 million elements: as a DoubleTensor this is over 2**31 bytes, which
+# presumably forces the 64-bit indexing path exercised by the '64bit_indexing'
+# mean test below — TODO confirm the exact threshold against the CUDA kernels.
+G = 275000000
def make_tensor(t, *sizes):
    # NOTE(review): as shown, sizes is ignored and a fixed 1000x1000 normal
    # tensor is returned — this hunk may be elided context; confirm against
    # the full file before relying on this signature.
    return t(1000, 1000).normal_()
+def giant_1d_ones(t):
+    # Build a 1D tensor of G ones of type t by copying from a CPU ones tensor;
+    # used by the '64bit_indexing' reduction test below to hit huge inputs.
+    return t(G).copy_(torch.ones(G))
+
+
def long_type(t):
    # Map a tensor type t to the matching LongTensor class on the same device
    # family: types whose module name contains 'cuda' get cuda.LongTensor.
    return torch.cuda.LongTensor if 'cuda' in t.__module__ else torch.LongTensor
('mean', small_3d, lambda t: []),
('mean', small_3d, lambda t: [-1], 'neg_dim'),
('mean', small_3d, lambda t: [1], 'dim'),
+ ('mean', giant_1d_ones, lambda t: [], '64bit_indexing',
+      # Restrict to DoubleTensor: accumulating ~2.75e8 values in float32 loses
+      # precision, so the CPU reference result would be wrong.
+ [torch.DoubleTensor]),
('mode', small_3d, lambda t: []),
('mode', small_3d, lambda t: [1], 'dim'),
('mode', small_3d, lambda t: [-1], 'neg_dim'),