// acc_buf_ptr holds the buffer used for accumulation across multiple sub_iters
// when accumulating directly in the output is not possible.
if (!can_accumulate_in_output && !can_use_32bit_indexing) {
- int64_t output_memory_size = 1;
+ int64_t output_memory_size = iter.element_size(0);
for (int dim = 0; dim < iter.ndim(); dim++) {
output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]);
}
+ output_memory_size /= iter.element_size(0); //iter.strides is in bytes
owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t),
sizeof(out_scalar_t),
(char*) iter.data_ptr(0),
run_test(torch.zeros(64, 61, dtype=dtype, device=device))
run_test(torch.zeros(64, 1, dtype=dtype, device=device))
- @slowTest
+ @onlyCUDA
def test_argminmax_large_axis(self, device):
# Regression test for gh-32863
x = torch.zeros(2**31, device=device, dtype=torch.int8)