namespace at {
MemOverlap has_internal_overlap(const Tensor& tensor) {
- auto* t = tensor.unsafeGetTensorImpl();
+ return has_internal_overlap(tensor.unsafeGetTensorImpl());
+}
- AT_ASSERT(tensor.layout() == kStrided);
+MemOverlap has_internal_overlap(TensorImpl* t) {
+ AT_ASSERT(t->layout() == kStrided);
if (t->is_contiguous()) {
return MemOverlap::NO;
}
auto strides = t->strides();
- if (std::find_if(
- strides.begin(), strides.end(), [](int s) { return s == 0; })) {
+ if (strides.end() != std::find_if(
+ strides.begin(), strides.end(), [](int64_t s) { return s == 0; })) {
return MemOverlap::YES;
}
}
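For orientation, a rough Python sketch of the heuristic above (has_internal_overlap_py is an illustrative name I made up, and 'TOO_HARD' stands in for the fall-through case this hunk does not show):

import torch

def has_internal_overlap_py(t):
    # Contiguous tensors cannot alias themselves; any zero stride means
    # several indices map onto the same storage element.
    if t.is_contiguous():
        return 'NO'
    if any(s == 0 for s in t.stride()):
        return 'YES'
    return 'TOO_HARD'  # assumed fall-through, not shown in this excerpt

print(has_internal_overlap_py(torch.randn(3, 3)))               # NO
print(has_internal_overlap_py(torch.randn(1, 3).expand(4, 3)))  # YES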
void assert_no_internal_overlap(const Tensor& t, std::string op) {
+ assert_no_internal_overlap(t.unsafeGetTensorImpl(), op);
+}
+
+void assert_no_internal_overlap(TensorImpl* t, std::string op) {
if (has_internal_overlap(t) == MemOverlap::YES) {
AT_ERROR(
op, ": unsupported operation: more than one element of the written-to "
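The check is exposed to Python for testing via torch._debug_has_internal_overlap (used in the test hunk further down); a small usage sketch:

import torch

a = torch.randn(3, 3)               # contiguous: no overlapping elements
b = torch.randn(1, 3).expand(4, 3)  # stride 0 along dim 0: many indices, one element

# Returns an integer code; the existing test compares it against OVERLAP_YES.
print(torch._debug_has_internal_overlap(a))
print(torch._debug_has_internal_overlap(b))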
#include <ATen/CPUGenerator.h>
#include <ATen/CheckGenerator.h>
#include <ATen/Generator.h>
+#include <ATen/MemoryOverlap.h>
#include <ATen/cpu/vml.h>
#include <ATen/CPUApplyUtils.h>
#include <ATen/native/DispatchStub.h>
result.data<scalar_t>(), self.data<scalar_t>(), self.numel()); \
\
} else { \
+ assert_no_internal_overlap(result, #op); \
static constexpr int64_t WIDTH = 131072 / sizeof(scalar_t); \
CPU_tensor_parallel_kernel_apply2<scalar_t, scalar_t>( \
result, \
}); \
} \
REGISTER_DISPATCH(op##Impl, &op##_kernel)
-
} // anonymous namespace
REGISTER_DISPATCH(sigmoidImpl, &sigmoid_kernel)
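A usage sketch of what the added assert buys on the CPU path, assuming a build that includes this change:

import torch

t = torch.tensor(-0.5).expand(3, 3)  # 9 visible elements backed by one storage element

try:
    t.sin_()  # sin_ is one of the ops routed through the checked CPU kernel
except RuntimeError as err:
    print(err)  # expected to complain about writing to a single memory location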
#define THC_GENERIC_FILE "THC/generic/THCTensorMathPointwise.cu"
#else
+#include <ATen/MemoryOverlap.h>
+
#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) \
struct Tensor_##NAME##_##REAL##_Op { \
__device__ __forceinline__ void operator()(scalar_t* out, scalar_t* in) const { \
\
void THCTensor_(NAME)(THCState* state, THCTensor* self_, THCTensor* src) { \
THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); \
+ at::assert_no_internal_overlap(self_, #NAME); \
if (self_ == src) { \
if (!THC_pointwiseApply1<scalar_t>(state, self_, Tensor_##NAME##_##REAL##_Op())) { \
THArgCheck(false, 2, CUTORCH_DIM_WARNING); \
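The same sketch against the CUDA path patched here, assuming a CUDA-capable build and an available device:

import torch

if torch.cuda.is_available():
    t = torch.tensor(-0.5, device='cuda').expand(3, 3)
    try:
        t.cos_()  # presumably dispatched through the patched THC pointwise apply
    except RuntimeError as err:
        print(err)  # expected: the same 'single memory location' error as on CPU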
b_expanded = b.expand(4, 3)
self.assertEqual(torch._debug_has_internal_overlap(b_expanded), OVERLAP_YES)
+    @staticmethod
+    def unary_check_mem_overlap(self, inplace_op, value=-0.5, device='cpu'):
+        tensor = torch.tensor(value, device=device).expand(3, 3)
+        with self.assertRaisesRegex(RuntimeError, 'single memory location'):
+            inplace_op(tensor)
+
+    @staticmethod
+    def _test_inplace_unary_mem_overlap(self, device='cpu'):
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.acos_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.asin_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.atan_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.ceil_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.cos_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.erf_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.erfc_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.exp_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.expm1_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.floor_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.log_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.log10_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.log1p_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.log2_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.round_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.rsqrt_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.sin_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.sqrt_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.tan_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.tanh_(), device=device)
+        TestTorch.unary_check_mem_overlap(self, lambda t: t.trunc_(), device=device)
+
+    def test_inplace_unary_mem_overlap(self):
+        return self._test_inplace_unary_mem_overlap(self)
+
+    @unittest.expectedFailure
+    def test_abs_unary_mem_overlap(self):
+        self.unary_check_mem_overlap(self, lambda t: t.abs_())
+
+    @unittest.expectedFailure
+    def test_sinh_unary_mem_overlap(self):
+        self.unary_check_mem_overlap(self, lambda t: t.sinh_())
+
+    @unittest.expectedFailure
+    def test_cosh_unary_mem_overlap(self):
+        self.unary_check_mem_overlap(self, lambda t: t.cosh_())
+
@unittest.skipIf(torch.cuda.device_count() < 2, 'only one GPU detected')
def test_reverse_binary_ops_multiple_device(self):
self.assertEqual(2 + torch.tensor(3), 2 + torch.tensor(3).to("cuda:1")) # __radd__
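For callers that run into the new error, a minimal workaround sketch: materialize a non-overlapping copy first, since clone() and contiguous() both allocate one storage location per element:

import torch

base = torch.tensor(-0.5).expand(3, 3)

safe = base.clone()  # or base.contiguous(); both copy into fresh, non-overlapping storage
safe.sin_()          # fine now: every element has its own memory location
print(safe)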