default: "true"
]]
[[
- name: _th_any
- types:
- - Byte
- variants:
- - function
- backends:
- - CPU
- - CUDA
- options:
- - cname: logicalAnyAll
- return: real
- arguments:
- - THTensor* self
-]]
-[[
- name: _th_any
- types:
- - Byte
- variants: function
- backends:
- - CPU
- - CUDA
- options:
- - cname: logicalAny
- return: argument 0
- scalar_check: self_->dim() == 0 || (keepdim == false && self_->dim() == 1)
- arguments:
- - arg: THTensor* result
- output: True
- - THTensor* self
- - arg: long dim
- wrap_dim: self
- - arg: bool keepdim
- default: "false"
-]]
-[[
name: _th_abs
cname: abs
types:
_(aten, _tanh) \
_(aten, _tanh_backward) \
_(aten, _tanh_forward) \
-_(aten, _th_any) \
_(aten, _th_baddbmm) \
_(aten, _th_bmm) \
_(aten, _th_clamp) \
  return c;
}
+template <class T> Vec256<T> inline operator||(
+    const Vec256<T> &a, const Vec256<T> &b) {
+  Vec256<T> c = Vec256<T>();
+  for (int i = 0; i != Vec256<T>::size(); i++) {
+    c[i] = a[i] || b[i];
+  }
+  return c;
+}
+
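For orientation, a minimal caller-side sketch of the new lanewise operator. This is illustration only, not part of the patch, and assumes an ATen source tree for the internal `vec256` header plus the base-class broadcast constructor and `operator[]`:

// Illustration only; assumes ATen's internal vec256 header is on the include path.
#include <ATen/cpu/vec256/vec256.h>
#include <cstdio>

int main() {
  using at::vec256::Vec256;
  Vec256<uint8_t> a(0);         // broadcast 0 into every lane
  Vec256<uint8_t> b(1);         // broadcast 1 into every lane
  Vec256<uint8_t> c = a || b;   // the operator added above: lanewise logical OR
  std::printf("%d\n", static_cast<int>(c[0]));  // prints 1
  return 0;
}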
// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
// either input is a NaN.
template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
template <class T> \
Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) { \
using iT = int_same_size_t<T>; \
- iT buffer[Vec256<T>::size()]; \
- for (int64_t i = 0; i != Vec256<T>::size(); i++) { \
+ iT buffer[Vec256<T>::size()]; \
+ for (int64_t i = 0; i != Vec256<T>::size(); i++) { \
auto a_val = a[i]; \
auto b_val = b[i]; \
iT *i_a_ptr = reinterpret_cast<iT*>(&a_val); \
  return at::legacy::th::_th_topk(self, k, dim, largest, sorted);
}
-Tensor any(const Tensor & self) {
-  return at::legacy::th::_th_any(self);
-}
-
Tensor & renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) {
  return at::legacy::th::_th_renorm_out(result, self, p, dim, maxnorm);
}
DEFINE_DISPATCH(norm_stub);
DEFINE_DISPATCH(mean_stub);
DEFINE_DISPATCH(and_stub);
+DEFINE_DISPATCH(or_stub);
static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
  ScalarType scalarType = self.type().scalarType();
}
Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
-  AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
-           "all only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
-  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte, "all only supports torch.uint8 dtype");
+  AT_CHECK(self.type().backend() == Backend::CPU ||
+           self.type().backend() == Backend::CUDA, "all only supports CPU AND CUDA "
+           "backend, got: ", toString(self.type().backend()));
+  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte,
+           "all only supports torch.uint8 dtype");
  dim = maybe_wrap_dim(dim, self.dim());
  if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) {
    return result;
  }
}
+inline Tensor & _any(Tensor & result, std::unique_ptr<TensorIterator> & iter) {
+  if (iter->numel() == 0) {
+    result.fill_(0);
+  } else {
+    or_stub(iter->device_type(), *iter);
+  }
+
+  return result;
+}
+
+Tensor any(const Tensor& self) {
+  AT_CHECK(self.type().backend() == Backend::CPU ||
+           self.type().backend() == Backend::CUDA, "any only supports CPU AND CUDA "
+           "backend, got: ", toString(self.type().backend()));
+  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte,
+           "any only supports torch.uint8 dtype");
+
+  Tensor result = at::empty({0}, self.options());
+  auto iter = make_reduction(
+      "any", result, self, {}, false, at::ScalarType::Byte);
+  return _any(result, iter);
+}
+
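As a sanity check on the two entry points, a hypothetical ATen caller might exercise them like this (illustration, not part of the patch; `any` still requires a `torch.uint8` tensor):

// Illustration only; assumes ATen headers and a torch.uint8 input.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor x = at::zeros({2, 3}, at::kByte);
  std::cout << static_cast<int>(x.any().item<uint8_t>()) << "\n";   // 0: nothing set
  x.fill_(1);
  std::cout << static_cast<int>(x.any().item<uint8_t>()) << "\n";   // 1
  std::cout << x.any(/*dim=*/1, /*keepdim=*/true).sizes() << "\n";  // [2, 1]
  return 0;
}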
Tensor any(const Tensor& self, int64_t dim, bool keepdim) {
  Tensor result = at::empty({0}, self.options());
  return at::native::any_out(result, self, dim, keepdim);
}
Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
-  AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
-           "any only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
-  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte, "any only supports torch.uint8 dtype");
+  AT_CHECK(self.type().backend() == Backend::CPU ||
+           self.type().backend() == Backend::CUDA, "any only supports CPU AND CUDA "
+           "backend, got: ", toString(self.type().backend()));
+  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte,
+           "any only supports torch.uint8 dtype");
  dim = maybe_wrap_dim(dim, self.dim());
  if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) {
    return result;
  } else {
-    return at::legacy::th::_th_any_out(result, self, dim, keepdim);
+    auto iter = make_reduction(
+        "any", result, self, dim, keepdim, at::ScalarType::Byte);
+    return _any(result, iter);
  }
}
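Note the trivial values passed to `_dimreduce_return_trivial`: 1 for `all` and 0 for `any`. These are the fold identities of AND and OR, which is also what an empty reduction produces (see `result.fill_(0)` in `_any` above). A small sketch, assuming ATen headers (illustration only):

// Reducing over zero elements yields the fold identity:
// all() -> 1 (AND identity), any() -> 0 (OR identity).
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor empty = at::empty({0}, at::kByte);
  std::cout << static_cast<int>(empty.any().item<uint8_t>()) << "\n";  // 0
  std::cout << static_cast<int>(empty.all().item<uint8_t>()) << "\n";  // 1
  return 0;
}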
DECLARE_DISPATCH(reduce_fn, prod_stub);
DECLARE_DISPATCH(reduce_fn, mean_stub);
DECLARE_DISPATCH(reduce_fn, and_stub);
+DECLARE_DISPATCH(reduce_fn, or_stub);
using reduce_std_var_function =
    void (*)(TensorIterator&, bool unbiased, bool take_sqrt);
    /*ident=*/true);
}
+static void or_kernel_impl(TensorIterator& iter) {
+  binary_kernel_reduce_vec(
+    iter,
+    [=](uint8_t a, uint8_t b) -> uint8_t { return a || b; },
+    [=](Vec256<uint8_t> a, Vec256<uint8_t> b) {
+      Vec256<uint8_t> c = Vec256<uint8_t>();
+      for (int i = 0; i != Vec256<uint8_t>::size(); i++) {
+        c[i] = a[i] || b[i];
+      }
+      return c;
+    },
+    /*ident=*/false);
+}
+
} // anonymous namespace
REGISTER_DISPATCH(sum_stub, &sum_kernel_impl);
REGISTER_DISPATCH(mean_stub, &mean_kernel_impl);
REGISTER_DISPATCH(norm_stub, &norm_kernel_tensor_iterator_impl);
REGISTER_DISPATCH(and_stub, &and_kernel_impl);
+REGISTER_DISPATCH(or_stub, &or_kernel_impl);
}} // namespace at::native
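A plain scalar model of what the vectorized kernel above computes may help: `binary_kernel_reduce_vec` handles the tiling and vector lanes, while the reduction itself is just a fold with identity 0 (the `/*ident=*/false`) and combine `a || b`. A sketch, not the kernel itself:

// Reference semantics of or_kernel_impl, minus vectorization and tiling.
#include <cstdint>
#include <numeric>
#include <vector>

uint8_t any_reference(const std::vector<uint8_t>& data) {
  // Fold with the OR identity (0) and the same scalar combine as the kernel.
  return std::accumulate(data.begin(), data.end(), uint8_t{0},
      [](uint8_t acc, uint8_t v) -> uint8_t { return acc || v; });
}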
    }), true);
}
+void or_kernel_cuda(TensorIterator& iter) {
+  gpu_reduce_kernel<uint8_t, uint8_t>(
+    iter, func_wrapper<uint8_t> ([]GPU_LAMBDA(uint8_t a, uint8_t b) -> uint8_t {
+      return a || b;
+    }), false);
+}
+
REGISTER_DISPATCH(std_var_stub, &std_var_kernel_cuda);
REGISTER_DISPATCH(sum_stub, &sum_kernel_cuda);
REGISTER_DISPATCH(prod_stub, &prod_kernel_cuda);
REGISTER_DISPATCH(mean_stub, &mean_kernel_cuda);
REGISTER_DISPATCH(norm_stub, &norm_kernel_cuda);
REGISTER_DISPATCH(and_stub, &and_kernel_cuda);
+REGISTER_DISPATCH(or_stub, &or_kernel_cuda);
}} // namespace at::native
TH_API void THTensor_(dirichlet_grad)(THTensor *self, THTensor *x, THTensor *alpha, THTensor *total);
#endif
-#if defined(TH_REAL_IS_BYTE)
-
-TH_API int THTensor_(logicalAnyAll)(THTensor *self);
-TH_API void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim);
-
-#endif /* TH_REAL_IS_BYTE */
-
#endif
LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs)
#endif /* int only part */
-#if defined(TH_REAL_IS_BYTE) /* Byte only part */
-
-int THTensor_(logicalAnyAll)(THTensor *tensor)
-{
-  scalar_t sum = 0;
-  int serial_path = 0;
-#ifdef _OPENMP
-  int inOMP = omp_in_parallel();
-  if(inOMP) {
-    serial_path = 1;
-  } else {
-    TH_TENSOR_APPLY_REDUCTION_OMP(scalar_t, tensor, ||:sum, sum = sum || *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
-  }
-#else
-  serial_path = 1;
-#endif
-  if (serial_path) {
-    TH_TENSOR_APPLY(scalar_t, tensor, sum = sum || *tensor_data;);
-  }
-  return (bool)sum;
-}
-
-void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim)
-{
-  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 2, "dimension %d out of range",
-             dimension + TH_INDEX_BASE);
-
-  THTensor_(preserveReduceDimSemantics)(r_, THTensor_(nDimensionLegacyAll)(t), dimension, keepdim);
-  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
-  dim[dimension] = 1;
-  THTensor_(resize)(r_, dim, {});
-
-  int serial_path = 0;
-#ifdef _OPENMP
-  int inOMP = omp_in_parallel();
-  if (inOMP) {
-    serial_path = 1;
-  } else {
-    int r_Contig = THTensor_(isContiguous)(r_);
-    scalar_t *tp = t->data<scalar_t>();
-    scalar_t *rp = r_->data<scalar_t>();
-    if(r_Contig && (tp != rp)){
-      ptrdiff_t iter = 0;
-      ptrdiff_t r_Size = THTensor_(nElement)(r_);
-      int r_Dim = THTensor_nDimensionLegacyAll(r_);
-      #pragma omp parallel for if ( r_Size > TH_OMP_OVERHEAD_THRESHOLD)
-      for (iter = 0; iter < r_Size; iter++) {
-        int j;
-        int64_t quot;
-        int64_t rem = iter;
-        ptrdiff_t tBasicIndex = 0;
-
-        for(j = 0; j < r_Dim; ++j) {
-          if(j != dimension){
-            quot = rem/r_->stride(j);
-            rem = rem%r_->stride(j);
-            tBasicIndex += quot*t->stride(j);
-          }
-        }
-        scalar_t *t_data = tp+tBasicIndex;
-        scalar_t *r__data = rp+iter;
-        *r__data = 0;
-        for(j=0; j < THTensor_sizeLegacyNoScalars(t, dimension); ++j) {
-          *r__data = *r__data || *(t_data + j*THTensor_strideLegacyNoScalars(t, dimension));
-        }
-      }
-    } else {
-      serial_path = 1;
-    }
-  }
-#else
-  serial_path = 1;
-#endif
-  if (serial_path) {
-    // two implementations optimized for data locality
-    if (THTensor_strideLegacyNoScalars(t, dimension) == 1) {
-      TH_TENSOR_DIM_APPLY2(scalar_t, t, scalar_t, r_, dimension,
-                           accreal sum = 0;
-                           int64_t i;
-                           for(i = 0; i < t_size; i++)
-                             sum = sum || t_data[i*t_stride];
-                           *r__data = (scalar_t)sum;);
-    } else {
-      THTensor_(zero)(r_);
-      THTensor *temp_ = THTensor_(newWithTensor)(r_);
-      // r_.expand_as(t)
-      temp_->set_size(dimension,THTensor_sizeLegacyNoScalars(t, dimension));
-      temp_->set_stride(dimension, 0);
-
-      TH_TENSOR_APPLY2(scalar_t, temp_, scalar_t, t, *temp__data = *temp__data || *t_data;);
-      c10::raw::intrusive_ptr::decref(temp_);
-    }
-  }
-
-  if (!keepdim) {
-    THTensor_(squeeze1d)(r_, r_, dimension);
-  }
-}
-
-#endif /* Byte only part */
-
/* floating point only now */
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#include <THC/generic/THCTensorTopK.h>
#include <THC/THCGenerateAllTypes.h>
-THC_API int THCudaByteTensor_logicalAnyAll(THCState *state, THCudaByteTensor *self);
-
-THC_API void THCudaByteTensor_logicalAny(THCState *state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim);
-
#endif
#include <THC/THCTensorMathReduce.cuh>
#include <THC/THCTensor.hpp>
-
-THC_API int
-THCudaByteTensor_logicalAnyAll(THCState *state, THCudaByteTensor *self) {
-  THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self));
-  unsigned char result;
-  if (!THC_reduceAll<uint8_t>(state, self,
-                              thrust::identity<unsigned char>(),
-                              LogicalAny(),
-                              (unsigned char) 0, &result, 0)) {
-    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
-  }
-
-  return (int) result;
-}
-
-THC_API void
-THCudaByteTensor_logicalAny(THCState* state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim) {
-  THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 2, self, src));
-  if (!THC_reduceDim<uint8_t>(state, self, src,
-                              thrust::identity<unsigned char>(),
-                              LogicalAny(),
-                              thrust::identity<unsigned char>(),
-                              (unsigned char) 0,
-                              dimension,
-                              keepdim)) {
-    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
-  }
-
-  THCudaCheck(cudaGetLastError());
-}
res2[i, j] += m1[i, k] * m2[k, j]
self.assertEqual(res1, res2)
+    def test_logical_any(self):
+        devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
+        for device in devices:
+            x = torch.zeros([2, 3, 400], dtype=torch.uint8, device=device)
+
+            self.assertEqual(
+                torch.tensor(0, dtype=torch.uint8, device=device),
+                x.any())
+
+            self.assertEqual(
+                torch.zeros([1, 3, 400], dtype=torch.uint8, device=device),
+                x.any(0, keepdim=True))
+
+            self.assertEqual(
+                torch.zeros([2, 1, 400], dtype=torch.uint8, device=device),
+                x.any(1, keepdim=True))
+
+            self.assertEqual(
+                torch.zeros([2, 3, 1], dtype=torch.uint8, device=device),
+                x.any(2, keepdim=True))
+
+            # set the last element to 1
+            x[-1][-1][-1] = 1
+
+            self.assertEqual(
+                torch.tensor(1, dtype=torch.uint8, device=device),
+                x.any())
+
+            y = torch.zeros([1, 3, 400], dtype=torch.uint8, device=device)
+            y[-1][-1][-1] = 1
+            self.assertEqual(y, x.any(0, keepdim=True))
+
+            y = torch.zeros([2, 1, 400], dtype=torch.uint8, device=device)
+            y[-1][-1][-1] = 1
+            self.assertEqual(y, x.any(1, keepdim=True))
+
+            y = torch.zeros([2, 3, 1], dtype=torch.uint8, device=device)
+            y[-1][-1][-1] = 1
+            self.assertEqual(y, x.any(2, keepdim=True))
+
    def test_logical_all(self):
        devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
        for device in devices:
- name: alias(Tensor self)
  self: grad
-# The two items below are necessary because TensorIterator doesn't work on
-# Variables (codegen does not unwrap the input Tensor for all(*) without this
-# line).
+# The four items below are necessary because TensorIterator doesn't work on
+# Variables (codegen does not unwrap the input Tensor for all() and any()
+# without these lines).
+- name: any(Tensor self)
+  self: not_implemented("any")
+
+- name: any(Tensor self, int64_t dim, bool keepdim)
+  self: not_implemented("any")
+
- name: all(Tensor self)
  self: not_implemented("all")
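These entries do not add a real derivative: per the comment above, they exist so autograd codegen unwraps the Variable before the TensorIterator-based kernel runs, and `not_implemented("any")` makes any attempted differentiation fail loudly. A hypothetical forward-only caller, assuming the libtorch C++ frontend:

// Illustration only. Byte tensors cannot require grad, so in practice only
// the forward path runs; the yaml entries above ensure codegen unwraps the
// input and that an attempted derivative raises not_implemented("any").
#include <torch/torch.h>

int main() {
  auto x = torch::zeros({2, 2}, torch::kByte);
  auto y = x.any();  // dispatches through VariableType to the ATen kernel
  return 0;
}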