Port legacy any(*) to ATen
author Shen Li <shenli@fb.com>
Fri, 18 Jan 2019 18:07:33 +0000 (10:07 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Fri, 18 Jan 2019 18:32:19 +0000 (10:32 -0800)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15547

Differential Revision: D13549495

Pulled By: mrshenli

fbshipit-source-id: 09a065a8ffa7d73f409759b779c7314cc87f4853

14 files changed:
aten/src/ATen/Declarations.cwrap
aten/src/ATen/core/aten_interned_strings.h
aten/src/ATen/cpu/vec256/vec256_base.h
aten/src/ATen/native/LegacyDefinitions.cpp
aten/src/ATen/native/ReduceOps.cpp
aten/src/ATen/native/ReduceOps.h
aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
aten/src/ATen/native/cuda/ReduceOpsKernel.cu
aten/src/TH/generic/THTensorMath.h
aten/src/TH/generic/THTensorMoreMath.cpp
aten/src/THC/THCTensorMath.h
aten/src/THC/THCTensorMathReduce.cu
test/test_torch.py
tools/autograd/derivatives.yaml

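Note: this ports any() the same way all() was ported earlier — the Declarations.cwrap entries and the TH/THC logicalAny/logicalAnyAll kernels are removed, and any() is reimplemented on TensorIterator with a new or_stub dispatch. A minimal usage sketch of the behavior the new test covers (illustrative only; the input dtype must be torch.uint8, as before):

    import torch

    x = torch.zeros(2, 3, 4, dtype=torch.uint8)
    x.any()                       # tensor(0, dtype=torch.uint8)
    x.any(1, keepdim=True).shape  # torch.Size([2, 1, 4])

    x[-1, -1, -1] = 1             # a single non-zero element flips the result
    x.any()                       # tensor(1, dtype=torch.uint8)
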
index bc06d11..ae6d4bd 100644 (file)
       default: "true"
 ]]
 [[
-  name: _th_any
-  types:
-    - Byte
-  variants:
-    - function
-  backends:
-    - CPU
-    - CUDA
-  options:
-    - cname: logicalAnyAll
-      return: real
-      arguments:
-        - THTensor* self
-]]
-[[
-  name: _th_any
-  types:
-    - Byte
-  variants: function
-  backends:
-    - CPU
-    - CUDA
-  options:
-    - cname: logicalAny
-      return: argument 0
-      scalar_check: self_->dim() == 0 || (keepdim == false && self_->dim() == 1)
-      arguments:
-        - arg: THTensor* result
-          output: True
-        - THTensor* self
-        - arg: long dim
-          wrap_dim: self
-        - arg: bool keepdim
-          default: "false"
-]]
-[[
   name: _th_abs
   cname: abs
   types:
index 28f08e9..514323c 100644 (file)
@@ -139,7 +139,6 @@ _(aten, _tan) \
 _(aten, _tanh) \
 _(aten, _tanh_backward) \
 _(aten, _tanh_forward) \
-_(aten, _th_any) \
 _(aten, _th_baddbmm) \
 _(aten, _th_bmm) \
 _(aten, _th_clamp) \
index f5afece..0e52ffa 100644 (file)
@@ -308,6 +308,15 @@ template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T
   return c;
 }
 
+template <class T> Vec256<T> inline operator||(
+    const Vec256<T> &a, const Vec256<T> &b) {
+  Vec256<T> c = Vec256<T>();
+  for (int i = 0; i != Vec256<T>::size(); i++) {
+    c[i] = a[i] || b[i];
+  }
+  return c;
+}
+
 // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
 // either input is a NaN.
 template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
@@ -363,8 +372,8 @@ inline T minimum(const T& a, const T& b) {
 template <class T>                                                          \
 Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) {      \
   using iT = int_same_size_t<T>;                                            \
-  iT buffer[Vec256<T>::size()];                                               \
-  for (int64_t i = 0; i != Vec256<T>::size(); i++) {                          \
+  iT buffer[Vec256<T>::size()];                                             \
+  for (int64_t i = 0; i != Vec256<T>::size(); i++) {                        \
     auto a_val = a[i];                                                      \
     auto b_val = b[i];                                                      \
     iT *i_a_ptr = reinterpret_cast<iT*>(&a_val);                            \
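Note: the new generic Vec256<T> operator|| has no SIMD specialization here; like the surrounding generic operators it falls back to a per-lane scalar loop (the second hunk is a whitespace-only realignment of the comparison-operator macro). A rough Python rendering of the lane-wise semantics, for illustration only:

    # each output lane is 1 if either input lane is non-zero, else 0
    def vec_logical_or(a, b):
        return [1 if (x or y) else 0 for x, y in zip(a, b)]

    vec_logical_or([0, 2, 0], [0, 0, 5])  # [0, 1, 1]
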
index e6c2787..cf3421c 100644 (file)
@@ -705,10 +705,6 @@ std::tuple<Tensor,Tensor> topk(const Tensor & self, int64_t k, int64_t dim, bool
   return at::legacy::th::_th_topk(self, k, dim, largest, sorted);
 }
 
-Tensor any(const Tensor & self) {
-  return at::legacy::th::_th_any(self);
-}
-
 Tensor & renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) {
   return at::legacy::th::_th_renorm_out(result, self, p, dim, maxnorm);
 }
index 7c8c119..90188a2 100644 (file)
@@ -26,6 +26,7 @@ DEFINE_DISPATCH(prod_stub);
 DEFINE_DISPATCH(norm_stub);
 DEFINE_DISPATCH(mean_stub);
 DEFINE_DISPATCH(and_stub);
+DEFINE_DISPATCH(or_stub);
 
 static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
   ScalarType scalarType = self.type().scalarType();
@@ -482,9 +483,11 @@ Tensor all(const Tensor& self, int64_t dim, bool keepdim) {
 }
 
 Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
-  AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
-           "all only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
-  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte, "all only supports torch.uint8 dtype");
+  AT_CHECK(self.type().backend() == Backend::CPU ||
+    self.type().backend() == Backend::CUDA, "all only supports CPU AND CUDA "
+    "backend, got: ", toString(self.type().backend()));
+  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte,
+    "all only supports torch.uint8 dtype");
   dim = maybe_wrap_dim(dim, self.dim());
   if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) {
     return result;
@@ -495,20 +498,47 @@ Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
   }
 }
 
+inline Tensor & _any(Tensor & result, std::unique_ptr<TensorIterator> & iter) {
+  if (iter->numel() == 0) {
+    result.fill_(0);
+  } else {
+    or_stub(iter->device_type(), *iter);
+  }
+
+  return result;
+}
+
+Tensor any(const Tensor& self) {
+  AT_CHECK(self.type().backend() == Backend::CPU ||
+    self.type().backend() == Backend::CUDA, "any only supports CPU AND CUDA "
+    "backend, got: ", toString(self.type().backend()));
+  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte,
+    "any only supports torch.uint8 dtype");
+
+  Tensor result = at::empty({0}, self.options());
+  auto iter = make_reduction(
+    "any", result, self, {}, false, at::ScalarType::Byte);
+  return _any(result, iter);
+}
+
 Tensor any(const Tensor& self, int64_t dim, bool keepdim) {
   Tensor result = at::empty({0}, self.options());
   return at::native::any_out(result, self, dim, keepdim);
 }
 
 Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
-  AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
-           "any only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
-  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte, "any only supports torch.uint8 dtype");
+  AT_CHECK(self.type().backend() == Backend::CPU ||
+    self.type().backend() == Backend::CUDA, "any only supports CPU AND CUDA "
+    "backend, got: ", toString(self.type().backend()));
+  AT_CHECK(self.type().scalarType() == at::ScalarType::Byte,
+    "any only supports torch.uint8 dtype");
   dim = maybe_wrap_dim(dim, self.dim());
   if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) {
     return result;
   } else {
-    return at::legacy::th::_th_any_out(result, self, dim, keepdim);
+    auto iter = make_reduction(
+      "any", result, self, dim, keepdim, at::ScalarType::Byte);
+    return _any(result, iter);
   }
 }
 
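Note: the shared _any helper returns 0 for an empty reduction (result.fill_(0)) and otherwise dispatches or_stub on the TensorIterator produced by make_reduction; both the full and the dim-wise overloads now funnel through it. A pure-Python reference of the reduction semantics (a sketch, not the TensorIterator machinery):

    from functools import reduce

    def ref_any(values):
        # the identity of logical-or is 0/False, which is also what the
        # result.fill_(0) branch yields for empty inputs
        return 1 if reduce(lambda acc, v: acc or v, values, 0) else 0

    ref_any([])         # 0
    ref_any([0, 0, 3])  # 1
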
index f0c0231..6a9010c 100644 (file)
@@ -16,6 +16,7 @@ DECLARE_DISPATCH(reduce_fn, sum_stub);
 DECLARE_DISPATCH(reduce_fn, prod_stub);
 DECLARE_DISPATCH(reduce_fn, mean_stub);
 DECLARE_DISPATCH(reduce_fn, and_stub);
+DECLARE_DISPATCH(reduce_fn, or_stub);
 
 using reduce_std_var_function =
   void (*)(TensorIterator&, bool unbiased, bool take_sqrt);
index 3841dec..b6fb9b6 100644 (file)
@@ -134,6 +134,20 @@ static void and_kernel_impl(TensorIterator& iter) {
     /*ident=*/true);
 }
 
+static void or_kernel_impl(TensorIterator& iter) {
+  binary_kernel_reduce_vec(
+    iter,
+    [=](uint8_t a, uint8_t b) -> uint8_t { return a || b; },
+    [=](Vec256<uint8_t> a, Vec256<uint8_t> b) {
+      Vec256<uint8_t> c = Vec256<uint8_t>();
+      for (int i = 0; i != Vec256<uint8_t>::size(); i++) {
+        c[i] = a[i] || b[i];
+      }
+      return c;
+    },
+    /*ident=*/false);
+}
+
 }  // anonymous namespace
 
 REGISTER_DISPATCH(sum_stub, &sum_kernel_impl);
@@ -142,5 +156,6 @@ REGISTER_DISPATCH(prod_stub, &prod_kernel_impl);
 REGISTER_DISPATCH(mean_stub, &mean_kernel_impl);
 REGISTER_DISPATCH(norm_stub, &norm_kernel_tensor_iterator_impl);
 REGISTER_DISPATCH(and_stub, &and_kernel_impl);
+REGISTER_DISPATCH(or_stub, &or_kernel_impl);
 
 }}  // namespace at::native
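Note the identity arguments: and_kernel_impl passes /*ident=*/true while the new or_kernel_impl passes false, because x || false == x (just as x && true == x); the identity is the neutral element the reduction starts from. The CUDA kernel in the next file makes the same choice (last argument false). A quick sanity check of the two identities, for illustration:

    # logical-or identity is False, logical-and identity is True
    assert all(bool(x or False) == bool(x) for x in (0, 1, 7))
    assert all(bool(x and True) == bool(x) for x in (0, 1, 7))
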
index 1ce0fec..1e4c9fc 100644 (file)
@@ -151,11 +151,19 @@ void and_kernel_cuda(TensorIterator& iter) {
     }), true);
 }
 
+void or_kernel_cuda(TensorIterator& iter) {
+  gpu_reduce_kernel<uint8_t, uint8_t>(
+    iter, func_wrapper<uint8_t> ([]GPU_LAMBDA(uint8_t a, uint8_t b) -> uint8_t {
+      return a || b;
+    }), false);
+}
+
 REGISTER_DISPATCH(std_var_stub, &std_var_kernel_cuda);
 REGISTER_DISPATCH(sum_stub, &sum_kernel_cuda);
 REGISTER_DISPATCH(prod_stub, &prod_kernel_cuda);
 REGISTER_DISPATCH(mean_stub, &mean_kernel_cuda);
 REGISTER_DISPATCH(norm_stub, &norm_kernel_cuda);
 REGISTER_DISPATCH(and_stub, &and_kernel_cuda);
+REGISTER_DISPATCH(or_stub, &or_kernel_cuda);
 
 }} // namespace at::native
index 61520a3..244b60c 100644 (file)
@@ -196,11 +196,4 @@ TH_API accreal THTensor_(normall)(THTensor *t, scalar_t value);
 TH_API void THTensor_(dirichlet_grad)(THTensor *self, THTensor *x, THTensor *alpha, THTensor *total);
 #endif
 
-#if defined(TH_REAL_IS_BYTE)
-
-TH_API int THTensor_(logicalAnyAll)(THTensor *self);
-TH_API void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim);
-
-#endif /* TH_REAL_IS_BYTE */
-
 #endif
index ab6242d..27f6798 100644 (file)
@@ -1531,107 +1531,6 @@ LAB_IMPLEMENT_BASIC_FUNCTION(abs,labs)
 LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs)
 #endif /* int only part */
 
-#if defined(TH_REAL_IS_BYTE) /* Byte only part */
-
-int THTensor_(logicalAnyAll)(THTensor *tensor)
-{
-  scalar_t sum = 0;
-  int serial_path = 0;
-#ifdef _OPENMP
-  int inOMP = omp_in_parallel();
-  if(inOMP) {
-    serial_path = 1;
-  } else {
-    TH_TENSOR_APPLY_REDUCTION_OMP(scalar_t, tensor, ||:sum, sum = sum || *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
-  }
-#else
-    serial_path = 1;
-#endif
-  if (serial_path) {
-    TH_TENSOR_APPLY(scalar_t, tensor, sum = sum || *tensor_data;);
-  }
-  return (bool)sum;
-}
-
-void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim)
-{
-  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 2, "dimension %d out of range",
-      dimension + TH_INDEX_BASE);
-
-  THTensor_(preserveReduceDimSemantics)(r_, THTensor_(nDimensionLegacyAll)(t), dimension, keepdim);
-  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
-  dim[dimension] = 1;
-  THTensor_(resize)(r_, dim, {});
-
-  int serial_path = 0;
-#ifdef _OPENMP
-  int inOMP = omp_in_parallel();
-  if (inOMP) {
-    serial_path = 1;
-  } else {
-    int r_Contig = THTensor_(isContiguous)(r_);
-    scalar_t *tp = t->data<scalar_t>();
-    scalar_t *rp = r_->data<scalar_t>();
-    if(r_Contig && (tp != rp)){
-      ptrdiff_t iter = 0;
-      ptrdiff_t r_Size = THTensor_(nElement)(r_);
-      int r_Dim = THTensor_nDimensionLegacyAll(r_);
-      #pragma omp parallel for if ( r_Size > TH_OMP_OVERHEAD_THRESHOLD)
-      for (iter = 0; iter < r_Size; iter++) {
-        int j;
-        int64_t quot;
-        int64_t rem = iter;
-        ptrdiff_t tBasicIndex = 0;
-
-        for(j = 0; j < r_Dim; ++j) {
-          if(j != dimension){
-            quot = rem/r_->stride(j);
-            rem = rem%r_->stride(j);
-            tBasicIndex += quot*t->stride(j);
-          }
-        }
-        scalar_t *t_data = tp+tBasicIndex;
-        scalar_t *r__data = rp+iter;
-        *r__data = 0;
-        for(j=0; j < THTensor_sizeLegacyNoScalars(t, dimension); ++j) {
-          *r__data = *r__data || *(t_data + j*THTensor_strideLegacyNoScalars(t, dimension));
-        }
-      }
-    } else {
-      serial_path = 1;
-    }
-  }
-#else
-  serial_path = 1;
-#endif
-  if (serial_path) {
-    // two implementations optimized for data locality
-    if (THTensor_strideLegacyNoScalars(t, dimension) == 1) {
-      TH_TENSOR_DIM_APPLY2(scalar_t, t, scalar_t, r_, dimension,
-                           accreal sum = 0;
-                           int64_t i;
-                           for(i = 0; i < t_size; i++)
-                             sum = sum || t_data[i*t_stride];
-                           *r__data = (scalar_t)sum;);
-    } else {
-      THTensor_(zero)(r_);
-      THTensor *temp_ = THTensor_(newWithTensor)(r_);
-      // r_.expand_as(t)
-      temp_->set_size(dimension,THTensor_sizeLegacyNoScalars(t, dimension));
-      temp_->set_stride(dimension, 0);
-
-      TH_TENSOR_APPLY2(scalar_t, temp_, scalar_t, t, *temp__data = *temp__data || *t_data;);
-      c10::raw::intrusive_ptr::decref(temp_);
-    }
-  }
-
-  if (!keepdim) {
-    THTensor_(squeeze1d)(r_, r_, dimension);
-  }
-}
-
-#endif /* Byte only part */
-
 /* floating point only now */
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
 
index c3164f3..87ca4a2 100644 (file)
@@ -49,8 +49,4 @@
 #include <THC/generic/THCTensorTopK.h>
 #include <THC/THCGenerateAllTypes.h>
 
-THC_API int THCudaByteTensor_logicalAnyAll(THCState *state, THCudaByteTensor *self);
-
-THC_API void THCudaByteTensor_logicalAny(THCState *state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim);
-
 #endif
index 7d2342e..1a2c626 100644 (file)
@@ -1,32 +1,2 @@
 #include <THC/THCTensorMathReduce.cuh>
 #include <THC/THCTensor.hpp>
-
-THC_API int
-THCudaByteTensor_logicalAnyAll(THCState *state, THCudaByteTensor *self) {
-  THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self));
-  unsigned char result;
-  if (!THC_reduceAll<uint8_t>(state, self,
-                              thrust::identity<unsigned char>(),
-                              LogicalAny(),
-                              (unsigned char) 0, &result, 0)) {
-    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
-  }
-
-  return (int) result;
-}
-
-THC_API void
-THCudaByteTensor_logicalAny(THCState* state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim) {
-  THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 2, self, src));
-  if (!THC_reduceDim<uint8_t>(state, self, src,
-                              thrust::identity<unsigned char>(),
-                              LogicalAny(),
-                              thrust::identity<unsigned char>(),
-                              (unsigned char) 0,
-                              dimension,
-                              keepdim)) {
-    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
-  }
-
-  THCudaCheck(cudaGetLastError());
-}
index 19d903d..0836643 100644 (file)
@@ -378,6 +378,46 @@ class _TestTorchMixin(object):
                         res2[i, j] += m1[i, k] * m2[k, j]
             self.assertEqual(res1, res2)
 
+    def test_logical_any(self):
+        devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
+        for device in devices:
+            x = torch.zeros([2, 3, 400], dtype=torch.uint8, device=device)
+
+            self.assertEqual(
+                torch.tensor(0, dtype=torch.uint8, device=device),
+                x.any())
+
+            self.assertEqual(
+                torch.zeros([1, 3, 400], dtype=torch.uint8, device=device),
+                x.any(0, keepdim=True))
+
+            self.assertEqual(
+                torch.zeros([2, 1, 400], dtype=torch.uint8, device=device),
+                x.any(1, keepdim=True))
+
+            self.assertEqual(
+                torch.zeros([2, 3, 1], dtype=torch.uint8, device=device),
+                x.any(2, keepdim=True))
+
+            # set the last element to 1
+            x[-1][-1][-1] = 1
+
+            self.assertEqual(
+                torch.tensor(1, dtype=torch.uint8, device=device),
+                x.any())
+
+            y = torch.zeros([1, 3, 400], dtype=torch.uint8, device=device)
+            y[-1][-1][-1] = 1
+            self.assertEqual(y, x.any(0, keepdim=True))
+
+            y = torch.zeros([2, 1, 400], dtype=torch.uint8, device=device)
+            y[-1][-1][-1] = 1
+            self.assertEqual(y, x.any(1, keepdim=True))
+
+            y = torch.zeros([2, 3, 1], dtype=torch.uint8, device=device)
+            y[-1][-1][-1] = 1
+            self.assertEqual(y, x.any(2, keepdim=True))
+
     def test_logical_all(self):
         devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
         for device in devices:
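Beyond the shape checks in the test above, the new path can also be cross-checked against an existing reduction; a small illustrative snippet (not part of the test suite):

    import torch

    x = torch.randint(0, 2, (2, 3, 400), dtype=torch.uint8)
    assert bool(x.any()) == bool((x != 0).sum())
    assert x.any(1, keepdim=True).shape == (2, 1, 400)
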
index b391a6b..2a93d42 100644 (file)
 - name: alias(Tensor self)
   self: grad
 
-# The two items below are necessary because TensorIterator doesn't work on
-# Variables (codegen does not unwrap the input Tensor for all(*) without this
-# line).
+# The four items below are necessary because TensorIterator doesn't work on
+# Variables (codegen does not unwrap the input Tensor for all() and any() ).
+- name: any(Tensor self)
+  self: not_implemented("any")
+
+- name: any(Tensor self, int64_t dim, bool keepdim)
+  self: not_implemented("any")
+
 - name: all(Tensor self)
   self: not_implemented("all")
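
Note: the two any() entries mirror the existing all() entries. The op is not differentiable (it only accepts torch.uint8 inputs, which cannot require grad), so the yaml lines exist purely so that the generated code unwraps the input Variable before TensorIterator runs, as the comment above explains. Illustrative check:

    import torch

    mask = torch.randn(4) > 0   # uint8 mask; uint8 tensors cannot require grad
    mask.any()                  # plain uint8 result, no autograd graph involved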