From 1ee66a5278f8c323210e051465d0c0cab2c9ccba Mon Sep 17 00:00:00 2001
From: Jane Xu
Date: Fri, 17 Sep 2021 11:45:11 -0700
Subject: [PATCH] Remove CUDA 9.2 references conditionals and workarounds (#65070)

Summary: Title says it all: now that CUDA 9.2 is no longer supported, this removes the
CUDA 9.2-specific preprocessor conditionals, workarounds, and comments that referenced it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/65070

Reviewed By: malfet

Differential Revision: D30966464

Pulled By: janeyx99

fbshipit-source-id: e454906fd5d7d321d390939ba5d237e1d9b150f8
---
 aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h |  3 +-
 c10/util/Optional.cpp                             |  4 -
 c10/util/Optional.h                               | 41 ----------
 caffe2/core/operator.h                            |  6 --
 test/cpp/jit/test_gpu.cpp                         | 98 +++++++++++------------
 torch/__init__.py                                 |  3 +-
 torch/autograd/profiler.py                        |  4 +-
 torch/csrc/api/include/torch/nn/cloneable.h       | 15 +---
 8 files changed, 54 insertions(+), 120 deletions(-)

diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h
index 72d2f65..2c48a0d 100644
--- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h
+++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h
@@ -7,8 +7,7 @@
 #include
 #include
 
-#if defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) && CUSOLVER_VERSION >= 10200
-// some cusolver functions don't work well on cuda 9.2 or cuda 10.1.105, cusolver is used on cuda >= 10.1.243
+#if defined(CUDART_VERSION) && defined(CUSOLVER_VERSION)
 #define USE_CUSOLVER
 #endif
 
diff --git a/c10/util/Optional.cpp b/c10/util/Optional.cpp
index dd78eee..b98dc56 100644
--- a/c10/util/Optional.cpp
+++ b/c10/util/Optional.cpp
@@ -3,9 +3,6 @@
 #include
 
-// CUDA 9.2 and below fail while trying to compile default move constructor
-// see https://github.com/pytorch/csprng/issues/84
-#if (!defined(__CUDA_ARCH__) || !defined(CUDA_VERSION) || CUDA_VERSION > 9200)
 static_assert(
     C10_IS_TRIVIALLY_COPYABLE(c10::optional),
     "c10::optional should be trivially copyable");
 
@@ -18,4 +15,3 @@ static_assert(
 static_assert(
     sizeof(c10::optional) == sizeof(c10::IntArrayRef),
     "c10::optional should be size-optimized");
-#endif
diff --git a/c10/util/Optional.h b/c10/util/Optional.h
index 7044c79..c2f87eb 100644
--- a/c10/util/Optional.h
+++ b/c10/util/Optional.h
@@ -499,9 +499,6 @@ template
 struct is_arrayref> : std::true_type {};
 } // namespace detail_
 
-// CUDA 9.2 and below fail while trying to compile default move constructor
-// see https://github.com/pytorch/csprng/issues/84
-#if (!defined(__CUDA_ARCH__) || !defined(CUDA_VERSION) || CUDA_VERSION > 9200)
 template
 using OptionalBase = std::conditional_t<
     detail_::is_arrayref::value,
@@ -524,23 +521,9 @@ using OptionalBase = std::conditional_t<
                                   // trivial
                                   // destructor
         optional_base>>>>;
-#else
-template
-using OptionalBase = std::conditional_t<
-    detail_::is_arrayref::value,
-    arrayref_optional_base,
-    std::conditional_t<
-        std::is_trivially_destructible::value, // if possible
-        constexpr_optional_base>, // use base with
-                                  // trivial destructor
-        optional_base>>>;
-#endif
 
 template
 class optional : private OptionalBase {
-// CUDA 9.2 and below fail while trying to compile default move constructor
-// see https://github.com/pytorch/csprng/issues/84
-#if (!defined(__CUDA_ARCH__) || !defined(CUDA_VERSION) || CUDA_VERSION > 9200)
   template // re-declaration for nvcc on Windows.
   using OptionalBase = std::conditional_t<
       detail_::is_arrayref::value,
@@ -565,17 +548,6 @@ class optional : private OptionalBase {
                                     // trivial
                                     // destructor
           optional_base>>>>;
-#else
-  template
-  using OptionalBase = std::conditional_t<
-      detail_::is_arrayref::value,
-      arrayref_optional_base,
-      std::conditional_t<
-          std::is_trivially_destructible::value, // if possible
-          constexpr_optional_base>, // use base with
-                                    // trivial destructor
-          optional_base>>>;
-#endif
 
   static_assert(
       !std::is_same::type, nullopt_t>::value,
@@ -634,20 +606,7 @@ class optional : private OptionalBase {
   constexpr optional(nullopt_t) noexcept : OptionalBase(){};
 
   optional(const optional& rhs) = default;
-
-// CUDA 9.2 and below fail while trying to compile default move constructor
-// see https://github.com/pytorch/csprng/issues/84
-#if (!defined(__CUDA_ARCH__) || !defined(CUDA_VERSION) || CUDA_VERSION > 9200)
   optional(optional&& rhs) = default;
-#else
-  optional(optional&& rhs) noexcept(
-      std::is_nothrow_move_constructible::value) {
-    if (rhs.initialized()) {
-      ::new (static_cast(dataptr())) T(std::move(*rhs));
-      OptionalBase::setInitialized(true);
-    }
-  }
-#endif
 
   // see https://github.com/akrzemi1/Optional/issues/16
   // and https://en.cppreference.com/w/cpp/utility/optional/optional,
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index 15d1ead..b670845 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -731,14 +731,8 @@ inline vector OperatorBase::GetVectorFromIValueList(
 
 // OP_SINGLE_ARG provides a shorter initialization choice for initialization of
 // member variables for the class constructors.
-// This is a workaround for CUDA9.2 and GCC7
-#if defined(CUDART_VERSION) && CUDART_VERSION >= 9020 && __GNUC__ >= 7
-#define OP_SINGLE_ARG(type, name, variable, default) \
-  variable(this->template GetSingleArgument(name, (default)))
-#else
 #define OP_SINGLE_ARG(type, name, variable, default) \
   variable(OperatorBase::GetSingleArgument(name, (default)))
-#endif
 
 // INPUT_TAGS and OUTPUT_TAGS are optional features to name the indices of the
 // operator's inputs and outputs, in order to avoid confusion. For example, for
diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp
index 1a0ee7b..4674082 100644
--- a/test/cpp/jit/test_gpu.cpp
+++ b/test/cpp/jit/test_gpu.cpp
@@ -2606,44 +2606,40 @@ TEST(NVFuserTest, FusionUnaryOps_CUDA) {
 
   using OpTuple = std::tuple;
 
-  // [Note: explicit tuple type for uniform initialization list]
-  // Tuple type must be explicitly specified for each uniform initialization
-  // list within the vector to make this code compatible with some old env
-  // which we still need to support. eg. gcc 5.4 + cuda 9.2.
   std::vector ops{
-      OpTuple{at::abs, UnaryOpType::Abs, "abs"},
-      OpTuple{at::acos, UnaryOpType::Acos, "acos"},
-      OpTuple{at::asin, UnaryOpType::Asin, "asin"},
-      OpTuple{at::atan, UnaryOpType::Atan, "atan"},
+      {at::abs, UnaryOpType::Abs, "abs"},
+      {at::acos, UnaryOpType::Acos, "acos"},
+      {at::asin, UnaryOpType::Asin, "asin"},
+      {at::atan, UnaryOpType::Atan, "atan"},
       // There does not appear to be an appropriate ATen function for atanh
-      // OpTuple{at::atanh, UnaryOpType::Atanh, "atanh" },
-      OpTuple{at::ceil, UnaryOpType::Ceil, "ceil"},
-      OpTuple{at::cos, UnaryOpType::Cos, "cos"},
-      OpTuple{at::cosh, UnaryOpType::Cosh, "cosh"},
-      OpTuple{at::erf, UnaryOpType::Erf, "erf"},
-      OpTuple{at::erfc, UnaryOpType::Erfc, "erfc"},
-      OpTuple{at::exp, UnaryOpType::Exp, "exp"},
-      OpTuple{at::expm1, UnaryOpType::Expm1, "expm1"},
-      OpTuple{at::floor, UnaryOpType::Floor, "floor"},
-      OpTuple{at::frac, UnaryOpType::Frac, "frac"},
-      OpTuple{at::gelu, UnaryOpType::Gelu, "gelu"},
-      OpTuple{at::lgamma, UnaryOpType::Lgamma, "lgamma"},
-      OpTuple{at::log, UnaryOpType::Log, "log"},
-      OpTuple{at::log10, UnaryOpType::Log10, "log10"},
-      OpTuple{at::log1p, UnaryOpType::Log1p, "log1p"},
-      OpTuple{at::log2, UnaryOpType::Log2, "log2"},
-      OpTuple{at::neg, UnaryOpType::Neg, "neg"},
-      OpTuple{at::reciprocal, UnaryOpType::Reciprocal, "reciprocal"},
-      OpTuple{at::relu, UnaryOpType::Relu, "relu"},
-      OpTuple{at::round, UnaryOpType::Round, "round"},
-      OpTuple{at::rsqrt, UnaryOpType::Rsqrt, "rsqrt"},
-      OpTuple{at::sigmoid, UnaryOpType::Sigmoid, "sigmoid"},
-      OpTuple{at::sin, UnaryOpType::Sin, "sin"},
-      OpTuple{at::sinh, UnaryOpType::Sinh, "sinh"},
-      OpTuple{at::sqrt, UnaryOpType::Sqrt, "sqrt"},
-      OpTuple{at::tan, UnaryOpType::Tan, "tan"},
-      OpTuple{at::tanh, UnaryOpType::Tanh, "tanh"},
-      OpTuple{at::trunc, UnaryOpType::Trunc, "trunc"}};
+      // {at::atanh, UnaryOpType::Atanh, "atanh" },
+      {at::ceil, UnaryOpType::Ceil, "ceil"},
+      {at::cos, UnaryOpType::Cos, "cos"},
+      {at::cosh, UnaryOpType::Cosh, "cosh"},
+      {at::erf, UnaryOpType::Erf, "erf"},
+      {at::erfc, UnaryOpType::Erfc, "erfc"},
+      {at::exp, UnaryOpType::Exp, "exp"},
+      {at::expm1, UnaryOpType::Expm1, "expm1"},
+      {at::floor, UnaryOpType::Floor, "floor"},
+      {at::frac, UnaryOpType::Frac, "frac"},
+      {at::gelu, UnaryOpType::Gelu, "gelu"},
+      {at::lgamma, UnaryOpType::Lgamma, "lgamma"},
+      {at::log, UnaryOpType::Log, "log"},
+      {at::log10, UnaryOpType::Log10, "log10"},
+      {at::log1p, UnaryOpType::Log1p, "log1p"},
+      {at::log2, UnaryOpType::Log2, "log2"},
+      {at::neg, UnaryOpType::Neg, "neg"},
+      {at::reciprocal, UnaryOpType::Reciprocal, "reciprocal"},
+      {at::relu, UnaryOpType::Relu, "relu"},
+      {at::round, UnaryOpType::Round, "round"},
+      {at::rsqrt, UnaryOpType::Rsqrt, "rsqrt"},
+      {at::sigmoid, UnaryOpType::Sigmoid, "sigmoid"},
+      {at::sin, UnaryOpType::Sin, "sin"},
+      {at::sinh, UnaryOpType::Sinh, "sinh"},
+      {at::sqrt, UnaryOpType::Sqrt, "sqrt"},
+      {at::tan, UnaryOpType::Tan, "tan"},
+      {at::tanh, UnaryOpType::Tanh, "tanh"},
+      {at::trunc, UnaryOpType::Trunc, "trunc"}};
 
   std::for_each(ops.begin(), ops.end(), [](OpTuple& op) {
     test_op(
@@ -2680,14 +2676,13 @@ TEST(NVFuserTest, FusionBinaryOps_CUDA) {
   using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&);
   using OpTuple = std::tuple;
 
-  // see [Note: explicit tuple type for uniform initialization list]
   std::vector logic_ops{
-      OpTuple{at::eq, BinaryOpType::Eq, "eq"},
-      OpTuple{at::ge, BinaryOpType::GE, "ge"},
-      OpTuple{at::gt, BinaryOpType::GT, "gt"},
-      OpTuple{at::le, BinaryOpType::LE, "le"},
-      OpTuple{at::lt, BinaryOpType::LT, "lt"},
-      OpTuple{at::ne, BinaryOpType::NE, "ne"}};
+      {at::eq, BinaryOpType::Eq, "eq"},
+      {at::ge, BinaryOpType::GE, "ge"},
+      {at::gt, BinaryOpType::GT, "gt"},
+      {at::le, BinaryOpType::LE, "le"},
+      {at::lt, BinaryOpType::LT, "lt"},
+      {at::ne, BinaryOpType::NE, "ne"}};
 
   std::for_each(logic_ops.begin(), logic_ops.end(), [](OpTuple& op) {
     test_op(
@@ -2709,18 +2704,17 @@ TEST(NVFuserTest, FusionBinaryOps_CUDA) {
             std::make_pair(ValType::TensorView, DataType::Float)));
   });
 
-  // see [Note: explicit tuple type for uniform initialization list]
   std::vector math_ops{
-      OpTuple{at::atan2, BinaryOpType::Atan2, "atan2"},
-      OpTuple{at::div, BinaryOpType::Div, "div"},
-      OpTuple{at::fmod, BinaryOpType::Fmod, "fmod"},
-      OpTuple{at::max, BinaryOpType::Max, "max"},
-      OpTuple{at::min, BinaryOpType::Min, "min"},
-      OpTuple{at::mul, BinaryOpType::Mul, "mul"},
-      OpTuple{at::pow, BinaryOpType::Pow, "pow"},
+      {at::atan2, BinaryOpType::Atan2, "atan2"},
+      {at::div, BinaryOpType::Div, "div"},
+      {at::fmod, BinaryOpType::Fmod, "fmod"},
+      {at::max, BinaryOpType::Max, "max"},
+      {at::min, BinaryOpType::Min, "min"},
+      {at::mul, BinaryOpType::Mul, "mul"},
+      {at::pow, BinaryOpType::Pow, "pow"},
       // NOTE: Remainder does not match the Aten impl exactly
       // despite using an identical function.
-      OpTuple{at::remainder, BinaryOpType::Remainder, "remainder"},
+      {at::remainder, BinaryOpType::Remainder, "remainder"},
   };
 
   std::for_each(math_ops.begin(), math_ops.end(), [](OpTuple& op) {
diff --git a/torch/__init__.py b/torch/__init__.py
index 5740b7a..18c9b08 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -106,8 +106,7 @@ if sys.platform == 'win32':
     try:
         ctypes.CDLL('vcruntime140.dll')
         ctypes.CDLL('msvcp140.dll')
-        if cuda_version not in ('9.2', '10.0'):
-            ctypes.CDLL('vcruntime140_1.dll')
+        ctypes.CDLL('vcruntime140_1.dll')
     except OSError:
         print('''Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.
                  It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe''')
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index c38ad99..c121b11 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -654,8 +654,8 @@ def parse_nvprof_trace(path):
     unique = EnforceUnique()
     for row in conn.execute(kernel_query):
         unique.see(row['marker_id'], row['runtime_id'])
-        # 211 is cudaKernelLaunch for cuda >= 9.2; 13 is for older cuda versions
-        assert (row['cbid'] == 211) or (row['cbid'] == 13)
+        # 211 is cudaKernelLaunch for cuda >= 9.2
+        assert (row['cbid'] == 211)
         evt = functions_map[row['marker_id']]
         evt.append_kernel(row['kernel_name'],
                           0,
diff --git a/torch/csrc/api/include/torch/nn/cloneable.h b/torch/csrc/api/include/torch/nn/cloneable.h
index 463784a..cc735d6 100644
--- a/torch/csrc/api/include/torch/nn/cloneable.h
+++ b/torch/csrc/api/include/torch/nn/cloneable.h
@@ -42,12 +42,8 @@ class Cloneable : public virtual Module {
     copy->buffers_.clear();
     copy->children_.clear();
     copy->reset();
-    // [[this pointer note]]
-    // Don't remove 'this' pointer, nvcc needs it to be explicitly given in some envs.
-    // eg. ubuntu 16.04 + gcc 5.x + cuda 9.2
-    //     ubuntu 16.04 + gcc 7.x + cuda 9.2
     TORCH_CHECK(
-        copy->parameters_.size() == this->parameters_.size(),
+        copy->parameters_.size() == parameters_.size(),
         "The cloned module does not have the same number of "
         "parameters as the original module after calling reset(). "
" "Are you sure you called register_parameter() inside reset() " @@ -58,9 +54,8 @@ class Cloneable : public virtual Module { tensor.to(*device) : autograd::Variable(tensor).clone(); copy->parameters_[parameter.key()].set_data(data); } - // Don't remove 'this' pointer. See [[this pointer note]] TORCH_CHECK( - copy->buffers_.size() == this->buffers_.size(), + copy->buffers_.size() == buffers_.size(), "The cloned module does not have the same number of " "buffers as the original module after calling reset(). " "Are you sure you called register_buffer() inside reset() " @@ -71,15 +66,13 @@ class Cloneable : public virtual Module { tensor.to(*device) : autograd::Variable(tensor).clone(); copy->buffers_[buffer.key()].set_data(data); } - // Don't remove 'this' pointer. See [[this pointer note]] TORCH_CHECK( - copy->children_.size() == this->children_.size(), + copy->children_.size() == children_.size(), "The cloned module does not have the same number of " "child modules as the original module after calling reset(). " "Are you sure you called register_module() inside reset() " "and not the constructor?"); - // Don't remove 'this' pointer. See [[this pointer note]] - for (const auto& child : this->children_) { + for (const auto& child : children_) { copy->children_[child.key()]->clone_(*child.value(), device); } return copy; -- 2.7.4