From: Peter Bell
Date: Tue, 17 Aug 2021 05:09:25 +0000 (-0700)
Subject: Workaround for cuFFT bug (#63327)
X-Git-Tag: accepted/tizen/8.0/unified/20231005.095509~976
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=877b649bc3c49b90ca87c53bb3e707977b163e1b;p=platform%2Fupstream%2Fpytorch.git

Workaround for cuFFT bug (#63327)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63327

Fixes #63152

Test Plan: Imported from OSS

Reviewed By: astaff

Differential Revision: D30343558

Pulled By: mruberry

fbshipit-source-id: 68e17a07650f65f397e26efc417e97e2ab302f82
---

diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp
index 8d8ac70..69729b2 100644
--- a/aten/src/ATen/native/cuda/SpectralOps.cpp
+++ b/aten/src/ATen/native/cuda/SpectralOps.cpp
@@ -234,6 +234,23 @@ void cufft_clear_plan_cache_impl(int64_t device_index) {
 namespace {
 constexpr int64_t cufft_max_ndim = 3;
 
+// "Large" here means a prime factor not special-cased by cuFFT
+// Ref: https://docs.nvidia.com/cuda/cufft/index.html#accuracy-and-performance
+bool has_large_prime_factor(int64_t n) {
+  constexpr int64_t first_large_prime = 11;
+  const std::array<int64_t, 4> prime_radices{{2, 3, 5, 7}};
+  for (auto prime : prime_radices) {
+    if (n < first_large_prime) {
+      return false;
+    }
+
+    while (n % prime == 0) {
+      n /= prime;
+    }
+  }
+  return n != 1;
+}
+
 // Execute a general fft operation (can be c2c, onesided r2c or onesided c2r)
 static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
                                IntArrayRef dim, bool forward) {
@@ -293,7 +310,22 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
   c10::optional<CuFFTConfig> uncached_plan;
   const CuFFTConfig * config = nullptr;
 
-  if (plan_cache.max_size() > 0) {
+  // Workaround for gh-63152, gh-58724
+  // Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used;
+  // Bluestein's algorithm is only used when a size has large prime factors,
+  // so sizes with only small prime factors can still be cached
+  bool use_caching = true;
+#ifdef CUFFT_VERSION
+  if (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
+    // Only cache plans for transforms with small prime factors
+    use_caching = std::none_of(
+        signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
+      return has_large_prime_factor(dim_size);
+    });
+  }
+#endif
+
+  if (use_caching && plan_cache.max_size() > 0) {
     guard.lock();
     if (plan_cache.max_size() > 0) {  // check again after acquiring the lock
       config = &plan_cache.lookup(Params);
diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py
index 9085254..e7e4832 100644
--- a/test/test_spectral_ops.py
+++ b/test/test_spectral_ops.py
@@ -1083,6 +1083,21 @@ class TestFFT(TestCase):
 
     @onlyOnCPUAndCUDA
     @skipCPUIfNoFFT
+    def test_fft_plan_repeatable(self, device):
+        # Regression test for gh-58724 and gh-63152
+        for n in [2048, 3199, 5999]:
+            a = torch.randn(n, device=device, dtype=torch.complex64)
+            res1 = torch.fft.fftn(a)
+            res2 = torch.fft.fftn(a.clone())
+            self.assertEqual(res1, res2)
+
+            a = torch.randn(n, device=device, dtype=torch.float64)
+            res1 = torch.fft.rfft(a)
+            res2 = torch.fft.rfft(a.clone())
+            self.assertEqual(res1, res2)
+
+    @onlyOnCPUAndCUDA
+    @skipCPUIfNoFFT
     @dtypes(torch.double)
     def test_istft_round_trip_simple_cases(self, device, dtype):
         """stft -> istft should recover the original signal"""
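
For illustration, the new has_large_prime_factor() helper can be mirrored in
plain Python (this sketch is not part of the patch) to check which of the
sizes in test_fft_plan_repeatable take the Bluestein path and therefore
bypass the plan cache on cufft 10.3:

    def has_large_prime_factor(n):
        # Divide out the radices cuFFT special-cases (2, 3, 5, 7); any
        # remainder means the size needs Bluestein's algorithm.
        first_large_prime = 11
        for prime in (2, 3, 5, 7):
            if n < first_large_prime:
                # Anything below 11 factors entirely into 2/3/5/7.
                return False
            while n % prime == 0:
                n //= prime
        return n != 1

    for n in (2048, 3199, 5999):
        print(n, has_large_prime_factor(n))
    # 2048 -> False  (2**11: small radices only, plan stays cacheable)
    # 3199 -> True   (7 * 457: Bluestein, caching disabled on cufft 10.3)
    # 5999 -> True   (7 * 857: Bluestein, caching disabled on cufft 10.3)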
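
Along the same lines, a minimal standalone repro of gh-63152, modeled on the
regression test above, would look roughly like this on an affected build
(a CUDA 11.1 / cufft 10.3 binary; 5999 = 7 * 857 forces Bluestein's
algorithm):

    import torch

    x = torch.randn(5999, dtype=torch.complex64, device='cuda')
    first = torch.fft.fftn(x)
    second = torch.fft.fftn(x.clone())
    # Before this workaround, the second call re-used a Bluestein plan
    # that cufft 10.3 cannot safely re-use and returned different values.
    assert torch.equal(first, second)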