From: Peter Bell
Date: Tue, 17 Aug 2021 05:09:25 +0000 (-0700)
Subject: Workaround for cuFFT bug (#63327)
X-Git-Tag: accepted/tizen/8.0/unified/20231005.095509~976
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=877b649bc3c49b90ca87c53bb3e707977b163e1b;p=platform%2Fupstream%2Fpytorch.git

Workaround for cuFFT bug (#63327)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63327

Fixes #63152

Test Plan: Imported from OSS

Reviewed By: astaff

Differential Revision: D30343558

Pulled By: mruberry

fbshipit-source-id: 68e17a07650f65f397e26efc417e97e2ab302f82
---

diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp
index 8d8ac70..69729b2 100644
--- a/aten/src/ATen/native/cuda/SpectralOps.cpp
+++ b/aten/src/ATen/native/cuda/SpectralOps.cpp
@@ -234,6 +234,23 @@ void cufft_clear_plan_cache_impl(int64_t device_index) {
 namespace {
 constexpr int64_t cufft_max_ndim = 3;
 
+// "Large" here means a prime factor not special-cased by cuFFT
+// Ref: https://docs.nvidia.com/cuda/cufft/index.html#accuracy-and-performance
+bool has_large_prime_factor(int64_t n) {
+  constexpr int64_t first_large_prime = 11;
+  const std::array<int64_t, 4> prime_radices{{2, 3, 5, 7}};
+  for (auto prime : prime_radices) {
+    if (n < first_large_prime) {
+      return false;
+    }
+
+    while (n % prime == 0) {
+      n /= prime;
+    }
+  }
+  return n != 1;
+}
+
 // Execute a general fft operation (can be c2c, onesided r2c or onesided c2r)
 static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
                                IntArrayRef dim, bool forward) {
@@ -293,7 +310,22 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
   c10::optional<CuFFTConfig> uncached_plan;
   const CuFFTConfig * config = nullptr;
 
-  if (plan_cache.max_size() > 0) {
+  // Workaround for gh-63152, gh-58724
+  // Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used;
+  // Bluestein's algorithm is only used when a size has large prime factors,
+  // so sizes with only small prime factors can still be cached
+  bool use_caching = true;
+#ifdef CUFFT_VERSION
+  if (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
+    // Only cache plans for transforms with small prime factors
+    use_caching = std::none_of(
+        signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
+      return has_large_prime_factor(dim_size);
+    });
+  }
+#endif
+
+  if (use_caching && plan_cache.max_size() > 0) {
     guard.lock();
     if (plan_cache.max_size() > 0) {  // check again after acquiring the lock
       config = &plan_cache.lookup(Params);
diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py
index 9085254..e7e4832 100644
--- a/test/test_spectral_ops.py
+++ b/test/test_spectral_ops.py
@@ -1083,6 +1083,21 @@ class TestFFT(TestCase):
 
     @onlyOnCPUAndCUDA
     @skipCPUIfNoFFT
+    def test_fft_plan_repeatable(self, device):
+        # Regression test for gh-58724 and gh-63152
+        for n in [2048, 3199, 5999]:
+            a = torch.randn(n, device=device, dtype=torch.complex64)
+            res1 = torch.fft.fftn(a)
+            res2 = torch.fft.fftn(a.clone())
+            self.assertEqual(res1, res2)
+
+            a = torch.randn(n, device=device, dtype=torch.float64)
+            res1 = torch.fft.rfft(a)
+            res2 = torch.fft.rfft(a.clone())
+            self.assertEqual(res1, res2)
+
+    @onlyOnCPUAndCUDA
+    @skipCPUIfNoFFT
     @dtypes(torch.double)
     def test_istft_round_trip_simple_cases(self, device, dtype):
         """stft -> istft should recover the original signal"""
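
For illustration, the new has_large_prime_factor() helper can be mirrored in
plain Python (this sketch is not part of the patch) to check which of the
sizes in test_fft_plan_repeatable take the Bluestein path and therefore
bypass the plan cache on cufft 10.3:

    def has_large_prime_factor(n):
        # Divide out the radices cuFFT special-cases (2, 3, 5, 7); any
        # remainder means the size needs Bluestein's algorithm.
        first_large_prime = 11
        for prime in (2, 3, 5, 7):
            if n < first_large_prime:
                # Anything below 11 factors entirely into 2/3/5/7.
                return False
            while n % prime == 0:
                n //= prime
        return n != 1

    for n in (2048, 3199, 5999):
        print(n, has_large_prime_factor(n))
    # 2048 -> False  (2**11: small radices only, plan stays cacheable)
    # 3199 -> True   (7 * 457: Bluestein, caching disabled on cufft 10.3)
    # 5999 -> True   (7 * 857: Bluestein, caching disabled on cufft 10.3)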
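
Along the same lines, a minimal standalone repro of gh-63152, modeled on the
regression test above, would look roughly like this on an affected build
(a CUDA 11.1 / cufft 10.3 binary; 5999 = 7 * 857 forces Bluestein's
algorithm):

    import torch

    x = torch.randn(5999, dtype=torch.complex64, device='cuda')
    first = torch.fft.fftn(x)
    second = torch.fft.fftn(x.clone())
    # Before this workaround, the second call re-used a Bluestein plan
    # that cufft 10.3 cannot safely re-use and returned different values.
    assert torch.equal(first, second)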