From 7ca4728e6dd4f3a706082dedf33c21771116070a Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 30 Aug 2021 20:17:12 -0700 Subject: [PATCH] Compile BatchLinearAlgebra without nvcc (#64146) Summary: These files only use CUDA library interfaces, so they don't actually need to be compiled with nvcc. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64146 Reviewed By: ezyang Differential Revision: D30633189 Pulled By: ngimel fbshipit-source-id: c9d0ae5259a10cb49332d31f0da89ad758736ea8 --- ...atchLinearAlgebra.cu => BatchLinearAlgebra.cpp} | 29 +++++++++++----------- ...nearAlgebraLib.cu => BatchLinearAlgebraLib.cpp} | 18 -------------- caffe2/CMakeLists.txt | 2 +- 3 files changed, 15 insertions(+), 34 deletions(-) rename aten/src/ATen/native/cuda/{BatchLinearAlgebra.cu => BatchLinearAlgebra.cpp} (99%) rename aten/src/ATen/native/cuda/{BatchLinearAlgebraLib.cu => BatchLinearAlgebraLib.cpp} (98%) diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp similarity index 99% rename from aten/src/ATen/native/cuda/BatchLinearAlgebra.cu rename to aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp index 4e806f0..7fdc55d 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp @@ -1701,7 +1701,7 @@ static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) #endif // USE_CUSOLVER } -REGISTER_DISPATCH(cholesky_stub, &cholesky_kernel) +REGISTER_CUDA_DISPATCH(cholesky_stub, &cholesky_kernel) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky_inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1773,7 +1773,7 @@ Tensor& cholesky_inverse_kernel_impl(Tensor &result, Tensor& infos, bool upper) } -REGISTER_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl); +REGISTER_CUDA_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1945,7 +1945,7 @@ 
static void apply_lu(const Tensor& input, const Tensor& pivots, const Tensor& in } } -REGISTER_DISPATCH(lu_stub, &apply_lu); +REGISTER_CUDA_DISPATCH(lu_stub, &apply_lu); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangular_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2039,7 +2039,7 @@ void triangular_solve_kernel(Tensor& A, Tensor& B, Tensor& infos, bool upper, bo } } -REGISTER_DISPATCH(triangular_solve_stub, &triangular_solve_kernel); +REGISTER_CUDA_DISPATCH(triangular_solve_stub, &triangular_solve_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ orgqr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2057,7 +2057,7 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau) { #endif } -REGISTER_DISPATCH(orgqr_stub, &orgqr_kernel_impl); +REGISTER_CUDA_DISPATCH(orgqr_stub, &orgqr_kernel_impl); void ormqr_kernel(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose) { #if defined(USE_CUSOLVER) @@ -2069,7 +2069,7 @@ void ormqr_kernel(const Tensor& input, const Tensor& tau, const Tensor& other, b #endif } -REGISTER_DISPATCH(ormqr_stub, &ormqr_kernel); +REGISTER_CUDA_DISPATCH(ormqr_stub, &ormqr_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2148,7 +2148,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) { } } -REGISTER_DISPATCH(geqrf_stub, &geqrf_kernel); +REGISTER_CUDA_DISPATCH(geqrf_stub, &geqrf_kernel); template static void apply_qr(Tensor& Q, Tensor& R, int64_t q_size_minus_2, int64_t r_size_minus_1, int64_t n_columns, @@ -2423,7 +2423,7 @@ void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, c #endif } -REGISTER_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel); +REGISTER_CUDA_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2513,7 +2513,7 @@ std::tuple eig_kernel_impl(const Tensor& self, bool& eigenvector return std::tuple(out_eigvals, out_eigvecs); } -REGISTER_DISPATCH(eig_stub, 
&eig_kernel_impl); +REGISTER_CUDA_DISPATCH(eig_stub, &eig_kernel_impl); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2599,7 +2599,7 @@ void linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& infos, }); } -REGISTER_DISPATCH(linalg_eig_stub, &linalg_eig_kernel); +REGISTER_CUDA_DISPATCH(linalg_eig_stub, &linalg_eig_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ svd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2671,8 +2671,7 @@ AT_ERROR("svd: MAGMA library not found in " std::tuple _svd_helper_cuda_legacy(const Tensor& self, bool some, bool compute_uv) { std::vector infos(batchCount(self), 0); - int64_t m = self.size(-2), n = self.size(-1); - int64_t k = std::min(m, n); + int64_t m = self.size(-2); char jobchar = compute_uv ? (some ? 'S' : 'A') : 'N'; @@ -2922,13 +2921,13 @@ static void lu_solve_trans_dispatch(const Tensor& b, const Tensor& lu, const Ten } } -REGISTER_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_dispatch); +REGISTER_CUDA_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_dispatch); static void lu_solve_dispatch(const Tensor& b, const Tensor& lu, const Tensor& pivots) { lu_solve_trans_dispatch(b, lu, pivots, 'N'); } -REGISTER_DISPATCH(lu_solve_stub, &lu_solve_dispatch); +REGISTER_CUDA_DISPATCH(lu_solve_stub, &lu_solve_dispatch); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -3112,7 +3111,7 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul } } -REGISTER_DISPATCH(lstsq_stub, &lstsq_kernel); +REGISTER_CUDA_DISPATCH(lstsq_stub, &lstsq_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ legacy_lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp similarity index 98% rename from aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu rename to aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp index bb9af14..13d67e5 100644 --- 
a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp @@ -143,10 +143,6 @@ static void apply_triangular_solve_batched(Tensor& A, Tensor& B, bool upper, boo cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; cublasSideMode_t side = CUBLAS_SIDE_LEFT; - auto A_data = A.data_ptr(); - auto B_data = B.data_ptr(); - auto A_mat_stride = matrixStride(A); - auto B_mat_stride = matrixStride(B); auto batch_size = cuda_int_cast(batchCount(A), "batch_size"); auto m = cuda_int_cast(A.size(-2), "m"); auto n = cuda_int_cast(A.size(-1), "n"); @@ -329,8 +325,6 @@ Tensor& _linalg_inv_out_helper_cuda_lib(Tensor& result, Tensor& infos_getrf, Ten result.zero_(); result.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(1); - const int batch_size = cuda_int_cast(batchCount(result), "batchCount"); - if (result.dim() > 2) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ apply_batched_inverse_lib( @@ -435,10 +429,6 @@ inline static void _apply_svd_lib_gesvdjBatched(const Tensor& self, Tensor& U, T auto U_data = U.data_ptr(); auto S_data = S.data_ptr(); auto VT_data = VT.data_ptr(); - auto self_stride = matrixStride(self); - auto U_stride = matrixStride(U); - auto S_stride = S.size(-1); - auto VT_stride = matrixStride(VT); int batchsize = cuda_int_cast(batchCount(self), "batch size"); int m = cuda_int_cast(self.size(-2), "m"); @@ -481,7 +471,6 @@ std::tuple _svd_helper_cuda_lib(const Tensor& self, bool at::Tensor infos = at::zeros({batch_size}, self.options().dtype(at::kInt)); const int64_t m = self.size(-2); const int64_t n = self.size(-1); - const int64_t k = std::min(m, n); Tensor U_working_copy, S_working_copy, VT_working_copy; std::tie(U_working_copy, S_working_copy, VT_working_copy) = \ @@ -686,11 +675,7 @@ inline static void apply_cholesky_cusolver_potrsBatched(Tensor& self_working_cop const int64_t nrhs = self_working_copy.size(-1); const int64_t lda = 
std::max(1, n); const int64_t batch_size = batchCount(self_working_copy); - const int64_t self_matrix_stride = matrixStride(self_working_copy); - scalar_t* self_working_copy_ptr = self_working_copy.data_ptr(); - const scalar_t* A_ptr = A_column_major_copy.data_ptr(); - const int64_t A_matrix_stride = matrixStride(A_column_major_copy); const int64_t ldb = std::max(1, A_column_major_copy.size(-1)); int* infos_ptr = infos.data_ptr(); @@ -882,8 +867,6 @@ void geqrf_cusolver(const Tensor& input, const Tensor& tau) { */ template static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose) { - using value_t = typename c10::scalar_value_type::type; - auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? (input.is_complex() ? CUBLAS_OP_C : CUBLAS_OP_T) : CUBLAS_OP_N; @@ -957,7 +940,6 @@ void ormqr_cusolver(const Tensor& input, const Tensor& tau, const Tensor& other, */ template inline static void apply_orgqr(Tensor& self, const Tensor& tau) { - using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); auto tau_data = tau.data_ptr(); auto self_matrix_stride = matrixStride(self); diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 1662a92..8b403a7 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -175,7 +175,7 @@ endif() if(BUILD_SPLIT_CUDA) # Splitting the source files that'll be in torch_cuda between torch_cuda_cu and torch_cuda_cpp foreach(tmp ${Caffe2_GPU_SRCS}) - if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*") + if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*") # Currently, torch_cuda_cu will have all the .cu files in aten, as well as some 
others that depend on those files list(APPEND Caffe2_GPU_SRCS_CU ${tmp}) else() -- 2.7.4