);
}
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
+
+#if CUDNN_MAJOR >= 8
+ /* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
+ * in the default setting. TF32 convolutions have lower precision than FP32 convolutions.
+ * Hence, we set the math type to CUDNN_FMA_MATH to reproduce the old behavior.
+ */
+ CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_FMA_MATH));
+#endif
+
if (std::is_same<T, half>::value)
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
} catch (...) {
*/
ConvolutionAlgorithm(
const Handle& handle,
- const ConvolutionDescriptor<T>& conv,
- const FilterDescriptor<T>& filter,
- const TensorDescriptor<T>& input,
- const TensorDescriptor<T>& output)
+ const ConvolutionDescriptor<T>& convDesc,
+ const FilterDescriptor<T>& filterDesc,
+ const TensorDescriptor<T>& inputDesc,
+ const TensorDescriptor<T>& outputDesc)
{
+#if CUDNN_MAJOR >= 8
+ int requestedAlgoCount = 0, returnedAlgoCount = 0;
+ CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
+ std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount);
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetConvolutionForwardAlgorithm_v7(
+ handle.get(),
+ inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
+ requestedAlgoCount,
+ &returnedAlgoCount,
+ &results[0]
+ )
+ );
+
+ size_t free_memory, total_memory;
+ CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
+
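+ /* cudnnGetConvolutionForwardAlgorithm_v7 returns candidates sorted by expected
+  * execution time (fastest first); pick the first algorithm that succeeded, is not
+  * WINOGRAD_NONFUSED and whose workspace fits into the currently free device memory.
+  */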
+ bool found_conv_algorithm = false;
+ for (int i = 0; i < returnedAlgoCount; i++)
+ {
+ if (results[i].status == CUDNN_STATUS_SUCCESS &&
+ results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+ results[i].memory < free_memory)
+ {
+ found_conv_algorithm = true;
+ algo = results[i].algo;
+ workspace_size = results[i].memory;
+ break;
+ }
+ }
+
+ if (!found_conv_algorithm)
+ CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for convolution.");
+#else
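+ /* cudnnGetConvolutionForwardAlgorithm was removed in cuDNN 8; keep using it for older releases */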
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionForwardAlgorithm(
handle.get(),
- input.get(), filter.get(), conv.get(), output.get(),
+ inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
0, /* no memory limit */
&algo
)
);

CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionForwardWorkspaceSize(
handle.get(),
- input.get(), filter.get(), conv.get(), output.get(),
+ inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
algo, &workspace_size
)
);
+#endif
}
ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
TransposeConvolutionAlgorithm(
const Handle& handle,
- const ConvolutionDescriptor<T>& conv,
- const FilterDescriptor<T>& filter,
- const TensorDescriptor<T>& input,
- const TensorDescriptor<T>& output)
+ const ConvolutionDescriptor<T>& convDesc,
+ const FilterDescriptor<T>& filterDesc,
+ const TensorDescriptor<T>& inputDesc,
+ const TensorDescriptor<T>& outputDesc)
{
+#if CUDNN_MAJOR >= 8
+ int requestedAlgoCount = 0, returnedAlgoCount = 0;
+ CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
+ std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(requestedAlgoCount);
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetConvolutionBackwardDataAlgorithm_v7(
+ handle.get(),
+ filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
+ requestedAlgoCount,
+ &returnedAlgoCount,
+ &results[0]
+ )
+ );
+
+ size_t free_memory, total_memory;
+ CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
+
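+ /* results from cudnnGetConvolutionBackwardDataAlgorithm_v7 are likewise sorted by
+  * expected execution time; pick the fastest suitable algorithm, as in the forward case.
+  */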
+ bool found_conv_algorithm = false;
+ for (int i = 0; i < returnedAlgoCount; i++)
+ {
+ if (results[i].status == CUDNN_STATUS_SUCCESS &&
+ results[i].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
+ results[i].memory < free_memory)
+ {
+ found_conv_algorithm = true;
+ dalgo = results[i].algo;
+ workspace_size = results[i].memory;
+ break;
+ }
+ }
+
+ if (!found_conv_algorithm)
+ CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for transpose convolution.");
+#else
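+ /* cudnnGetConvolutionBackwardDataAlgorithm was removed in cuDNN 8; keep using it for older releases */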
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionBackwardDataAlgorithm(
handle.get(),
- filter.get(), input.get(), conv.get(), output.get(),
+ filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
0, /* no memory limit */
&dalgo
)
);

CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionBackwardDataWorkspaceSize(
handle.get(),
- filter.get(), input.get(), conv.get(), output.get(),
+ filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
dalgo, &workspace_size
)
);
+#endif
}
TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;