From 89362247a5966fe2d312c2d3879ee6d67b7991d6 Mon Sep 17 00:00:00 2001
From: YashasSamaga
Date: Tue, 30 Jun 2020 21:51:23 +0530
Subject: [PATCH] add cuDNN 8 support

---
 modules/dnn/CMakeLists.txt                      |  6 +-
 .../src/cuda4dnn/csl/cudnn/convolution.hpp      | 56 +++++++++++++++++--
 .../csl/cudnn/transpose_convolution.hpp         | 47 ++++++++++++++--
 3 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index 3fa7fd69c34e..9947e010600c 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -18,7 +18,11 @@ if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
   add_definitions(-DCV_OCL4DNN=1)
 endif()
 
-ocv_option(OPENCV_DNN_CUDA "Build with CUDA support" HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
+ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
+    HAVE_CUDA
+    AND HAVE_CUBLAS
+    AND HAVE_CUDNN
+)
 
 if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
   add_definitions(-DCV_CUDA4DNN=1)
diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
index 679429ba994c..52ba6561c84a 100644
--- a/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
@@ -224,6 +224,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
                     );
                 }
                 CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
+
+#if CUDNN_MAJOR >= 8
+                /* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
+                 * in the default setting. TF32 convolutions have lower precision than FP32.
+                 * Hence, we set the math type to CUDNN_FMA_MATH to reproduce old behavior.
+                 */
+                CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_FMA_MATH));
+#endif
+
                 if (std::is_same<T, half>::value)
                     CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
             } catch (...) {
@@ -253,15 +262,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
          */
         ConvolutionAlgorithm(
             const Handle& handle,
-            const ConvolutionDescriptor<T>& conv,
-            const FilterDescriptor<T>& filter,
-            const TensorDescriptor<T>& input,
-            const TensorDescriptor<T>& output)
+            const ConvolutionDescriptor<T>& convDesc,
+            const FilterDescriptor<T>& filterDesc,
+            const TensorDescriptor<T>& inputDesc,
+            const TensorDescriptor<T>& outputDesc)
         {
+#if CUDNN_MAJOR >= 8
+            int requestedAlgoCount = 0, returnedAlgoCount = 0;
+            CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
+            std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount);
+            CUDA4DNN_CHECK_CUDNN(
+                cudnnGetConvolutionForwardAlgorithm_v7(
+                    handle.get(),
+                    inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
+                    requestedAlgoCount,
+                    &returnedAlgoCount,
+                    &results[0]
+                )
+            );
+
+            size_t free_memory, total_memory;
+            CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
+
+            bool found_conv_algorithm = false;
+            for (int i = 0; i < returnedAlgoCount; i++)
+            {
+                if (results[i].status == CUDNN_STATUS_SUCCESS &&
+                    results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+                    results[i].memory < free_memory)
+                {
+                    found_conv_algorithm = true;
+                    algo = results[i].algo;
+                    workspace_size = results[i].memory;
+                    break;
+                }
+            }
+
+            if (!found_conv_algorithm)
+                CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for convolution.");
+#else
             CUDA4DNN_CHECK_CUDNN(
                 cudnnGetConvolutionForwardAlgorithm(
                     handle.get(),
-                    input.get(), filter.get(), conv.get(), output.get(),
+                    inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
                     CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                     0, /* no memory limit */
                     &algo
@@ -271,10 +314,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
             CUDA4DNN_CHECK_CUDNN(
                 cudnnGetConvolutionForwardWorkspaceSize(
                     handle.get(),
-                    input.get(), filter.get(), conv.get(), output.get(),
+                    inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
                     algo, &workspace_size
                 )
             );
+#endif
         }
 
         ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp
index d1d26aa2cc6d..e1596b96ccbc 100644
--- a/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp
@@ -30,15 +30,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
 
         TransposeConvolutionAlgorithm(
             const Handle& handle,
-            const ConvolutionDescriptor<T>& conv,
-            const FilterDescriptor<T>& filter,
-            const TensorDescriptor<T>& input,
-            const TensorDescriptor<T>& output)
+            const ConvolutionDescriptor<T>& convDesc,
+            const FilterDescriptor<T>& filterDesc,
+            const TensorDescriptor<T>& inputDesc,
+            const TensorDescriptor<T>& outputDesc)
         {
+#if CUDNN_MAJOR >= 8
+            int requestedAlgoCount = 0, returnedAlgoCount = 0;
+            CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
+            std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(requestedAlgoCount);
+            CUDA4DNN_CHECK_CUDNN(
+                cudnnGetConvolutionBackwardDataAlgorithm_v7(
+                    handle.get(),
+                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
+                    requestedAlgoCount,
+                    &returnedAlgoCount,
+                    &results[0]
+                )
+            );
+
+            size_t free_memory, total_memory;
+            CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
+
+            bool found_conv_algorithm = false;
+            for (int i = 0; i < returnedAlgoCount; i++)
+            {
+                if (results[i].status == CUDNN_STATUS_SUCCESS &&
+                    results[i].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
+                    results[i].memory < free_memory)
+                {
+                    found_conv_algorithm = true;
+                    dalgo = results[i].algo;
+                    workspace_size = results[i].memory;
+                    break;
+                }
+            }
+
+            if (!found_conv_algorithm)
+                CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for transpose convolution.");
+#else
             CUDA4DNN_CHECK_CUDNN(
                 cudnnGetConvolutionBackwardDataAlgorithm(
                     handle.get(),
-                    filter.get(), input.get(), conv.get(), output.get(),
+                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                     CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
                     0, /* no memory limit */
                     &dalgo
@@ -48,10 +82,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
             CUDA4DNN_CHECK_CUDNN(
                 cudnnGetConvolutionBackwardDataWorkspaceSize(
                     handle.get(),
-                    filter.get(), input.get(), conv.get(), output.get(),
+                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                     dalgo, &workspace_size
                 )
            );
+#endif
         }
 
         TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
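
Note on the math-type change above (illustrative, not part of the applied diff): CUDNN_FMA_MATH is new in cuDNN 8, where the default math mode may use TF32 tensor ops for FP32 convolutions; on cuDNN 7 and below the default is already FMA-only, which is why the call is guarded. A minimal sketch of that decision, using a hypothetical helper name setFp32MathType and a hypothetical allow_tf32 flag (error checking omitted):

    #include <cudnn.h>

    /* Restrict an FP32 convolution to FMA math unless the caller explicitly
     * allows TF32. On cuDNN 7 and below the default math mode is already FMA. */
    static void setFp32MathType(cudnnConvolutionDescriptor_t desc, bool allow_tf32)
    {
    #if CUDNN_MAJOR >= 8
        cudnnSetConvolutionMathType(desc, allow_tf32 ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH);
    #else
        (void)allow_tf32;  /* TF32 does not exist before cuDNN 8 */
        cudnnSetConvolutionMathType(desc, CUDNN_DEFAULT_MATH);
    #endif
    }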
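
The larger change replaces cudnnGetConvolutionForwardAlgorithm and cudnnGetConvolutionBackwardDataAlgorithm, which are no longer available in cuDNN 8, with the *_v7 heuristics queries that return a ranked list of candidates. The standalone sketch below (illustrative only; the function name chooseForwardAlgo is hypothetical, error checking is simplified, and the handle and descriptors are assumed to be created elsewhere) shows the forward-path pattern used in the patch: query the candidates, then take the first successful one whose workspace fits in free device memory, skipping WINOGRAD_NONFUSED as the patch does.

    #include <cudnn.h>
    #include <cuda_runtime.h>
    #include <stdexcept>
    #include <vector>

    /* Pick a forward convolution algorithm with the cuDNN 8 heuristics API and
     * report the workspace it needs. Candidates are returned ordered by expected
     * performance, so the first acceptable entry is taken. */
    static cudnnConvolutionFwdAlgo_t chooseForwardAlgo(
        cudnnHandle_t handle,
        cudnnTensorDescriptor_t inputDesc,
        cudnnFilterDescriptor_t filterDesc,
        cudnnConvolutionDescriptor_t convDesc,
        cudnnTensorDescriptor_t outputDesc,
        size_t& workspace_size)
    {
        int requestedAlgoCount = 0, returnedAlgoCount = 0;
        cudnnGetConvolutionForwardAlgorithmMaxCount(handle, &requestedAlgoCount);

        std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount);
        cudnnGetConvolutionForwardAlgorithm_v7(
            handle, inputDesc, filterDesc, convDesc, outputDesc,
            requestedAlgoCount, &returnedAlgoCount, results.data());

        size_t free_memory = 0, total_memory = 0;
        cudaMemGetInfo(&free_memory, &total_memory);

        for (int i = 0; i < returnedAlgoCount; i++)
        {
            if (results[i].status == CUDNN_STATUS_SUCCESS &&
                results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
                results[i].memory < free_memory)
            {
                /* the perf result carries the workspace size, so a separate
                 * cudnnGetConvolutionForwardWorkspaceSize call is not needed */
                workspace_size = results[i].memory;
                return results[i].algo;
            }
        }
        throw std::runtime_error("no suitable cuDNN convolution algorithm found");
    }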