Skip to content

Commit

Permalink
add cuDNN 8 support
Browse files Browse the repository at this point in the history
  • Loading branch information
YashasSamaga authored and Spot 1 Jetson committed Aug 22, 2024
1 parent 980cd5b commit 8936224
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 13 deletions.
6 changes: 5 additions & 1 deletion modules/dnn/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
add_definitions(-DCV_OCL4DNN=1)
endif()

ocv_option(OPENCV_DNN_CUDA "Build with CUDA support" HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
HAVE_CUDA
AND HAVE_CUBLAS
AND HAVE_CUDNN
)

if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
add_definitions(-DCV_CUDA4DNN=1)
Expand Down
56 changes: 50 additions & 6 deletions modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
);
}
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));

#if CUDNN_MAJOR >= 8
/* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
* in the default setting. TF32 convolutions have lower precision than FP32.
* Hence, we set the math type to CUDNN_FMA_MATH to reproduce old behavior.
*/
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_FMA_MATH));
#endif

if (std::is_same<T, half>::value)
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
} catch (...) {
Expand Down Expand Up @@ -253,15 +262,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
*/
ConvolutionAlgorithm(
const Handle& handle,
const ConvolutionDescriptor<T>& conv,
const FilterDescriptor<T>& filter,
const TensorDescriptor<T>& input,
const TensorDescriptor<T>& output)
const ConvolutionDescriptor<T>& convDesc,
const FilterDescriptor<T>& filterDesc,
const TensorDescriptor<T>& inputDesc,
const TensorDescriptor<T>& outputDesc)
{
#if CUDNN_MAJOR >= 8
int requestedAlgoCount = 0, returnedAlgoCount = 0;
CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount);
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionForwardAlgorithm_v7(
handle.get(),
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
requestedAlgoCount,
&returnedAlgoCount,
&results[0]
)
);

size_t free_memory, total_memory;
CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));

bool found_conv_algorithm = false;
for (int i = 0; i < returnedAlgoCount; i++)
{
if (results[i].status == CUDNN_STATUS_SUCCESS &&
results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
results[i].memory < free_memory)
{
found_conv_algorithm = true;
algo = results[i].algo;
workspace_size = results[i].memory;
break;
}
}

if (!found_conv_algorithm)
CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for convolution.");
#else
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionForwardAlgorithm(
handle.get(),
input.get(), filter.get(), conv.get(), output.get(),
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
0, /* no memory limit */
&algo
Expand All @@ -271,10 +314,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionForwardWorkspaceSize(
handle.get(),
input.get(), filter.get(), conv.get(), output.get(),
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
algo, &workspace_size
)
);
#endif
}

ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
Expand Down
47 changes: 41 additions & 6 deletions modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu

TransposeConvolutionAlgorithm(
const Handle& handle,
const ConvolutionDescriptor<T>& conv,
const FilterDescriptor<T>& filter,
const TensorDescriptor<T>& input,
const TensorDescriptor<T>& output)
const ConvolutionDescriptor<T>& convDesc,
const FilterDescriptor<T>& filterDesc,
const TensorDescriptor<T>& inputDesc,
const TensorDescriptor<T>& outputDesc)
{
#if CUDNN_MAJOR >= 8
int requestedAlgoCount = 0, returnedAlgoCount = 0;
CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(requestedAlgoCount);
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionBackwardDataAlgorithm_v7(
handle.get(),
filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
requestedAlgoCount,
&returnedAlgoCount,
&results[0]
)
);

size_t free_memory, total_memory;
CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));

bool found_conv_algorithm = false;
for (int i = 0; i < returnedAlgoCount; i++)
{
if (results[i].status == CUDNN_STATUS_SUCCESS &&
results[i].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
results[i].memory < free_memory)
{
found_conv_algorithm = true;
dalgo = results[i].algo;
workspace_size = results[i].memory;
break;
}
}

if (!found_conv_algorithm)
CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for transpose convolution.");
#else
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionBackwardDataAlgorithm(
handle.get(),
filter.get(), input.get(), conv.get(), output.get(),
filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
0, /* no memory limit */
&dalgo
Expand All @@ -48,10 +82,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionBackwardDataWorkspaceSize(
handle.get(),
filter.get(), input.get(), conv.get(), output.get(),
filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
dalgo, &workspace_size
)
);
#endif
}

TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
Expand Down

0 comments on commit 8936224

Please sign in to comment.