Environment
OS: Ubuntu 18.04.5 LTS x86_64
CUDA: 10.0
SDK: Video_Codec_SDK_9.1.23
FFmpeg: N-101443-g74b5564fb5
gcc: 7
Python: 3.6
- Create a CUDA context.
- Query the decode capabilities of the hardware decoder.
- Create the decoder instance(s).
- De-Mux the content (like .mp4), e.g. using FFMPEG.
- Parse the video bitstream using a third-party parser like FFMPEG.
- Kick off the decoding using the NVDECODE API.
- Obtain the decoded YUV for further processing.
- Query the status of the decoded frame.
- Depending on the decoding status, use the decoded output for further processing like rendering, inferencing, post-processing, etc.
- Convert the decoded YUV surface to RGB.
- Destroy the decoder instance(s) after the decoding process completes.
- Destroy the CUDA context.
Note:
- CUDA context: holds all management data to control and use the devices. For instance, it holds the list of allocated memory, the loaded modules that contain device code, and the mapping between CPU and GPU memory.
- Bitstream: data found in a stream of bits used in digital communication or data storage applications.
- BSD: bit stream decoder.
- All NVDECODE API actions are exposed in two header files, cuviddec.h and nvcuvid.h, and the shared library libnvcuvid.so.
- The NVIDIA GPU-accelerated decoder pipeline consists of three major components: demultiplexing (FFMPEG), parsing (pure-SW FFMPEG), and decoding (GPU-accelerated).
- Before creating the decoder, we should query the capabilities of the GPU to ensure that the required functionality is supported.
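For example, decoder creation can be gated on cuvidGetDecoderCaps. The fragment below is a minimal sketch, assuming a current CUDA context on the calling thread; nWidth and nHeight are hypothetical coded dimensions of the stream to be decoded:
// Minimal sketch: validate 8-bit 4:2:0 HEVC support before creating a decoder.
// Assumes cuInit()/cuCtxCreate() have already run on this thread; nWidth and
// nHeight are hypothetical stream dimensions, not from the original sample.
CUVIDDECODECAPS decodeCaps = {};
decodeCaps.eCodecType = cudaVideoCodec_HEVC;
decodeCaps.eChromaFormat = cudaVideoChromaFormat_420;
decodeCaps.nBitDepthMinus8 = 0;
ck(cuvidGetDecoderCaps(&decodeCaps));
if (!decodeCaps.bIsSupported)
{
    std::cout << "HEVC 4:2:0 8-bit is not supported on this GPU" << std::endl;
    return;
}
if (nWidth > decodeCaps.nMaxWidth || nHeight > decodeCaps.nMaxHeight)
{
    std::cout << "Resolution exceeds the decoder limits" << std::endl;
    return;
}
// Safe to create a decoder instance with this configuration now.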
Common/AppDecUtils.h
static void createCudaContext(CUcontext* cuContext, int iGpu, unsigned int flags)
{
CUdevice cuDevice = 0;
ck(cuDeviceGet(&cuDevice, iGpu));
char szDeviceName[80];
ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
std::cout << "GPU in use: " << szDeviceName << std::endl;
ck(cuCtxCreate(cuContext, flags, cuDevice));
}
cuda.h -> cuDeviceGet(CUdevice *device, int ordinal): Returns in *device a device handle given an ordinal in the range [0, cuDeviceGetCount()-1].
cuda.h -> cuDeviceGetName(char *name, int len, CUdevice dev): Returns an identifier string for the device.
cuda.h -> cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev): Creates a new CUDA context and associates it with the calling thread.
main.cpp
#include "NvCodec/NvDecoder/NvDecoder.h"
#include "Utils/NvCodecUtils.h"
#include "Utils/FFmpegDemuxer.h"
#include "Common/AppDecUtils.h"
simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();
int main()
{
ck(cuInit(0));
int nGpu = 0;
ck(cuDeviceGetCount(&nGpu));
CUcontext cuContext = NULL;
int iGpu = 0;
createCudaContext(&cuContext, iGpu, 0);
ck(cuCtxDestroy(cuContext));
return 0;
}
cuda.h -> cuInit(unsigned int Flags): Initializes the driver API and must be called before any other function from the driver API.
cuda.h -> cuDeviceGetCount(int *count): Returns the number of compute-capable devices.
cuda.h -> cuCtxDestroy(CUcontext ctx): Destroys a CUDA context.
Common/AppDecUtils.h
static void ShowDecoderCapability()
{
ck(cuInit(0));
int nGpu = 0;
ck(cuDeviceGetCount(&nGpu));
std::cout << "Decoder Capability" << std::endl << std::endl;
const char *aszCodecName[] = {"JPEG", "MPEG1", "MPEG2", "MPEG4", "H264", "HEVC", "HEVC", "HEVC", "HEVC", "HEVC", "HEVC", "VC1", "VP8", "VP9", "VP9", "VP9"};
const char *aszChromaFormat[] = { "4:0:0", "4:2:0", "4:2:2", "4:4:4" };
char strOutputFormats[64];
cudaVideoCodec aeCodec[] = { cudaVideoCodec_JPEG, cudaVideoCodec_MPEG1, cudaVideoCodec_MPEG2, cudaVideoCodec_MPEG4, cudaVideoCodec_H264, cudaVideoCodec_HEVC,
cudaVideoCodec_HEVC, cudaVideoCodec_HEVC, cudaVideoCodec_HEVC, cudaVideoCodec_HEVC, cudaVideoCodec_HEVC, cudaVideoCodec_VC1, cudaVideoCodec_VP8,
cudaVideoCodec_VP9, cudaVideoCodec_VP9, cudaVideoCodec_VP9 };
int anBitDepthMinus8[] = {0, 0, 0, 0, 0, 0, 2, 4, 0, 2, 4, 0, 0, 0, 2, 4};
cudaVideoChromaFormat aeChromaFormat[] = { cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420,
cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_444, cudaVideoChromaFormat_444,
cudaVideoChromaFormat_444, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420 };
for (int iGpu = 0; iGpu < nGpu; iGpu++)
{
CUcontext cuContext = NULL;
CUdevice cuDevice = 0;
ck(cuDeviceGet(&cuDevice, iGpu));
char szDeviceName[80];
ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
std::cout << "GPU in use: " << szDeviceName << std::endl;
ck(cuCtxCreate(&cuContext, 0, cuDevice));
std::cout << "Codec BitDepth ChromaFormat Supported MaxWidth MaxHeight MaxMBCount MinWidth MinHeight SurfaceFormat" << std::endl;
for (int i = 0; i < sizeof(aeCodec) / sizeof(aeCodec[0]); i++)
{
CUVIDDECODECAPS decodeCaps = {};
decodeCaps.eCodecType = aeCodec[i];
decodeCaps.eChromaFormat = aeChromaFormat[i];
decodeCaps.nBitDepthMinus8 = anBitDepthMinus8[i];
cuvidGetDecoderCaps(&decodeCaps);
strOutputFormats[0] = '\0';
getOutputFormatNames(decodeCaps.nOutputFormatMask, strOutputFormats);
// setw() width = maximum_width_of_string + 2 spaces
std::cout << std::left << std::setw(std::string("Codec").length() + 2) << aszCodecName[i] <<
std::setw(std::string("BitDepth").length() + 2) << decodeCaps.nBitDepthMinus8 + 8 <<
std::setw(std::string("ChromaFormat").length() + 2) << aszChromaFormat[decodeCaps.eChromaFormat] <<
std::setw(std::string("Supported").length() + 2) << (int)decodeCaps.bIsSupported <<
std::setw(std::string("MaxWidth").length() + 2) << decodeCaps.nMaxWidth <<
std::setw(std::string("MaxHeight").length() + 2) << decodeCaps.nMaxHeight <<
std::setw(std::string("MaxMBCount").length() + 2) << decodeCaps.nMaxMBCount <<
std::setw(std::string("MinWidth").length() + 2) << decodeCaps.nMinWidth <<
std::setw(std::string("MinHeight").length() + 2) << decodeCaps.nMinHeight <<
std::setw(std::string("SurfaceFormat").length() + 2) << strOutputFormats << std::endl;
}
std::cout << std::endl;
ck(cuCtxDestroy(cuContext));
}
}
cuda.h -> cuInit(unsigned int Flags): Initializes the driver API and must be called before any other function from the driver API.
cuda.h -> cuDeviceGetCount(int *count): Returns the number of compute-capable devices.
cuda.h -> cuDeviceGet(CUdevice *device, int ordinal): Returns in *device a device handle given an ordinal in the range [0, cuDeviceGetCount()-1].
cuda.h -> cuDeviceGetName(char *name, int len, CUdevice dev): Returns an identifier string for the device.
cuda.h -> cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev): Creates a new CUDA context and associates it with the calling thread.
cuda.h -> cuCtxDestroy(CUcontext ctx): Destroys a CUDA context.
cuviddec.h -> cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc): Queries the decode capabilities of the NVDEC hardware based on the CodecType, ChromaFormat, and BitDepthMinus8 parameters.
main.cpp
#include "NvCodec/NvDecoder/NvDecoder.h"
#include "Utils/NvCodecUtils.h"
#include "Utils/FFmpegDemuxer.h"
#include "Common/AppDecUtils.h"
simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();
int main()
{
ShowDecoderCapability();
return 0;
}
Common/AppDecUtils.h -> ShowDecoderCapability(): Shows the decoder capabilities of all GPUs.
main.cpp
#include "NvCodec/NvDecoder/NvDecoder.h"
#include "Utils/NvCodecUtils.h"
#include "Utils/FFmpegDemuxer.h"
#include "Common/AppDecUtils.h"
simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();
int main()
{
ck(cuInit(0));
int nGpu = 0;
ck(cuDeviceGetCount(&nGpu));
CUcontext cuContext = NULL;
int iGpu = 0;
std::string szInFilePath = "/home/m/Documents/NVCODEC/your_name.mp4";
createCudaContext(&cuContext, iGpu, 0);
FFmpegDemuxer demuxer(szInFilePath.c_str());
NvDecoder dec(cuContext, false, FFmpeg2NvCodecId(demuxer.GetVideoCodec()), NULL, false, false);
LOG(INFO) << dec.GetVideoInfo();
std::vector<std::string> aszDecodeOutFormat = {"NV12", "P016", "YUV444", "YUV444P16"};
LOG(INFO) << "Output format: " << aszDecodeOutFormat[dec.GetOutputFormat()];
ck(cuCtxDestroy(cuContext));
return 0;
}
Utils/FFmpegDemuxer.h -> class FFmpegDemuxer: Provides functionality for stream demuxing.
NvCodec/NvDecoder/NvDecoder.h -> class NvDecoder: Base class for the decoder interface.
main.cpp
#include "NvCodec/NvDecoder/NvDecoder.h"
#include "Utils/NvCodecUtils.h"
#include "Utils/FFmpegDemuxer.h"
#include "Common/AppDecUtils.h"
simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();
int main()
{
ck(cuInit(0));
int nGpu = 0;
ck(cuDeviceGetCount(&nGpu));
CUcontext cuContext = NULL;
int iGpu = 0;
createCudaContext(&cuContext, iGpu, 0);
std::string szInFilePath = "/home/m/Documents/NVCODEC/your_name.mp4";
FFmpegDemuxer demuxer(szInFilePath.c_str());
NvDecoder dec(cuContext, false, FFmpeg2NvCodecId(demuxer.GetVideoCodec()), NULL, false, false);
int nVideoBytes = 0, nFrameReturned = 0, nFrame = 0;
uint8_t *pVideo = NULL, **ppFrame;
std::vector<std::string> aszDecodeOutFormat = {"NV12", "P016", "YUV444", "YUV444P16"};
do
{
demuxer.Demux(&pVideo, &nVideoBytes);
dec.Decode(pVideo, nVideoBytes, &ppFrame, &nFrameReturned);
if (!nFrame && nFrameReturned)
{
LOG(INFO) << dec.GetVideoInfo();
LOG(INFO) << "Output format: " << aszDecodeOutFormat[dec.GetOutputFormat()];
}
nFrame += nFrameReturned;
} while (nVideoBytes);
ck(cuCtxDestroy(cuContext));
LOG(INFO) << "End of process";
return 0;
}
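To keep the decoded frames instead of only counting them, the loop body can append each returned surface to a raw NV12 file. A minimal sketch (out.yuv is an illustrative path; it also needs #include <fstream>, and GetFrameSize() is the per-frame size helper of the sample NvDecoder class):
// Before the loop: open an output file for the raw NV12 surfaces.
std::ofstream fpOut("out.yuv", std::ios::out | std::ios::binary);
// Inside the loop, after dec.Decode(...): append every returned frame.
for (int i = 0; i < nFrameReturned; i++)
{
    fpOut.write(reinterpret_cast<char *>(ppFrame[i]), dec.GetFrameSize());
}
The resulting file can then be played back with any raw-YUV-capable player, given the width, height, and NV12 pixel format.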
main.cpp
#include "NvCodec/NvDecoder/NvDecoder.h"
#include "Utils/NvCodecUtils.h"
#include "Utils/FFmpegDemuxer.h"
#include "Common/AppDecUtils.h"
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();
int main()
{
ShowDecoderCapability();
std::string szInFilePath = "/home/m/Documents/NVCODEC/your_name.mp4";
ck(cuInit(0));
int nGpu = 0;
ck(cuDeviceGetCount(&nGpu));
CUcontext cuContext = NULL;
int iGpu = 0;
createCudaContext(&cuContext, iGpu, 0);
FFmpegDemuxer demuxer(szInFilePath.c_str());
NvDecoder dec(cuContext, false, FFmpeg2NvCodecId(demuxer.GetVideoCodec()), NULL, false, false);
int nVideoBytes = 0, nFrameReturned = 0, nFrame = 0;
uint8_t *pVideo = NULL, **ppFrame;
cv::Mat mat_bgr;
std::vector<std::string> aszDecodeOutFormat = {"NV12", "P016", "YUV444", "YUV444P16"};
cv::namedWindow("a", cv::WINDOW_NORMAL);
do
{
demuxer.Demux(&pVideo, &nVideoBytes);
dec.Decode(pVideo, nVideoBytes, &ppFrame, &nFrameReturned);
if (!nFrame && nFrameReturned)
{
LOG(INFO) << dec.GetVideoInfo();
LOG(INFO) << "Output format: " << aszDecodeOutFormat[dec.GetOutputFormat()];
}
if (nFrameReturned < 1)
continue;
nFrame += nFrameReturned;
// NVDEC's 8-bit output is NV12: a full-resolution Y plane followed by an
// interleaved half-resolution UV plane, hence height * 3 / 2 rows.
// Only the last frame of the returned batch is displayed here.
cv::Mat mat_yuv = cv::Mat(dec.GetHeight() * 3 / 2, dec.GetWidth(), CV_8UC1, ppFrame[nFrameReturned - 1]);
cv::cvtColor(mat_yuv, mat_bgr, cv::COLOR_YUV2BGR_NV12);
cv::imshow("a", mat_bgr);
char c = (char)cv::waitKey(25);
if (c == 27)
break;
} while (nVideoBytes);
ck(cuCtxDestroy(cuContext));
LOG(INFO) << "End of process";
return 0;
}
This sample application demonstrates the low-latency decoding feature, which delivers each output frame as soon as it is decoded, without any delay. The feature only works for streams containing I and P frames.
[cited]
In the field of video compression, a video frame is compressed using different algorithms with different advantages and disadvantages, centered mainly around the amount of data compression. These different algorithms for video frames are called picture types or frame types. The three major picture types used in the different video algorithms are I, P, and B. They differ in the following characteristics:
- I-frames are the least compressible but don't require other video frames to decode.
- P-frames can use data from previous frames to decompress and are more compressible than I-frames.
- B-frames can use both previous and forward frames for data reference to get the highest amount of data compression.
main.cpp
#include <iostream>
#include <algorithm>
#include <thread>
#include <cuda.h>
#include "NvCodec/NvDecoder/NvDecoder.h"
#include "Utils/NvCodecUtils.h"
#include "Utils/FFmpegDemuxer.h"
#include "Common/AppDecUtils.h"
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <vector>
#include <string.h>
int main()
{
ShowDecoderCapability();
std::string szInFilePath = "/home/m/Documents/NVCODEC/your_name.mp4";
try
{
ck(cuInit(0));
int nGpu = 0;
int iGpu = 0;
ck(cuDeviceGetCount(&nGpu));
if (iGpu < 0 || iGpu >= nGpu)
{
std::ostringstream err;
err << "GPU ordinal out of range. Should be within [" << 0 << ", " << nGpu - 1 << "]" << std::endl;
throw std::invalid_argument(err.str());
}
CUcontext cuContext = NULL;
createCudaContext(&cuContext, iGpu, 0);
FFmpegDemuxer demuxer(szInFilePath.c_str());
// Here set bLowLatency=true in the constructor.
// Don't use this flag except for low-latency use cases; it is harder to get
// 100% utilization of the hardware decoder with this flag set.
NvDecoder dec(cuContext, false, FFmpeg2NvCodecId(demuxer.GetVideoCodec()), NULL, true);
int nVideoBytes = 0, nFrameReturned = 0, nFrame = 0, n = 0;
uint8_t *pVideo = NULL, **ppFrame;
int64_t *pTimestamp;
cv::Mat mat_bgr;
std::vector<std::string> aszDecodeOutFormat = {"NV12", "P016", "YUV444", "YUV444P16"};
cv::namedWindow("a", cv::WINDOW_NORMAL);
do
{
demuxer.Demux(&pVideo, &nVideoBytes);
// Set flag CUVID_PKT_ENDOFPICTURE to signal that a complete packet has been sent to decode
dec.Decode(pVideo, nVideoBytes, &ppFrame, &nFrameReturned, CUVID_PKT_ENDOFPICTURE, &pTimestamp, n++);
if (!nFrame && nFrameReturned)
{
LOG(INFO) << dec.GetVideoInfo();
LOG(INFO) << "Output format: " << aszDecodeOutFormat[dec.GetOutputFormat()];
}
if (nFrameReturned < 1)
continue;
nFrame += nFrameReturned;
cv::Mat mat_yuv = cv::Mat(dec.GetHeight() * 3 / 2, dec.GetWidth(), CV_8UC1, ppFrame[nFrameReturned - 1]);
// Convert the NV12 surface directly to BGR for display.
cv::cvtColor(mat_yuv, mat_bgr, cv::COLOR_YUV2BGR_NV12);
cv::imshow("a", mat_bgr);
char c = (char)cv::waitKey(25);
if (c == 27)
break;
// For a stream without B-frames, "one in and one out" is expected, and nFrameReturned should be always 1 for each input packet
LOG(INFO) << "Decode: nVideoBytes=" << std::setw(10) << nVideoBytes
<< ", nFrameReturned=" << std::setw (10) << nFrameReturned
<< ", total=" << std::setw(10) << nFrame;
for (int i = 0; i < nFrameReturned; i++)
{
LOG(INFO) << "Timestamp: " << pTimestamp [i];
}
} while (nVideoBytes);
}
catch (const std::exception &ex)
{
LOG(ERROR) << ex.what();
exit(1);
}
return 0;
}
The NVIDIA video encoder API is designed to accept raw video frames (in YUV or RGB format) and output an H.264 or HEVC bitstream.
- Initialize the encoder.
- Set up the desired encoding parameters.
- Allocate input/output buffers.
- Copy frames to input buffers and read the bitstream from the output buffers. This can be done synchronously (Windows & Linux) or asynchronously (Windows 7 and above only).
- Clean up: release all allocated input/output buffers.
- Close the encoding session.
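To make these steps concrete, here is a rough sketch of an encode loop built on the NvEncoderCuda helper class shipped with the SDK samples. The file paths, resolution parameters, and the EncodeSketch wrapper are illustrative assumptions, not part of the original samples:
#include <fstream>
#include <vector>
#include "NvEncoder/NvEncoderCuda.h"
#include "Utils/NvCodecUtils.h"
// Sketch: encode raw NV12 frames from in.yuv into an H.264 elementary stream.
// in.yuv and out.h264 are hypothetical paths; cuContext is an existing context.
void EncodeSketch(CUcontext cuContext, int nWidth, int nHeight)
{
    NvEncoderCuda enc(cuContext, nWidth, nHeight, NV_ENC_BUFFER_FORMAT_NV12);
    // Set up the desired encoding parameters (H.264 defaults here).
    NV_ENC_INITIALIZE_PARAMS initializeParams = { NV_ENC_INITIALIZE_PARAMS_VER };
    NV_ENC_CONFIG encodeConfig = { NV_ENC_CONFIG_VER };
    initializeParams.encodeConfig = &encodeConfig;
    enc.CreateDefaultEncoderParams(&initializeParams, NV_ENC_CODEC_H264_GUID, NV_ENC_PRESET_DEFAULT_GUID);
    enc.CreateEncoder(&initializeParams);
    std::ifstream fpIn("in.yuv", std::ios::in | std::ios::binary);
    std::ofstream fpOut("out.h264", std::ios::out | std::ios::binary);
    int nFrameSize = enc.GetFrameSize();
    std::vector<uint8_t> vHostFrame(nFrameSize);
    std::vector<std::vector<uint8_t>> vPacket;
    while (fpIn.read(reinterpret_cast<char *>(vHostFrame.data()), nFrameSize))
    {
        // Copy the raw frame into the next available input buffer.
        const NvEncInputFrame *encoderInputFrame = enc.GetNextInputFrame();
        NvEncoderCuda::CopyToDeviceFrame(cuContext, vHostFrame.data(), 0,
            (CUdeviceptr)encoderInputFrame->inputPtr, (int)encoderInputFrame->pitch,
            enc.GetEncodeWidth(), enc.GetEncodeHeight(), CU_MEMORYTYPE_HOST,
            encoderInputFrame->bufferFormat, encoderInputFrame->chromaOffsets,
            encoderInputFrame->numChromaPlanes);
        // Encode the frame and write out any packets that became available.
        enc.EncodeFrame(vPacket);
        for (const std::vector<uint8_t> &packet : vPacket)
            fpOut.write(reinterpret_cast<const char *>(packet.data()), packet.size());
    }
    // Flush the encoder, drain the remaining packets, and release all buffers.
    enc.EndEncode(vPacket);
    for (const std::vector<uint8_t> &packet : vPacket)
        fpOut.write(reinterpret_cast<const char *>(packet.data()), packet.size());
    enc.DestroyEncoder();
}
Note that EncodeFrame may return zero packets for the first few frames while the encoder pipeline fills; EndEncode flushes whatever is still buffered.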
Note:
- Bitstream: data found in a stream of bits used in digital communication or data storage applications.
- Before creating the encoder, we should query the capabilities of the GPU to ensure that the required functionality is supported.
void ShowEncoderCapability()
{
ck(cuInit(0));
int nGpu = 0;
ck(cuDeviceGetCount(&nGpu));
std::cout << "Encoder Capability" << std::endl
<< std::endl;
for (int iGpu = 0; iGpu < nGpu; iGpu++)
{
CUdevice cuDevice = 0;
ck(cuDeviceGet(&cuDevice, iGpu));
char szDeviceName[80];
ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
CUcontext cuContext = NULL;
ck(cuCtxCreate(&cuContext, 0, cuDevice));
NvEncoderCuda enc(cuContext, 1280, 720, NV_ENC_BUFFER_FORMAT_NV12);
std::cout << "GPU " << iGpu << " - " << szDeviceName << std::endl
<< std::endl;
std::cout << "\tH264:\t\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_H264_GUID, NV_ENC_CAPS_SUPPORTED_RATECONTROL_MODES) ? "yes" : "no") << std::endl
<< "\tH264_444:\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_H264_GUID, NV_ENC_CAPS_SUPPORT_YUV444_ENCODE) ? "yes" : "no") << std::endl
<< "\tH264_ME:\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_H264_GUID, NV_ENC_CAPS_SUPPORT_MEONLY_MODE) ? "yes" : "no") << std::endl
<< "\tH264_WxH:\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_H264_GUID, NV_ENC_CAPS_WIDTH_MAX)) << "*" << (enc.GetCapabilityValue(NV_ENC_CODEC_H264_GUID, NV_ENC_CAPS_HEIGHT_MAX)) << std::endl
<< "\tHEVC:\t\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_HEVC_GUID, NV_ENC_CAPS_SUPPORTED_RATECONTROL_MODES) ? "yes" : "no") << std::endl
<< "\tHEVC_Main10:\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_HEVC_GUID, NV_ENC_CAPS_SUPPORT_10BIT_ENCODE) ? "yes" : "no") << std::endl
<< "\tHEVC_Lossless:\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_HEVC_GUID, NV_ENC_CAPS_SUPPORT_LOSSLESS_ENCODE) ? "yes" : "no") << std::endl
<< "\tHEVC_SAO:\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_HEVC_GUID, NV_ENC_CAPS_SUPPORT_SAO) ? "yes" : "no") << std::endl
<< "\tHEVC_444:\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_HEVC_GUID, NV_ENC_CAPS_SUPPORT_YUV444_ENCODE) ? "yes" : "no") << std::endl
<< "\tHEVC_ME:\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_HEVC_GUID, NV_ENC_CAPS_SUPPORT_MEONLY_MODE) ? "yes" : "no") << std::endl
<< "\tHEVC_WxH:\t"
<< " " << (enc.GetCapabilityValue(NV_ENC_CODEC_HEVC_GUID, NV_ENC_CAPS_WIDTH_MAX)) << "*" << (enc.GetCapabilityValue(NV_ENC_CODEC_HEVC_GUID, NV_ENC_CAPS_HEIGHT_MAX)) << std::endl;
std::cout << std::endl;
enc.DestroyEncoder();
ck(cuCtxDestroy(cuContext));
}
}
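main.cpp
Mirroring the decoder section, a minimal driver that prints these capabilities could look like the following sketch; it assumes ShowEncoderCapability() above is in scope and the include paths match the SDK sample layout:
#include "NvEncoder/NvEncoderCuda.h"
#include "Utils/NvCodecUtils.h"
simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();
int main()
{
    ShowEncoderCapability();
    return 0;
}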