From 27a10811afb4f2f9c5404d02b1056391f14f4b1a Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Tue, 22 Aug 2023 18:56:33 +0800 Subject: [PATCH] Benchmarks: micro benchmark - source code for evaluating NVDEC decoding performance (#560) **Description** source code for evaluating NVDEC decoding performance. --------- Co-authored-by: yukirora --- .azure-pipelines/cuda-unit-test.yml | 6 +- .github/workflows/codeql-analysis.yml | 4 + .gitignore | 3 - dockerfile/cuda11.1.1.dockerfile | 5 + dockerfile/cuda12.1.dockerfile | 5 + .../cuda_decode_performance/AppDecPerf.cpp | 454 +++++++ .../cuda_decode_performance/CMakeLists.txt | 117 ++ .../OptimizedNvDecoder.cpp | 263 ++++ .../OptimizedNvDecoder.h | 52 + .../cuda_decode_performance/ThreadPoolUtils.h | 99 ++ .../Video_Codec_SDK/Interface/cuviddec.h | 1173 +++++++++++++++++ .../Video_Codec_SDK/Interface/nvcuvid.h | 486 +++++++ .../Lib/linux/stubs/x86_64/libnvcuvid.so | Bin 0 -> 3528 bytes .../Samples/NvCodec/NvDecoder/NvDecoder.cpp | 709 ++++++++++ .../Samples/NvCodec/NvDecoder/NvDecoder.h | 528 ++++++++ .../Samples/Utils/FFmpegDemuxer.h | 379 ++++++ .../Samples/Utils/FFmpegStreamer.h | 148 +++ .../Video_Codec_SDK/Samples/Utils/Logger.h | 235 ++++ .../Samples/Utils/NvCodecUtils.h | 547 ++++++++ 19 files changed, 5208 insertions(+), 5 deletions(-) create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h create mode 100644 third_party/Video_Codec_SDK/Interface/cuviddec.h create mode 100644 third_party/Video_Codec_SDK/Interface/nvcuvid.h create mode 100644 third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so create mode 100644 third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp create mode 100644 third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/Logger.h create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml index 3afcd49f..2d953d65 100644 --- a/.azure-pipelines/cuda-unit-test.yml +++ b/.azure-pipelines/cuda-unit-test.yml @@ -11,7 +11,7 @@ pool: container: image: nvcr.io/nvidia/pytorch:20.12-py3 - options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker' + options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/' steps: - script: | @@ -21,6 +21,8 @@ steps: python3 -m pip install --upgrade pip setuptools==65.7 python3 -m pip install .[test,nvworker] make postinstall + sudo DEBIAN_FRONTEND=noninteractive apt-get update + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev displayName: Install dependencies - script: | python3 setup.py lint @@ -31,7 +33,7 @@ steps: - script: | SB_MICRO_PATH=$PWD python3 setup.py test displayName: Run unit tests - 
timeoutInMinutes: 15 + timeoutInMinutes: 30 - script: | bash <(curl -s https://codecov.io/bash) -cF cuda-unit-test displayName: Report coverage results diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index ef9f652b..e53acebf 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -49,6 +49,10 @@ jobs: steps: - name: Checkout uses: actions/checkout@v3 + - name: Install Dependency + run: | + DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo - name: Initialize CodeQL uses: github/codeql-action/init@v2 with: diff --git a/.gitignore b/.gitignore index e1ab18ca..5888455a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,9 +9,6 @@ __pycache__/ *.py[cod] *$py.class -# C extensions -*.so - # Distribution / packaging .Python build/ diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 8b92c546..d7feb2ba 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -26,13 +26,18 @@ RUN apt-get update && \ build-essential \ curl \ dmidecode \ + ffmpeg \ git \ iproute2 \ jq \ libaio-dev \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ libcap2 \ libnuma-dev \ libpci-dev \ + libswresample-dev \ libtinfo5 \ libtool \ lshw \ diff --git a/dockerfile/cuda12.1.dockerfile b/dockerfile/cuda12.1.dockerfile index 4a257bf4..2f9e430f 100644 --- a/dockerfile/cuda12.1.dockerfile +++ b/dockerfile/cuda12.1.dockerfile @@ -25,14 +25,19 @@ RUN apt-get update && \ build-essential \ curl \ dmidecode \ + ffmpeg \ git \ iproute2 \ jq \ libaio-dev \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ libboost-program-options-dev \ libcap2 \ libnuma-dev \ libpci-dev \ + libswresample-dev \ libtinfo5 \ libtool \ lshw \ diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp new file mode 100644 index 00000000..1ae5ae12 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp @@ -0,0 +1,454 @@ +// Copyright(c) Microsoft Corporation. +// Licensed under the MIT License. 
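+
+// AppDecPerf: demuxes input videos with FFmpeg, decodes them with NVDEC on a pool of
+// worker threads, and reports per-video and per-frame latency and FPS metrics.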
+
+#include <algorithm>
+#include <cctype>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "../Utils/FFmpegDemuxer.h"
+#include "../Utils/NvCodecUtils.h"
+#include "OptimizedNvDecoder.h"
+#include "ThreadPoolUtils.h"
+
+// Define the logger needed by the third-party utils
+simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();
+
+// Define the codec map
+std::map<std::string, cudaVideoCodec> codecMap = {
+    {"mpeg1", cudaVideoCodec_MPEG1},       {"mpeg2", cudaVideoCodec_MPEG2},       {"mpeg4", cudaVideoCodec_MPEG4},
+    {"vc1", cudaVideoCodec_VC1},           {"h264", cudaVideoCodec_H264},         {"jpeg", cudaVideoCodec_JPEG},
+    {"h264_svc", cudaVideoCodec_H264_SVC}, {"h264_mvc", cudaVideoCodec_H264_MVC}, {"hevc", cudaVideoCodec_HEVC},
+    {"vp8", cudaVideoCodec_VP8},           {"vp9", cudaVideoCodec_VP9},           {"av1", cudaVideoCodec_AV1}};
+
+/**
+ * @brief Function to decode a video file using the OptimizedNvDecoder interface
+ * @param pDec - Handle to OptimizedNvDecoder
+ * @param szInFilePath - Path of the video file to demux and decode
+ * @param pnFrame - Variable to record the number of frames decoded
+ * @param ex - Stores current exception in case of failure
+ */
+void DecProc(OptimizedNvDecoder *pDec, const char *szInFilePath, int *pnFrame, std::exception_ptr &ex) {
+    try {
+        std::unique_ptr<FFmpegDemuxer> demuxer(new FFmpegDemuxer(szInFilePath));
+        int nVideoBytes = 0, nFrameReturned = 0, nFrame = 0;
+        uint8_t *pVideo = NULL, *pFrame = NULL;
+        do {
+            // Demux video from file using FFmpegDemuxer
+            demuxer->Demux(&pVideo, &nVideoBytes);
+            // Decode the video frame from demuxed packet
+            nFrameReturned = pDec->Decode(pVideo, nVideoBytes);
+            if (!nFrame && nFrameReturned)
+                LOG(INFO) << pDec->GetVideoInfo();
+            nFrame += nFrameReturned;
+        } while (nVideoBytes);
+        *pnFrame = nFrame;
+    } catch (std::exception &) {
+        ex = std::current_exception();
+    }
+}
+
+/**
+ * @brief Function to show help message and exit
+ */
+void ShowHelpAndExit(const char *szBadOption = NULL) {
+    std::ostringstream oss;
+    bool bThrowError = false;
+    if (szBadOption) {
+        bThrowError = true;
+        oss << "Error parsing \"" << szBadOption << "\"" << std::endl;
+    }
+    oss << "Options:" << std::endl
+        << "-i           Input file path. No default value. One of -i and -multi_input is required." << std::endl
+        << "-o           Output file path of raw data. No default value. Optional." << std::endl
+        << "-gpu         Ordinal of GPU to use. Default 0. Optional." << std::endl
+        << "-thread      Number of decoding threads. Default 5. Optional." << std::endl
+        << "-total       Total number of videos to test. Default 100. Optional." << std::endl
+        << "-single      (No value) Use a single CUDA context for every thread. Default is multi-context, one "
+           "context per thread."
+        << std::endl
+        << "-host        (No value) Copy frames to host memory. Default is device memory." << std::endl
+        << "-multi_input The file path which lists one input video path per line." << std::endl
+        << "-codec       The codec of video to test. Default H264." << std::endl;
+    if (bThrowError) {
+        throw std::invalid_argument(oss.str());
+    } else {
+        std::cout << oss.str();
+        exit(0);
+    }
+}
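+
+// Illustrative invocation (file name and flag values are examples only; the binary name
+// comes from the CMake project name):
+//   ./cuda_decode_performance -i input.mp4 -gpu 0 -thread 5 -total 100 -codec h264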
+
+/**
+ * @brief Function to parse commandline arguments
+ */
+void ParseCommandLine(int argc, char *argv[], char *szInputFileName, int &iGpu, int &nThread, int &nTotalVideo,
+                      bool &bSingle, bool &bHost, std::string &inputFilesListPath, std::string &outputFile,
+                      cudaVideoCodec &codec) {
+    for (int i = 1; i < argc; i++) {
+        if (!_stricmp(argv[i], "-h")) {
+            ShowHelpAndExit();
+        }
+        if (!_stricmp(argv[i], "-i")) {
+            if (++i == argc) {
+                ShowHelpAndExit("-i");
+            }
+            sprintf(szInputFileName, "%s", argv[i]);
+            continue;
+        }
+        if (!_stricmp(argv[i], "-o")) {
+            if (++i == argc) {
+                ShowHelpAndExit("-o");
+            }
+            outputFile = std::string(argv[i]);
+            continue;
+        }
+        if (!_stricmp(argv[i], "-gpu")) {
+            if (++i == argc) {
+                ShowHelpAndExit("-gpu");
+            }
+            iGpu = atoi(argv[i]);
+            continue;
+        }
+        if (!_stricmp(argv[i], "-thread")) {
+            if (++i == argc) {
+                ShowHelpAndExit("-thread");
+            }
+            nThread = atoi(argv[i]);
+            continue;
+        }
+        if (!_stricmp(argv[i], "-total")) {
+            if (++i == argc) {
+                ShowHelpAndExit("-total");
+            }
+            nTotalVideo = atoi(argv[i]);
+            continue;
+        }
+        if (!_stricmp(argv[i], "-multi_input")) {
+            if (++i == argc) {
+                ShowHelpAndExit("-multi_input");
+            }
+            inputFilesListPath = std::string(argv[i]);
+            continue;
+        }
+        if (!_stricmp(argv[i], "-single")) {
+            bSingle = true;
+            continue;
+        }
+        if (!_stricmp(argv[i], "-host")) {
+            bHost = true;
+            continue;
+        }
+        if (!_stricmp(argv[i], "-codec")) {
+            if (++i == argc) {
+                ShowHelpAndExit("-codec");
+            }
+            std::string codecName = std::string(argv[i]);
+            std::transform(codecName.begin(), codecName.end(), codecName.begin(),
+                           [](unsigned char c) { return std::tolower(c); });
+            if (codecMap.find(codecName) != codecMap.end()) {
+                codec = codecMap[codecName];
+            } else {
+                std::cout << "Codec name not found in the map." << std::endl;
+                exit(1);
+            }
+            continue;
+        }
+        ShowHelpAndExit(argv[i]);
+    }
+}
+
+/**
+ * @brief Function to create the CUDA context and initialize the decoder
+ */
+OptimizedNvDecoder *InitOptimizedNvDecoder(int i, const CUdevice &cuDevice, CUcontext &cuContext, bool bSingle,
+                                           bool bHost, cudaVideoCodec codec, CUVIDDECODECAPS decodecaps) {
+    if (!bSingle) {
+        ck(cuCtxCreate(&cuContext, 0, cuDevice));
+    }
+    OptimizedNvDecoder *sessionObject = new OptimizedNvDecoder(cuContext, !bHost, codec, decodecaps);
+    sessionObject->setDecoderSessionID(i);
+    return sessionObject;
+}
+
+/**
+ * @brief Function to decode a video in a thread and measure the latency
+ */
+double DecodeVideo(size_t i, const std::vector<OptimizedNvDecoder *> &vDec, const char *szInFilePath, int *pnFrame,
+                   std::exception_ptr &ex) {
+    try {
+        OptimizedNvDecoder *pDec = vDec[i];
+        auto start = std::chrono::high_resolution_clock::now();
+        DecProc(pDec, szInFilePath, pnFrame, ex);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto elapsedTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+        std::cout << "Decode finished --"
+                  << " duration:" << elapsedTime << " frames:" << *pnFrame << std::endl;
+        return elapsedTime / 1000.0f;
+    } catch (const std::exception &e) {
+        std::cerr << "Exception in decoding: " << e.what() << std::endl;
+        return 0;
+    }
+}
+
+/**
+ * @brief Function to read the video paths from a file
+ */
+std::vector<std::string> ReadMultipleVideoFiles(const std::string &filepath) {
+    std::ifstream file(filepath);
+    if (!file) {
+        std::cerr << "Error opening the file." << std::endl;
+        exit(1);
+    }
+    std::string line;
+    std::vector<std::string> tokens;
+    while (std::getline(file, line)) {
+        tokens.push_back(line);
+    }
+    file.close();
+    return tokens;
+}
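+
+// Illustrative -multi_input list file: plain text with one video path per line, e.g.
+//   /data/videos/clip_0.mp4
+//   /data/videos/clip_1.mp4
+// (paths are examples; each listed file is checked via CheckInputFile before decoding)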
+
+/**
+ * @brief Function to get the decoder capability
+ */
+void GetDefaultDecoderCaps(CUVIDDECODECAPS &decodecaps, cudaVideoCodec codec) {
+    memset(&decodecaps, 0, sizeof(decodecaps));
+    decodecaps.eCodecType = codec;
+    decodecaps.eChromaFormat = cudaVideoChromaFormat_420;
+    decodecaps.nBitDepthMinus8 = 0;
+    NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps));
+}
+
+/**
+ * @brief Function to initialize the CUDA device and context, query the decoder capability and create a decoder for
+ * each thread
+ */
+void InitializeContext(std::vector<OptimizedNvDecoder *> &vDec, int iGpu, int nThread, bool bSingle, bool bHost,
+                       cudaVideoCodec codec) {
+    ck(cuInit(0));
+    int nGpu = 0;
+    ck(cuDeviceGetCount(&nGpu));
+    if (iGpu < 0 || iGpu >= nGpu) {
+        std::cout << "GPU ordinal out of range. Should be within [" << 0 << ", " << nGpu - 1 << "]" << std::endl;
+        exit(1);
+    }
+    CUdevice cuDevice = 0;
+    ck(cuDeviceGet(&cuDevice, iGpu));
+    char szDeviceName[80];
+    ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
+    std::cout << "GPU in use: " << szDeviceName << std::endl;
+
+    CUcontext cuContext = NULL;
+    ck(cuCtxCreate(&cuContext, 0, cuDevice));
+
+    CUVIDDECODECAPS decodecaps;
+    GetDefaultDecoderCaps(decodecaps, codec);
+
+    ThreadPool threadPool(nThread);
+    std::vector<std::future<OptimizedNvDecoder *>> futures;
+    for (int i = 0; i < nThread; i++) {
+        futures.push_back(
+            threadPool.enqueue(InitOptimizedNvDecoder, cuDevice, cuContext, bSingle, bHost, codec, decodecaps));
+    }
+    for (auto &future : futures) {
+        vDec.push_back(future.get()); // Retrieve the results from each task
+    }
+}
+
+/**
+ * @brief Function to write the latency and FPS data of each video to a file
+ */
+void WriteRawData(std::vector<OptimizedNvDecoder *> &vDec, int nThread, const std::vector<double> &data,
+                  std::vector<int> &frames, std::string filename) {
+    // Open the output file stream
+    std::ofstream outputFile(filename);
+    outputFile << "Frame Latency" << std::endl;
+    for (int i = 0; i < nThread; i++) {
+        for (const auto &tuple : vDec[i]->GetFrameLatency()) {
+            int frame = std::get<0>(tuple);
+            double latency = std::get<1>(tuple);
+            outputFile << "Frame: " << frame << ", Latency: " << latency << std::endl;
+        }
+    }
+    outputFile << "Video Latency" << std::endl;
+    for (int i = 0; i < data.size(); i++) {
+        outputFile << data[i] << std::endl;
+    }
+    outputFile << "Video FPS" << std::endl;
+    for (int i = 0; i < data.size(); i++) {
+        outputFile << frames[i] / data[i] << std::endl;
+    }
+
+    // Close the file stream
+    outputFile.close();
+}
+
+/**
+ * @brief Function to calculate the statistical metrics
+ */
+std::tuple<double, double, double, double, double, double, double, double>
+CalMetrics(const std::vector<double> &originData) {
+    std::vector<double> data = originData;
+    double sum = std::accumulate(data.begin(), data.end(), 0.0);
+    double mean = sum / data.size();
+    double min = *std::min_element(data.begin(), data.end());
+    double max = *std::max_element(data.begin(), data.end());
+    std::sort(data.begin(), data.end());
+    double p50 = data[data.size() / 2];
+    double p90 = data[static_cast<size_t>(data.size() * 0.9)];
+    double p95 = data[static_cast<size_t>(data.size() * 0.95)];
+    double p99 = data[static_cast<size_t>(data.size() * 0.99)];
+    return std::make_tuple(sum, mean, min, max, p50, p90, p95, p99);
+}
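+
+// Worked example for CalMetrics: with 100 sorted samples, p50 = data[50], p90 = data[90],
+// p95 = data[95] and p99 = data[99], i.e. nearest-rank percentiles without interpolation.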
+
+/**
+ * @brief Function to generate the full file list for the given total number of videos.
+ * If the input list contains fewer videos than the requested total, the list is repeated;
+ * if it contains more, the list is truncated.
+ */
+std::vector<std::string> GenerateTotalFileList(const std::string &inputFilesListPath, int nTotalVideo,
+                                               const char *szInFilePath) {
+    std::vector<std::string> files;
+    if (inputFilesListPath.size() != 0) {
+        auto videofiles = ReadMultipleVideoFiles(inputFilesListPath);
+        int smallerSize = videofiles.size();
+
+        if (nTotalVideo > smallerSize) {
+            int numIterations = nTotalVideo / smallerSize;
+
+            for (int i = 0; i < numIterations; i++) {
+                files.insert(files.end(), videofiles.begin(), videofiles.end());
+            }
+
+            int remainingElements = nTotalVideo - (numIterations * smallerSize);
+            files.insert(files.end(), videofiles.begin(), videofiles.begin() + remainingElements);
+        } else {
+            files = std::vector<std::string>(videofiles.begin(), videofiles.begin() + nTotalVideo);
+        }
+
+        std::cout << "Multifile mode - " << nTotalVideo << " videos will be decoded" << std::endl;
+    } else {
+        for (int i = 0; i < nTotalVideo; i++) {
+            files.push_back(std::string(szInFilePath));
+        }
+    }
+    return files;
+}
+
+/**
+ * @brief Function to run the decoding tasks in parallel with a thread pool, decode all the videos and record the
+ * total latency and the total number of frames
+ */
+float run(std::vector<OptimizedNvDecoder *> &vDec, int nThread, std::vector<std::string> &files,
+          std::vector<int> &vnFrame, std::vector<std::exception_ptr> &vExceptionPtrs, int *nTotalFrames,
+          std::vector<double> &vnLatency, std::vector<double> &frLatency, std::vector<double> &vnFPS) {
+    std::vector<std::future<double>> decodeLatencyFutures;
+    ThreadPool threadPool(nThread);
+    // Enqueue the video decoding task into thread pool
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < files.size(); i++) {
+        auto filePath = files[i].c_str();
+        CheckInputFile(filePath);
+        decodeLatencyFutures.push_back(
+            threadPool.enqueue(DecodeVideo, vDec, filePath, &vnFrame[i], std::ref(vExceptionPtrs[i])));
+    }
+    // Wait until decoding tasks finished
+    for (int i = 0; i < files.size(); i++) {
+        auto decodeLatency = decodeLatencyFutures[i].get();
+        vnLatency.push_back(decodeLatency);
+        *nTotalFrames += vnFrame[i];
+    }
+    auto elapsedTime =
+        (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start)
+             .count()) /
+        1000.0f;
+    for (int i = 0; i < nThread; i++) {
+        for (const auto &tuple : vDec[i]->GetFrameLatency()) {
+            int frame = std::get<0>(tuple);
+            double latency = std::get<1>(tuple);
+            if (frame > 0) {
+                frLatency.push_back(latency / frame);
+            }
+        }
+    }
+    for (int i = 0; i < vnLatency.size(); i++) {
+        if (vnLatency[i] != 0) {
+            vnFPS.push_back(vnFrame[i] / vnLatency[i]);
+        }
+    }
+
+    // Record the total time
+    return elapsedTime;
+}
+
+int main(int argc, char **argv) {
+    char szInFilePath[256] = "";
+    int iGpu = 0;
+    int nThread = 5;
+    int nTotalVideo = 100;
+    bool bSingle = false;
+    bool bHost = false;
+    std::string inputFilesListPath = "";
+    std::string outputFilePath = "";
+    std::vector<std::exception_ptr> vExceptionPtrs(nTotalVideo);
+    cudaVideoCodec codec = cudaVideoCodec_H264;
+    try {
+        // Parse the command line arguments
+        ParseCommandLine(argc, argv, szInFilePath, iGpu, nThread, nTotalVideo, bSingle, bHost, inputFilesListPath,
+                         outputFilePath, codec);
+        auto files = GenerateTotalFileList(inputFilesListPath, nTotalVideo, szInFilePath);
+
+        // Initialize and prepare the decoder context for each thread
+        std::vector<OptimizedNvDecoder *> vDec;
+        InitializeContext(vDec, iGpu, nThread, bSingle, bHost, codec);
+
+        // Decode all video with thread pool
+        std::vector<int> vnFrame(nTotalVideo);
+        int nTotalFrames = 0;
+        std::vector<double> vnLatency;
+        std::vector<double> frLatency;
+        std::vector<double>
videoFPS; + auto elapsedTime = + run(vDec, nThread, files, vnFrame, vExceptionPtrs, &nTotalFrames, vnLatency, frLatency, videoFPS); + + // Calculate and output the raw data into file and metrics into stdout + double sum, mean, min, max, p50, p90, p95, p99; + std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(vnLatency); + std::cout << "Total Frames Decoded=" << nTotalFrames << " FPS=" << nTotalFrames / elapsedTime << std::endl; + std::cout << "Mean Latency for each video=" << mean * 1000 << " P50 Latency=" << p50 * 1000 + << " P90 Latency=" << p90 * 1000 << " P95 Latency=" << p95 * 1000 << " P99 Latency=" << p99 * 1000 + << "ms" << std::endl; + + std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(videoFPS); + std::cout << "Mean FPS for each video=" << mean << " P50 FPS=" << p50 << " P90 FPS=" << p90 + << " P95 FPS=" << p95 << " P99 FPS=" << p99 << std::endl; + std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(frLatency); + std::cout << "Mean Latency for each frame=" << mean * 1000 << " P50 Latency=" << p50 * 1000 + << " P90 Latency=" << p90 * 1000 << " P95 Latency=" << p95 * 1000 << " P99 Latency=" << p99 * 1000 + << "ms" << std::endl; + if (outputFilePath.size() != 0) { + WriteRawData(vDec, nThread, vnLatency, vnFrame, outputFilePath); + } + // Deinitialization + for (int i = 0; i < nThread; i++) { + delete (vDec[i]); + } + for (int i = 0; i < nThread; i++) { + if (vExceptionPtrs[i]) { + std::rethrow_exception(vExceptionPtrs[i]); + } + } + } catch (const std::exception &ex) { + std::cout << ex.what(); + exit(1); + } + return 0; +} diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt new file mode 100644 index 00000000..83cb1506 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt @@ -0,0 +1,117 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +cmake_minimum_required(VERSION 3.18) +project(cuda_decode_performance) + +find_package(CUDA QUIET) +if(CUDA_FOUND) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + + set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) + set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) + set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) + set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) + set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + find_package(PkgConfig REQUIRED) + pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) + pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) + pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) + pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) + + set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) + find_library(AVCODEC_LIBRARY NAMES avcodec + HINTS + ${PC_AVCODEC_LIBDIR} + ${PC_AVCODEC_LIBRARY_DIRS} + ) + find_library(AVFORMAT_LIBRARY NAMES avformat + HINTS + ${PC_AVFORMAT_LIBDIR} + ${PC_AVFORMAT_LIBRARY_DIRS} + ) + find_library(AVUTIL_LIBRARY NAMES avutil + HINTS + ${PC_AVUTIL_LIBDIR} + ${PC_AVUTIL_LIBRARY_DIRS} + ) + find_library(SWRESAMPLE_LIBRARY NAMES swresample + HINTS + ${PC_SWRESAMPLE_LIBDIR} + ${PC_SWRESAMPLE_LIBRARY_DIRS} + ) + set(AVCODEC_LIB ${AVCODEC_LIBRARY}) + set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) + set(AVUTIL_LIB ${AVUTIL_LIBRARY}) + set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) + endif() + + set(APP_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp + ) + + set(NV_DEC_SOURCES + ${NV_DEC_DIR}/NvDecoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp + ) + + set(NV_DEC_HDRS + ${NV_DEC_DIR}/NvDecoder.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h + ${NVCODEC_UTILS_DIR}/NvCodecUtils.h + ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h + ) + + source_group( "headers" FILES ${NV_DEC_HDRS} ) + source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) + set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") + find_package(CUDA) + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") + if ( CMAKE_COMPILER_IS_GNUCC ) + if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) + list(APPEND CUDA_NVCC_FLAGS -std=c++11) + endif() + endif() + + # Check if the file exists + if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) + execute_process( + COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so + RESULT_VARIABLE result + ) + if(result) + message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") + endif() + endif () + + find_library(CUVID_LIB nvcuvid + HINTS + "/usr/local/lib/" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" + ) + + cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) + + set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} + ${NVCODEC_PUBLIC_INTERFACE_DIR} + ${NVCODEC_UTILS_DIR} + ${NV_CODEC_DIR} + ${NV_APPDEC_COMMON_DIR} + ${NV_FFMPEG_HDRS} + 
${THIRD_PARTY_SAMPLE_DIR}
+    )
+
+    target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
+                          ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
+
+    install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
+endif()
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp
new file mode 100644
index 00000000..ee23391b
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp
@@ -0,0 +1,263 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <chrono>
+
+#include "OptimizedNvDecoder.h"
+
+int OptimizedNvDecoder::Decode(const uint8_t *pData, int nSize, int nFlags, int64_t nTimestamp) {
+    m_nDecodedFrame = 0;
+    m_nDecodedFrameReturned = 0;
+    CUVIDSOURCEDATAPACKET packet = {0};
+    packet.payload = pData;
+    packet.payload_size = nSize;
+    packet.flags = nFlags | CUVID_PKT_TIMESTAMP;
+    packet.timestamp = nTimestamp;
+    if (!pData || nSize == 0) {
+        packet.flags |= CUVID_PKT_ENDOFSTREAM;
+    }
+    auto start = std::chrono::high_resolution_clock::now();
+    NVDEC_API_CALL(cuvidParseVideoData(m_hParser, &packet));
+    int64_t elapsedTime =
+        std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - start)
+            .count();
+    frameLatency.push_back(std::make_tuple(m_nDecodedFrame, elapsedTime / 1000.0f / 1000.0f));
+    return m_nDecodedFrame;
+}
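+
+// Note: each Decode() call above appends one (frames returned, elapsed seconds) sample to
+// frameLatency; AppDecPerf later divides a sample's elapsed time by its frame count to
+// derive per-frame latency.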
+
+OptimizedNvDecoder::OptimizedNvDecoder(CUcontext &cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec,
+                                       CUVIDDECODECAPS decodecaps, bool bLowLatency, bool bDeviceFramePitched,
+                                       const Rect *pCropRect, const Dim *pResizeDim, bool extract_user_SEI_Message,
+                                       int maxWidth, int maxHeight, unsigned int clkRate, bool force_zero_latency) {
+    m_cuContext = cuContext;
+    m_bUseDeviceFrame = bUseDeviceFrame;
+    m_eCodec = eCodec;
+    m_bDeviceFramePitched = bDeviceFramePitched;
+    m_bExtractSEIMessage = extract_user_SEI_Message;
+    m_nMaxWidth = maxWidth;
+    m_nMaxHeight = maxHeight;
+    m_bForce_zero_latency = force_zero_latency;
+    if (pCropRect)
+        m_cropRect = *pCropRect;
+    if (pResizeDim)
+        m_resizeDim = *pResizeDim;
+
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext));
+
+    ck(cuStreamCreate(&m_cuvidStream, CU_STREAM_DEFAULT));
+
+    decoderSessionID = 0;
+
+    if (m_bExtractSEIMessage) {
+        m_fpSEI = fopen("sei_message.txt", "wb");
+        m_pCurrSEIMessage = new CUVIDSEIMESSAGEINFO;
+        memset(&m_SEIMessagesDisplayOrder, 0, sizeof(m_SEIMessagesDisplayOrder));
+    }
+    CUVIDPARSERPARAMS videoParserParameters = {};
+    videoParserParameters.CodecType = eCodec;
+    videoParserParameters.ulMaxNumDecodeSurfaces = 1;
+    videoParserParameters.ulClockRate = clkRate;
+    videoParserParameters.ulMaxDisplayDelay = bLowLatency ? 0 : 1;
+    videoParserParameters.pUserData = this;
+    videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc;
+    videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc;
+    videoParserParameters.pfnDisplayPicture = m_bForce_zero_latency ? NULL : HandlePictureDisplayProc;
+    videoParserParameters.pfnGetOperatingPoint = HandleOperatingPointProc;
+    videoParserParameters.pfnGetSEIMsg = m_bExtractSEIMessage ? HandleSEIMessagesProc : NULL;
+    NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &videoParserParameters));
+    // Reuse the decoder caps queried before
+    m_decodecaps = decodecaps;
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+}
+
+int OptimizedNvDecoder::HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat) {
+    START_TIMER
+    m_videoInfo.str("");
+    m_videoInfo.clear();
+    m_videoInfo << "Video Input Information" << std::endl
+                << "\tCodec        : " << GetVideoCodecString(pVideoFormat->codec) << std::endl
+                << "\tFrame rate   : " << pVideoFormat->frame_rate.numerator << "/"
+                << pVideoFormat->frame_rate.denominator << " = "
+                << 1.0 * pVideoFormat->frame_rate.numerator / pVideoFormat->frame_rate.denominator << " fps"
+                << std::endl
+                << "\tSequence     : " << (pVideoFormat->progressive_sequence ? "Progressive" : "Interlaced")
+                << std::endl
+                << "\tCoded size   : [" << pVideoFormat->coded_width << ", " << pVideoFormat->coded_height << "]"
+                << std::endl
+                << "\tDisplay area : [" << pVideoFormat->display_area.left << ", " << pVideoFormat->display_area.top
+                << ", " << pVideoFormat->display_area.right << ", " << pVideoFormat->display_area.bottom << "]"
+                << std::endl
+                << "\tChroma       : " << GetVideoChromaFormatString(pVideoFormat->chroma_format) << std::endl
+                << "\tBit depth    : " << pVideoFormat->bit_depth_luma_minus8 + 8;
+    m_videoInfo << std::endl;
+
+    int nDecodeSurface = pVideoFormat->min_num_decode_surfaces;
+
+    // Re-query cuvidGetDecoderCaps only when the video codec or format changes
+    if (m_decodecaps.eCodecType != pVideoFormat->codec || m_decodecaps.eChromaFormat != pVideoFormat->chroma_format ||
+        m_decodecaps.nBitDepthMinus8 != pVideoFormat->bit_depth_luma_minus8) {
+        m_decodecaps.eCodecType = pVideoFormat->codec;
+        m_decodecaps.eChromaFormat = pVideoFormat->chroma_format;
+        m_decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+
+        CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+        NVDEC_API_CALL(cuvidGetDecoderCaps(&m_decodecaps));
+        CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    }
+
+    if (!m_decodecaps.bIsSupported) {
+        NVDEC_THROW_ERROR("Codec not supported on this GPU", CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if ((pVideoFormat->coded_width > m_decodecaps.nMaxWidth) ||
+        (pVideoFormat->coded_height > m_decodecaps.nMaxHeight)) {
+
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "Resolution          : " << pVideoFormat->coded_width << "x" << pVideoFormat->coded_height
+                    << std::endl
+                    << "Max Supported (wxh) : " << m_decodecaps.nMaxWidth << "x" << m_decodecaps.nMaxHeight
+                    << std::endl
+                    << "Resolution not supported on this GPU";
+
+        const std::string cErr = errorString.str();
+        NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+    if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) > m_decodecaps.nMaxMBCount) {
+
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "MBCount             : " << (pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4)
+                    << std::endl
+                    << "Max Supported mbcnt : " << m_decodecaps.nMaxMBCount << std::endl
+                    << "MBCount not supported on this GPU";
+        NVDEC_THROW_ERROR(errorString.str(), CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if (m_nWidth && m_nLumaHeight && m_nChromaHeight) {
+
+        // cuvidCreateDecoder() has been called before, and now there's possible config change
+        return ReconfigureDecoder(pVideoFormat);
+    }
+
+    // eCodec has been set in the constructor (for parser); here it's set again for potential correction
+    m_eCodec = pVideoFormat->codec;
+    m_eChromaFormat = pVideoFormat->chroma_format;
+    m_nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    m_nBPP = m_nBitDepthMinus8 > 0 ? 2 : 1;
+
+    // Set the output surface format to match the chroma format
+    if (m_eChromaFormat == cudaVideoChromaFormat_420 || m_eChromaFormat == cudaVideoChromaFormat_Monochrome)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_444)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_422)
+        m_eOutputFormat = cudaVideoSurfaceFormat_NV12; // no 4:2:2 output format supported yet so make 420 default
+
+    // Check if output format supported. If not, check fallback options
+    if (!(m_decodecaps.nOutputFormatMask & (1 << m_eOutputFormat))) {
+        if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12))
+            m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+        else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016))
+            m_eOutputFormat = cudaVideoSurfaceFormat_P016;
+        else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444;
+        else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444_16Bit;
+        else
+            NVDEC_THROW_ERROR("No supported output format found", CUDA_ERROR_NOT_SUPPORTED);
+    }
+    m_videoFormat = *pVideoFormat;
+
+    CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0};
+    videoDecodeCreateInfo.CodecType = pVideoFormat->codec;
+    videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format;
+    videoDecodeCreateInfo.OutputFormat = m_eOutputFormat;
+    videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    if (pVideoFormat->progressive_sequence)
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
+    else
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive;
+    videoDecodeCreateInfo.ulNumOutputSurfaces = 2;
+    // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware
+    videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
+    videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface;
+    videoDecodeCreateInfo.vidLock = m_ctxLock;
+    videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width;
+    videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height;
+    // AV1 has max width/height of sequence in sequence header
+    if (pVideoFormat->codec == cudaVideoCodec_AV1 && pVideoFormat->seqhdr_data_length > 0) {
+        CUVIDEOFORMATEX *vidFormatEx = (CUVIDEOFORMATEX *)pVideoFormat;
+        if (m_nMaxWidth < pVideoFormat->coded_width) {
+            m_nMaxWidth = vidFormatEx->av1.max_width;
+        }
+        if (m_nMaxHeight < pVideoFormat->coded_height) {
+            m_nMaxHeight = vidFormatEx->av1.max_height;
+        }
+    }
+    if (m_nMaxWidth < (int)pVideoFormat->coded_width)
+        m_nMaxWidth = pVideoFormat->coded_width;
+    if (m_nMaxHeight < (int)pVideoFormat->coded_height)
+        m_nMaxHeight = pVideoFormat->coded_height;
+    videoDecodeCreateInfo.ulMaxWidth = m_nMaxWidth;
+    videoDecodeCreateInfo.ulMaxHeight = m_nMaxHeight;
+
+    if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) {
+        m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+        m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+        videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width;
+        videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height;
+    } else {
+        if (m_resizeDim.w && m_resizeDim.h) {
+            videoDecodeCreateInfo.display_area.left = pVideoFormat->display_area.left;
+            videoDecodeCreateInfo.display_area.top = pVideoFormat->display_area.top;
+            videoDecodeCreateInfo.display_area.right = pVideoFormat->display_area.right;
+            videoDecodeCreateInfo.display_area.bottom = pVideoFormat->display_area.bottom;
+            m_nWidth = m_resizeDim.w;
+            m_nLumaHeight = m_resizeDim.h;
+        }
+
+        if (m_cropRect.r && m_cropRect.b) {
+            videoDecodeCreateInfo.display_area.left = m_cropRect.l;
+            videoDecodeCreateInfo.display_area.top = m_cropRect.t;
+            videoDecodeCreateInfo.display_area.right = m_cropRect.r;
+            videoDecodeCreateInfo.display_area.bottom = m_cropRect.b;
+            m_nWidth = m_cropRect.r - m_cropRect.l;
+            m_nLumaHeight = m_cropRect.b - m_cropRect.t;
+        }
+        videoDecodeCreateInfo.ulTargetWidth = m_nWidth;
+        videoDecodeCreateInfo.ulTargetHeight = m_nLumaHeight;
+    }
+
+    m_nChromaHeight = (int)(ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)));
+    m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+    m_nSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight;
+    m_nSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth;
+    m_displayRect.b = videoDecodeCreateInfo.display_area.bottom;
+    m_displayRect.t = videoDecodeCreateInfo.display_area.top;
+    m_displayRect.l = videoDecodeCreateInfo.display_area.left;
+    m_displayRect.r = videoDecodeCreateInfo.display_area.right;
+
+    m_videoInfo << "Video Decoding Params:" << std::endl
+                << "\tNum Surfaces : " << videoDecodeCreateInfo.ulNumDecodeSurfaces << std::endl
+                << "\tCrop         : [" << videoDecodeCreateInfo.display_area.left << ", "
+                << videoDecodeCreateInfo.display_area.top << ", " << videoDecodeCreateInfo.display_area.right << ", "
+                << videoDecodeCreateInfo.display_area.bottom << "]" << std::endl
+                << "\tResize       : " << videoDecodeCreateInfo.ulTargetWidth << "x"
+                << videoDecodeCreateInfo.ulTargetHeight << std::endl
+                << "\tDeinterlace  : "
+                << std::vector<const char *>{"Weave", "Bob", "Adaptive"}[videoDecodeCreateInfo.DeinterlaceMode];
+    m_videoInfo << std::endl;
+
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidCreateDecoder(&m_hDecoder, &videoDecodeCreateInfo));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    STOP_TIMER("Session Initialization Time: ");
+    NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime);
+    return nDecodeSurface;
+}
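+
+// Illustrative sketch of the caps reuse this class enables (names mirror AppDecPerf.cpp,
+// where GetDefaultDecoderCaps and InitOptimizedNvDecoder are defined): query the decoder
+// caps once, then pass the same CUVIDDECODECAPS to every decoder session.
+//   CUVIDDECODECAPS caps;
+//   GetDefaultDecoderCaps(caps, cudaVideoCodec_H264);  // one cuvidGetDecoderCaps call
+//   auto *dec = new OptimizedNvDecoder(cuContext, true, cudaVideoCodec_H264, caps);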
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h
new file mode 100644
index 00000000..f9881c80
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h
@@ -0,0 +1,52 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "NvDecoder/NvDecoder.h"
+
+// This class is derived from the NvDecoder class and is used to avoid the cuvidGetDecoderCaps overhead
+class OptimizedNvDecoder : public NvDecoder {
+
+  public:
+    OptimizedNvDecoder() {}
+    /**
+     * @brief This function is used to initialize the decoder session.
+     * Application must call this function to initialize the decoder, before
+     * starting to decode any frames.
+     * The only difference from the original function is the added member m_decodecaps.
+     * The rest is the same as the original function; refer to NvDecoder.cpp in the NVIDIA Video Codec SDK.
+     */
+    OptimizedNvDecoder(CUcontext &cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, CUVIDDECODECAPS decodecaps,
+                       bool bLowLatency = false, bool bDeviceFramePitched = false, const Rect *pCropRect = NULL,
+                       const Dim *pResizeDim = NULL, bool extract_user_SEI_Message = false, int maxWidth = 0,
+                       int maxHeight = 0, unsigned int clkRate = 1000, bool force_zero_latency = false);
+
+    /**
+     * @brief This function overrides the original Decode function to record the latency at frame level.
+     */
+    int Decode(const uint8_t *pData, int nSize, int nFlags = 0, int64_t nTimestamp = 0);
+    /**
+     * @brief This function is used to get the frameLatency vector
+     */
+    std::vector<std::tuple<int, double>> &GetFrameLatency() { return frameLatency; }
+
+  protected:
+    /**
+     * @brief Callback function to be registered for getting a callback when decoding of sequence starts
+     */
+    static int CUDAAPI HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat) {
+        if (pUserData == nullptr) {
+            throw std::runtime_error("pUserData is nullptr");
+        }
+        return ((OptimizedNvDecoder *)pUserData)->HandleVideoSequence(pVideoFormat);
+    }
+    /**
+     * @brief Define the new handler invoked when decoding of a sequence starts.
+     * The only change is to re-query the decoder caps when the video codec or format changes.
+     * The rest is the same as the original function; refer to NvDecoder.cpp in the NVIDIA Video Codec SDK.
+     */
+    int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat);
+
+    CUVIDDECODECAPS m_decodecaps;
+
+    std::vector<std::tuple<int, double>> frameLatency;
+};
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h
new file mode 100644
index 00000000..5592b76e
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h
@@ -0,0 +1,99 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+// ThreadPool is a simple thread pool implementation that supports enqueueing a task together with the index of the
+// thread that runs it and custom arguments, i.e. task(thread_index, *args).
+class ThreadPool {
+  public:
+    /**
+     * @brief Construct a new ThreadPool object with the given number of threads.
+     */
+    ThreadPool(size_t numThreads) {
+        for (size_t i = 0; i < numThreads; ++i) {
+            threads.emplace_back(&ThreadPool::worker, this, i);
+        }
+    }
+    /**
+     * @brief Destroy the ThreadPool object and join all threads.
+     */
+    ~ThreadPool() {
+        {
+            std::unique_lock<std::mutex> lock(mutex);
+            stop = true;
+        }
+        cv.notify_all();
+
+        for (auto &thread : threads) {
+            thread.join();
+        }
+    }
+    /**
+     * @brief TaskWrapper is a wrapper of the task with the index of thread to use and custom arguments like
+     * task(thread_index, *args).
+     */
+    template <typename ReturnType> struct TaskWrapper {
+        std::shared_ptr<std::packaged_task<ReturnType(size_t)>> task;
+
+        template <typename Callable, typename... CallableArgs> TaskWrapper(Callable &&f, CallableArgs &&...args) {
+            task = std::make_shared<std::packaged_task<ReturnType(size_t)>>(
+                [f, args...](size_t threadIdx) mutable { return f(threadIdx, args...); });
+        }
+
+        void operator()(size_t threadIdx) { (*task)(threadIdx); }
+    };
+    /**
+     * @brief Enqueue a task with custom arguments and return a future that yields the task's result when finished.
+     */
+    template <typename F, typename... Args>
+    auto enqueue(F &&f, Args &&...args) -> std::future<typename std::result_of<F(size_t, Args...)>::type> {
+        using ReturnType = typename std::result_of<F(size_t, Args...)>::type;
+
+        TaskWrapper<ReturnType> wrapper(std::forward<F>(f), std::forward<Args>(args)...);
+        std::future<ReturnType> res = wrapper.task->get_future();
+
+        {
+            std::unique_lock<std::mutex> lock(mutex);
+            tasks.emplace(std::move(wrapper));
+        }
+        cv.notify_one();
+
+        return res;
+    }
+
+  private:
+    /**
+     * @brief The worker function that dequeues tasks and executes them, passing its thread index to each task.
+     */
+    void worker(size_t threadIdx) {
+        while (true) {
+            std::function<void(size_t)> task;
+            {
+                std::unique_lock<std::mutex> lock(mutex);
+                cv.wait(lock, [this] { return stop || !tasks.empty(); });
+
+                if (stop && tasks.empty()) {
+                    return;
+                }
+
+                task = tasks.front();
+                tasks.pop();
+            }
+
+            task(threadIdx);
+        }
+    }
+
+    std::vector<std::thread> threads;
+    std::queue<std::function<void(size_t)>> tasks;
+    std::mutex mutex;
+    std::condition_variable cv;
+    bool stop = false;
+};
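+
+// Usage sketch (illustrative; not part of the benchmark itself): the first parameter of an
+// enqueued callable receives the index of the worker thread that runs it.
+//   ThreadPool pool(4);
+//   auto fut = pool.enqueue([](size_t threadIdx, int x) { return x * 2; }, 21);
+//   int doubled = fut.get(); // 42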
+ */ + template + auto enqueue(F &&f, Args &&...args) -> std::future::type> { + using ReturnType = typename std::result_of::type; + + TaskWrapper wrapper(std::forward(f), std::forward(args)...); + std::future res = wrapper.task->get_future(); + + { + std::unique_lock lock(mutex); + tasks.emplace(std::move(wrapper)); + } + cv.notify_one(); + + return res; + } + + private: + /** + * @brief The worker function that dequeues the task and executes it for each thread index. + */ + void worker(size_t threadIdx) { + while (true) { + std::function task; + { + std::unique_lock lock(mutex); + cv.wait(lock, [this] { return stop || !tasks.empty(); }); + + if (stop && tasks.empty()) { + return; + } + + task = tasks.front(); + tasks.pop(); + } + + task(threadIdx); + } + } + + std::vector threads; + std::queue> tasks; + std::mutex mutex; + std::condition_variable cv; + bool stop = false; +}; diff --git a/third_party/Video_Codec_SDK/Interface/cuviddec.h b/third_party/Video_Codec_SDK/Interface/cuviddec.h new file mode 100644 index 00000000..1d13eec8 --- /dev/null +++ b/third_party/Video_Codec_SDK/Interface/cuviddec.h @@ -0,0 +1,1173 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/*****************************************************************************************************/ +//! \file cuviddec.h +//! NVDECODE API provides video decoding interface to NVIDIA GPU devices. +//! This file contains constants, structure definitions and function prototypes used for decoding. +/*****************************************************************************************************/ + +#if !defined(__CUDA_VIDEO_H__) +#define __CUDA_VIDEO_H__ + +#ifndef __cuda_cuda_h__ +#include +#endif // __cuda_cuda_h__ + +#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) +#if (CUDA_VERSION >= 3020) && (!defined(CUDA_FORCE_API_VERSION) || (CUDA_FORCE_API_VERSION >= 3020)) +#define __CUVID_DEVPTR64 +#endif +#endif + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +typedef void *CUvideodecoder; +typedef struct _CUcontextlock_st *CUvideoctxlock; + +/*********************************************************************************/ +//! \enum cudaVideoCodec +//! Video codec enums +//! 
These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures +/*********************************************************************************/ +typedef enum cudaVideoCodec_enum { + cudaVideoCodec_MPEG1 = 0, /**< MPEG1 */ + cudaVideoCodec_MPEG2, /**< MPEG2 */ + cudaVideoCodec_MPEG4, /**< MPEG4 */ + cudaVideoCodec_VC1, /**< VC1 */ + cudaVideoCodec_H264, /**< H264 */ + cudaVideoCodec_JPEG, /**< JPEG */ + cudaVideoCodec_H264_SVC, /**< H264-SVC */ + cudaVideoCodec_H264_MVC, /**< H264-MVC */ + cudaVideoCodec_HEVC, /**< HEVC */ + cudaVideoCodec_VP8, /**< VP8 */ + cudaVideoCodec_VP9, /**< VP9 */ + cudaVideoCodec_AV1, /**< AV1 */ + cudaVideoCodec_NumCodecs, /**< Max codecs */ + // Uncompressed YUV + cudaVideoCodec_YUV420 = (('I' << 24) | ('Y' << 16) | ('U' << 8) | ('V')), /**< Y,U,V (4:2:0) */ + cudaVideoCodec_YV12 = (('Y' << 24) | ('V' << 16) | ('1' << 8) | ('2')), /**< Y,V,U (4:2:0) */ + cudaVideoCodec_NV12 = (('N' << 24) | ('V' << 16) | ('1' << 8) | ('2')), /**< Y,UV (4:2:0) */ + cudaVideoCodec_YUYV = (('Y' << 24) | ('U' << 16) | ('Y' << 8) | ('V')), /**< YUYV/YUY2 (4:2:2) */ + cudaVideoCodec_UYVY = (('U' << 24) | ('Y' << 16) | ('V' << 8) | ('Y')) /**< UYVY (4:2:2) */ +} cudaVideoCodec; + +/*********************************************************************************/ +//! \enum cudaVideoSurfaceFormat +//! Video surface format enums used for output format of decoded output +//! These enums are used in CUVIDDECODECREATEINFO structure +/*********************************************************************************/ +typedef enum cudaVideoSurfaceFormat_enum { + cudaVideoSurfaceFormat_NV12 = 0, /**< Semi-Planar YUV [Y plane followed by interleaved UV plane] */ + cudaVideoSurfaceFormat_P016 = 1, /**< 16 bit Semi-Planar YUV [Y plane followed by interleaved UV plane]. + Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0) */ + cudaVideoSurfaceFormat_YUV444 = 2, /**< Planar YUV [Y plane followed by U and V planes] */ + cudaVideoSurfaceFormat_YUV444_16Bit = 3, /**< 16 bit Planar YUV [Y plane followed by U and V planes]. + Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0) */ +} cudaVideoSurfaceFormat; + +/******************************************************************************************************************/ +//! \enum cudaVideoDeinterlaceMode +//! Deinterlacing mode enums +//! These enums are used in CUVIDDECODECREATEINFO structure +//! Use cudaVideoDeinterlaceMode_Weave for progressive content and for content that doesn't need deinterlacing +//! cudaVideoDeinterlaceMode_Adaptive needs more video memory than other DImodes +/******************************************************************************************************************/ +typedef enum cudaVideoDeinterlaceMode_enum { + cudaVideoDeinterlaceMode_Weave = 0, /**< Weave both fields (no deinterlacing) */ + cudaVideoDeinterlaceMode_Bob, /**< Drop one field */ + cudaVideoDeinterlaceMode_Adaptive /**< Adaptive deinterlacing */ +} cudaVideoDeinterlaceMode; + +/**************************************************************************************************************/ +//! \enum cudaVideoChromaFormat +//! Chroma format enums +//! 
These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures +/**************************************************************************************************************/ +typedef enum cudaVideoChromaFormat_enum { + cudaVideoChromaFormat_Monochrome = 0, /**< MonoChrome */ + cudaVideoChromaFormat_420, /**< YUV 4:2:0 */ + cudaVideoChromaFormat_422, /**< YUV 4:2:2 */ + cudaVideoChromaFormat_444 /**< YUV 4:4:4 */ +} cudaVideoChromaFormat; + +/*************************************************************************************************************/ +//! \enum cudaVideoCreateFlags +//! Decoder flag enums to select preferred decode path +//! cudaVideoCreate_Default and cudaVideoCreate_PreferCUVID are most optimized, use these whenever possible +/*************************************************************************************************************/ +typedef enum cudaVideoCreateFlags_enum { + cudaVideoCreate_Default = 0x00, /**< Default operation mode: use dedicated video engines */ + cudaVideoCreate_PreferCUDA = + 0x01, /**< Use CUDA-based decoder (requires valid vidLock object for multi-threading) */ + cudaVideoCreate_PreferDXVA = 0x02, /**< Go through DXVA internally if possible (requires D3D9 interop) */ + cudaVideoCreate_PreferCUVID = 0x04 /**< Use dedicated video engines directly */ +} cudaVideoCreateFlags; + +/*************************************************************************/ +//! \enum cuvidDecodeStatus +//! Decode status enums +//! These enums are used in CUVIDGETDECODESTATUS structure +/*************************************************************************/ +typedef enum cuvidDecodeStatus_enum { + cuvidDecodeStatus_Invalid = 0, // Decode status is not valid + cuvidDecodeStatus_InProgress = 1, // Decode is in progress + cuvidDecodeStatus_Success = 2, // Decode is completed without any errors + // 3 to 7 enums are reserved for future use + cuvidDecodeStatus_Error = 8, // Decode is completed with an error (error is not concealed) + cuvidDecodeStatus_Error_Concealed = 9, // Decode is completed with an error and error is concealed +} cuvidDecodeStatus; + +/**************************************************************************************************************/ +//! \struct CUVIDDECODECAPS; +//! 
This structure is used in cuvidGetDecoderCaps API +/**************************************************************************************************************/ +typedef struct _CUVIDDECODECAPS { + cudaVideoCodec eCodecType; /**< IN: cudaVideoCodec_XXX */ + cudaVideoChromaFormat eChromaFormat; /**< IN: cudaVideoChromaFormat_XXX */ + unsigned int nBitDepthMinus8; /**< IN: The Value "BitDepth minus 8" */ + unsigned int reserved1[3]; /**< Reserved for future use - set to zero */ + + unsigned char bIsSupported; /**< OUT: 1 if codec supported, 0 if not supported */ + unsigned char nNumNVDECs; /**< OUT: Number of NVDECs that can support IN params */ + unsigned short nOutputFormatMask; /**< OUT: each bit represents corresponding cudaVideoSurfaceFormat enum */ + unsigned int nMaxWidth; /**< OUT: Max supported coded width in pixels */ + unsigned int nMaxHeight; /**< OUT: Max supported coded height in pixels */ + unsigned int nMaxMBCount; /**< OUT: Max supported macroblock count + CodedWidth*CodedHeight/256 must be <= nMaxMBCount */ + unsigned short nMinWidth; /**< OUT: Min supported coded width in pixels */ + unsigned short nMinHeight; /**< OUT: Min supported coded height in pixels */ + unsigned char bIsHistogramSupported; /**< OUT: 1 if Y component histogram output is supported, 0 if not + Note: histogram is computed on original picture data before + any post-processing like scaling, cropping, etc. is applied */ + unsigned char nCounterBitDepth; /**< OUT: histogram counter bit depth */ + unsigned short nMaxHistogramBins; /**< OUT: Max number of histogram bins */ + unsigned int reserved3[10]; /**< Reserved for future use - set to zero */ +} CUVIDDECODECAPS; + +/**************************************************************************************************************/ +//! \struct CUVIDDECODECREATEINFO +//! This structure is used in cuvidCreateDecoder API +/**************************************************************************************************************/ +typedef struct _CUVIDDECODECREATEINFO { + unsigned long ulWidth; /**< IN: Coded sequence width in pixels */ + unsigned long ulHeight; /**< IN: Coded sequence height in pixels */ + unsigned long ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode surfaces */ + cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */ + cudaVideoChromaFormat ChromaFormat; /**< IN: cudaVideoChromaFormat_XXX */ + unsigned long ulCreationFlags; /**< IN: Decoder creation flags (cudaVideoCreateFlags_XXX) */ + unsigned long bitDepthMinus8; /**< IN: The value "BitDepth minus 8" */ + unsigned long ulIntraDecodeOnly; /**< IN: Set 1 only if video has all intra frames (default value is 0). This will + optimize video memory for Intra frames only decoding. The support is limited + to specific codecs - H264, HEVC, VP9, the flag will be ignored for codecs + which are not supported. However decoding might fail if the flag is enabled in + case of supported codecs for regular bit streams having P and/or B frames. 
*/ + unsigned long ulMaxWidth; /**< IN: Coded sequence max width in pixels used with reconfigure Decoder */ + unsigned long ulMaxHeight; /**< IN: Coded sequence max height in pixels used with reconfigure Decoder */ + unsigned long Reserved1; /**< Reserved for future use - set to zero */ + /** + * IN: area of the frame that should be displayed + */ + struct { + short left; + short top; + short right; + short bottom; + } display_area; + + cudaVideoSurfaceFormat OutputFormat; /**< IN: cudaVideoSurfaceFormat_XXX */ + cudaVideoDeinterlaceMode DeinterlaceMode; /**< IN: cudaVideoDeinterlaceMode_XXX */ + unsigned long ulTargetWidth; /**< IN: Post-processed output width (Should be aligned to 2) */ + unsigned long ulTargetHeight; /**< IN: Post-processed output height (Should be aligned to 2) */ + unsigned long ulNumOutputSurfaces; /**< IN: Maximum number of output surfaces simultaneously mapped */ + CUvideoctxlock vidLock; /**< IN: If non-NULL, context lock used for synchronizing ownership of + the cuda context. Needed for cudaVideoCreate_PreferCUDA decode */ + /** + * IN: target rectangle in the output frame (for aspect ratio conversion) + * if a null rectangle is specified, {0,0,ulTargetWidth,ulTargetHeight} will be used + */ + struct { + short left; + short top; + short right; + short bottom; + } target_rect; + + unsigned long enableHistogram; /**< IN: enable histogram output, if supported */ + unsigned long Reserved2[4]; /**< Reserved for future use - set to zero */ +} CUVIDDECODECREATEINFO; + +/*********************************************************/ +//! \struct CUVIDH264DPBENTRY +//! H.264 DPB entry +//! This structure is used in CUVIDH264PICPARAMS structure +/*********************************************************/ +typedef struct _CUVIDH264DPBENTRY { + int PicIdx; /**< picture index of reference frame */ + int FrameIdx; /**< frame_num(short-term) or LongTermFrameIdx(long-term) */ + int is_long_term; /**< 0=short term reference, 1=long term reference */ + int not_existing; /**< non-existing reference frame (corresponding PicIdx should be set to -1) */ + int used_for_reference; /**< 0=unused, 1=top_field, 2=bottom_field, 3=both_fields */ + int FieldOrderCnt[2]; /**< field order count of top and bottom fields */ +} CUVIDH264DPBENTRY; + +/************************************************************/ +//! \struct CUVIDH264MVCEXT +//! H.264 MVC picture parameters ext +//! This structure is used in CUVIDH264PICPARAMS structure +/************************************************************/ +typedef struct _CUVIDH264MVCEXT { + int num_views_minus1; /**< Max number of coded views minus 1 in video : Range - 0 to 1023 */ + int view_id; /**< view identifier */ + unsigned char inter_view_flag; /**< 1 if used for inter-view prediction, 0 if not */ + unsigned char num_inter_view_refs_l0; /**< number of inter-view ref pics in RefPicList0 */ + unsigned char num_inter_view_refs_l1; /**< number of inter-view ref pics in RefPicList1 */ + unsigned char MVCReserved8Bits; /**< Reserved bits */ + int InterViewRefsL0[16]; /**< view id of the i-th view component for inter-view prediction in RefPicList0 */ + int InterViewRefsL1[16]; /**< view id of the i-th view component for inter-view prediction in RefPicList1 */ +} CUVIDH264MVCEXT; + +/*********************************************************/ +//! \struct CUVIDH264SVCEXT +//! H.264 SVC picture parameters ext +//! 
This structure is used in CUVIDH264PICPARAMS structure +/*********************************************************/ +typedef struct _CUVIDH264SVCEXT { + unsigned char profile_idc; + unsigned char level_idc; + unsigned char DQId; + unsigned char DQIdMax; + unsigned char disable_inter_layer_deblocking_filter_idc; + unsigned char ref_layer_chroma_phase_y_plus1; + signed char inter_layer_slice_alpha_c0_offset_div2; + signed char inter_layer_slice_beta_offset_div2; + + unsigned short DPBEntryValidFlag; + unsigned char inter_layer_deblocking_filter_control_present_flag; + unsigned char extended_spatial_scalability_idc; + unsigned char adaptive_tcoeff_level_prediction_flag; + unsigned char slice_header_restriction_flag; + unsigned char chroma_phase_x_plus1_flag; + unsigned char chroma_phase_y_plus1; + + unsigned char tcoeff_level_prediction_flag; + unsigned char constrained_intra_resampling_flag; + unsigned char ref_layer_chroma_phase_x_plus1_flag; + unsigned char store_ref_base_pic_flag; + unsigned char Reserved8BitsA; + unsigned char Reserved8BitsB; + + short scaled_ref_layer_left_offset; + short scaled_ref_layer_top_offset; + short scaled_ref_layer_right_offset; + short scaled_ref_layer_bottom_offset; + unsigned short Reserved16Bits; + struct _CUVIDPICPARAMS *pNextLayer; /**< Points to the picparams for the next layer to be decoded. + Linked list ends at the target layer. */ + int bRefBaseLayer; /**< whether to store ref base pic */ +} CUVIDH264SVCEXT; + +/******************************************************/ +//! \struct CUVIDH264PICPARAMS +//! H.264 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/******************************************************/ +typedef struct _CUVIDH264PICPARAMS { + // SPS + int log2_max_frame_num_minus4; + int pic_order_cnt_type; + int log2_max_pic_order_cnt_lsb_minus4; + int delta_pic_order_always_zero_flag; + int frame_mbs_only_flag; + int direct_8x8_inference_flag; + int num_ref_frames; // NOTE: shall meet level 4.1 restrictions + unsigned char residual_colour_transform_flag; + unsigned char bit_depth_luma_minus8; // Must be 0 (only 8-bit supported) + unsigned char bit_depth_chroma_minus8; // Must be 0 (only 8-bit supported) + unsigned char qpprime_y_zero_transform_bypass_flag; + // PPS + int entropy_coding_mode_flag; + int pic_order_present_flag; + int num_ref_idx_l0_active_minus1; + int num_ref_idx_l1_active_minus1; + int weighted_pred_flag; + int weighted_bipred_idc; + int pic_init_qp_minus26; + int deblocking_filter_control_present_flag; + int redundant_pic_cnt_present_flag; + int transform_8x8_mode_flag; + int MbaffFrameFlag; + int constrained_intra_pred_flag; + int chroma_qp_index_offset; + int second_chroma_qp_index_offset; + int ref_pic_flag; + int frame_num; + int CurrFieldOrderCnt[2]; + // DPB + CUVIDH264DPBENTRY dpb[16]; // List of reference frames within the DPB + // Quantization Matrices (raster-order) + unsigned char WeightScale4x4[6][16]; + unsigned char WeightScale8x8[2][64]; + // FMO/ASO + unsigned char fmo_aso_enable; + unsigned char num_slice_groups_minus1; + unsigned char slice_group_map_type; + signed char pic_init_qs_minus26; + unsigned int slice_group_change_rate_minus1; + union { + unsigned long long slice_group_map_addr; + const unsigned char *pMb2SliceGroupMap; + } fmo; + unsigned int Reserved[12]; + // SVC/MVC + union { + CUVIDH264MVCEXT mvcext; + CUVIDH264SVCEXT svcext; + }; +} CUVIDH264PICPARAMS; + +/********************************************************/ +//! \struct CUVIDMPEG2PICPARAMS +//! 
MPEG-2 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/********************************************************/ +typedef struct _CUVIDMPEG2PICPARAMS { + int ForwardRefIdx; // Picture index of forward reference (P/B-frames) + int BackwardRefIdx; // Picture index of backward reference (B-frames) + int picture_coding_type; + int full_pel_forward_vector; + int full_pel_backward_vector; + int f_code[2][2]; + int intra_dc_precision; + int frame_pred_frame_dct; + int concealment_motion_vectors; + int q_scale_type; + int intra_vlc_format; + int alternate_scan; + int top_field_first; + // Quantization matrices (raster order) + unsigned char QuantMatrixIntra[64]; + unsigned char QuantMatrixInter[64]; +} CUVIDMPEG2PICPARAMS; + +// MPEG-4 has VOP types instead of Picture types +#define I_VOP 0 +#define P_VOP 1 +#define B_VOP 2 +#define S_VOP 3 + +/*******************************************************/ +//! \struct CUVIDMPEG4PICPARAMS +//! MPEG-4 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/*******************************************************/ +typedef struct _CUVIDMPEG4PICPARAMS { + int ForwardRefIdx; // Picture index of forward reference (P/B-frames) + int BackwardRefIdx; // Picture index of backward reference (B-frames) + // VOL + int video_object_layer_width; + int video_object_layer_height; + int vop_time_increment_bitcount; + int top_field_first; + int resync_marker_disable; + int quant_type; + int quarter_sample; + int short_video_header; + int divx_flags; + // VOP + int vop_coding_type; + int vop_coded; + int vop_rounding_type; + int alternate_vertical_scan_flag; + int interlaced; + int vop_fcode_forward; + int vop_fcode_backward; + int trd[2]; + int trb[2]; + // Quantization matrices (raster order) + unsigned char QuantMatrixIntra[64]; + unsigned char QuantMatrixInter[64]; + int gmc_enabled; +} CUVIDMPEG4PICPARAMS; + +/********************************************************/ +//! \struct CUVIDVC1PICPARAMS +//! VC1 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/********************************************************/ +typedef struct _CUVIDVC1PICPARAMS { + int ForwardRefIdx; /**< Picture index of forward reference (P/B-frames) */ + int BackwardRefIdx; /**< Picture index of backward reference (B-frames) */ + int FrameWidth; /**< Actual frame width */ + int FrameHeight; /**< Actual frame height */ + // PICTURE + int intra_pic_flag; /**< Set to 1 for I,BI frames */ + int ref_pic_flag; /**< Set to 1 for I,P frames */ + int progressive_fcm; /**< Progressive frame */ + // SEQUENCE + int profile; + int postprocflag; + int pulldown; + int interlace; + int tfcntrflag; + int finterpflag; + int psf; + int multires; + int syncmarker; + int rangered; + int maxbframes; + // ENTRYPOINT + int panscan_flag; + int refdist_flag; + int extended_mv; + int dquant; + int vstransform; + int loopfilter; + int fastuvmc; + int overlap; + int quantizer; + int extended_dmv; + int range_mapy_flag; + int range_mapy; + int range_mapuv_flag; + int range_mapuv; + int rangeredfrm; // range reduction state +} CUVIDVC1PICPARAMS; + +/***********************************************************/ +//! \struct CUVIDJPEGPICPARAMS +//! JPEG picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/***********************************************************/ +typedef struct _CUVIDJPEGPICPARAMS { + int Reserved; +} CUVIDJPEGPICPARAMS; + +/*******************************************************/ +//! 
\struct CUVIDHEVCPICPARAMS +//! HEVC picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/*******************************************************/ +typedef struct _CUVIDHEVCPICPARAMS { + // sps + int pic_width_in_luma_samples; + int pic_height_in_luma_samples; + unsigned char log2_min_luma_coding_block_size_minus3; + unsigned char log2_diff_max_min_luma_coding_block_size; + unsigned char log2_min_transform_block_size_minus2; + unsigned char log2_diff_max_min_transform_block_size; + unsigned char pcm_enabled_flag; + unsigned char log2_min_pcm_luma_coding_block_size_minus3; + unsigned char log2_diff_max_min_pcm_luma_coding_block_size; + unsigned char pcm_sample_bit_depth_luma_minus1; + + unsigned char pcm_sample_bit_depth_chroma_minus1; + unsigned char pcm_loop_filter_disabled_flag; + unsigned char strong_intra_smoothing_enabled_flag; + unsigned char max_transform_hierarchy_depth_intra; + unsigned char max_transform_hierarchy_depth_inter; + unsigned char amp_enabled_flag; + unsigned char separate_colour_plane_flag; + unsigned char log2_max_pic_order_cnt_lsb_minus4; + + unsigned char num_short_term_ref_pic_sets; + unsigned char long_term_ref_pics_present_flag; + unsigned char num_long_term_ref_pics_sps; + unsigned char sps_temporal_mvp_enabled_flag; + unsigned char sample_adaptive_offset_enabled_flag; + unsigned char scaling_list_enable_flag; + unsigned char IrapPicFlag; + unsigned char IdrPicFlag; + + unsigned char bit_depth_luma_minus8; + unsigned char bit_depth_chroma_minus8; + // sps/pps extension fields + unsigned char log2_max_transform_skip_block_size_minus2; + unsigned char log2_sao_offset_scale_luma; + unsigned char log2_sao_offset_scale_chroma; + unsigned char high_precision_offsets_enabled_flag; + unsigned char reserved1[10]; + + // pps + unsigned char dependent_slice_segments_enabled_flag; + unsigned char slice_segment_header_extension_present_flag; + unsigned char sign_data_hiding_enabled_flag; + unsigned char cu_qp_delta_enabled_flag; + unsigned char diff_cu_qp_delta_depth; + signed char init_qp_minus26; + signed char pps_cb_qp_offset; + signed char pps_cr_qp_offset; + + unsigned char constrained_intra_pred_flag; + unsigned char weighted_pred_flag; + unsigned char weighted_bipred_flag; + unsigned char transform_skip_enabled_flag; + unsigned char transquant_bypass_enabled_flag; + unsigned char entropy_coding_sync_enabled_flag; + unsigned char log2_parallel_merge_level_minus2; + unsigned char num_extra_slice_header_bits; + + unsigned char loop_filter_across_tiles_enabled_flag; + unsigned char loop_filter_across_slices_enabled_flag; + unsigned char output_flag_present_flag; + unsigned char num_ref_idx_l0_default_active_minus1; + unsigned char num_ref_idx_l1_default_active_minus1; + unsigned char lists_modification_present_flag; + unsigned char cabac_init_present_flag; + unsigned char pps_slice_chroma_qp_offsets_present_flag; + + unsigned char deblocking_filter_override_enabled_flag; + unsigned char pps_deblocking_filter_disabled_flag; + signed char pps_beta_offset_div2; + signed char pps_tc_offset_div2; + unsigned char tiles_enabled_flag; + unsigned char uniform_spacing_flag; + unsigned char num_tile_columns_minus1; + unsigned char num_tile_rows_minus1; + + unsigned short column_width_minus1[21]; + unsigned short row_height_minus1[21]; + + // sps and pps extension HEVC-main 444 + unsigned char sps_range_extension_flag; + unsigned char transform_skip_rotation_enabled_flag; + unsigned char transform_skip_context_enabled_flag; + unsigned char 
implicit_rdpcm_enabled_flag; + + unsigned char explicit_rdpcm_enabled_flag; + unsigned char extended_precision_processing_flag; + unsigned char intra_smoothing_disabled_flag; + unsigned char persistent_rice_adaptation_enabled_flag; + + unsigned char cabac_bypass_alignment_enabled_flag; + unsigned char pps_range_extension_flag; + unsigned char cross_component_prediction_enabled_flag; + unsigned char chroma_qp_offset_list_enabled_flag; + + unsigned char diff_cu_chroma_qp_offset_depth; + unsigned char chroma_qp_offset_list_len_minus1; + signed char cb_qp_offset_list[6]; + + signed char cr_qp_offset_list[6]; + unsigned char reserved2[2]; + + unsigned int reserved3[8]; + + // RefPicSets + int NumBitsForShortTermRPSInSlice; + int NumDeltaPocsOfRefRpsIdx; + int NumPocTotalCurr; + int NumPocStCurrBefore; + int NumPocStCurrAfter; + int NumPocLtCurr; + int CurrPicOrderCntVal; + int RefPicIdx[16]; // [refpic] Indices of valid reference pictures (-1 if unused for reference) + int PicOrderCntVal[16]; // [refpic] + unsigned char IsLongTerm[16]; // [refpic] 0=not a long-term reference, 1=long-term reference + unsigned char RefPicSetStCurrBefore[8]; // [0..NumPocStCurrBefore-1] -> refpic (0..15) + unsigned char RefPicSetStCurrAfter[8]; // [0..NumPocStCurrAfter-1] -> refpic (0..15) + unsigned char RefPicSetLtCurr[8]; // [0..NumPocLtCurr-1] -> refpic (0..15) + unsigned char RefPicSetInterLayer0[8]; + unsigned char RefPicSetInterLayer1[8]; + unsigned int reserved4[12]; + + // scaling lists (diag order) + unsigned char ScalingList4x4[6][16]; // [matrixId][i] + unsigned char ScalingList8x8[6][64]; // [matrixId][i] + unsigned char ScalingList16x16[6][64]; // [matrixId][i] + unsigned char ScalingList32x32[2][64]; // [matrixId][i] + unsigned char ScalingListDCCoeff16x16[6]; // [matrixId] + unsigned char ScalingListDCCoeff32x32[2]; // [matrixId] +} CUVIDHEVCPICPARAMS; + +/***********************************************************/ +//! \struct CUVIDVP8PICPARAMS +//! VP8 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/***********************************************************/ +typedef struct _CUVIDVP8PICPARAMS { + int width; + int height; + unsigned int first_partition_size; + // Frame Indexes + unsigned char LastRefIdx; + unsigned char GoldenRefIdx; + unsigned char AltRefIdx; + union { + struct { + unsigned char frame_type : 1; /**< 0 = KEYFRAME, 1 = INTERFRAME */ + unsigned char version : 3; + unsigned char show_frame : 1; + unsigned char update_mb_segmentation_data : 1; /**< Must be 0 if segmentation is not enabled */ + unsigned char Reserved2Bits : 2; + } vp8_frame_tag; + unsigned char wFrameTagFlags; + }; + unsigned char Reserved1[4]; + unsigned int Reserved2[3]; +} CUVIDVP8PICPARAMS; + +/***********************************************************/ +//! \struct CUVIDVP9PICPARAMS +//! VP9 picture parameters +//! 
This structure is used in CUVIDPICPARAMS structure +/***********************************************************/ +typedef struct _CUVIDVP9PICPARAMS { + unsigned int width; + unsigned int height; + + // Frame Indices + unsigned char LastRefIdx; + unsigned char GoldenRefIdx; + unsigned char AltRefIdx; + unsigned char colorSpace; + + unsigned short profile : 3; + unsigned short frameContextIdx : 2; + unsigned short frameType : 1; + unsigned short showFrame : 1; + unsigned short errorResilient : 1; + unsigned short frameParallelDecoding : 1; + unsigned short subSamplingX : 1; + unsigned short subSamplingY : 1; + unsigned short intraOnly : 1; + unsigned short allow_high_precision_mv : 1; + unsigned short refreshEntropyProbs : 1; + unsigned short reserved2Bits : 2; + + unsigned short reserved16Bits; + + unsigned char refFrameSignBias[4]; + + unsigned char bitDepthMinus8Luma; + unsigned char bitDepthMinus8Chroma; + unsigned char loopFilterLevel; + unsigned char loopFilterSharpness; + + unsigned char modeRefLfEnabled; + unsigned char log2_tile_columns; + unsigned char log2_tile_rows; + + unsigned char segmentEnabled : 1; + unsigned char segmentMapUpdate : 1; + unsigned char segmentMapTemporalUpdate : 1; + unsigned char segmentFeatureMode : 1; + unsigned char reserved4Bits : 4; + + unsigned char segmentFeatureEnable[8][4]; + short segmentFeatureData[8][4]; + unsigned char mb_segment_tree_probs[7]; + unsigned char segment_pred_probs[3]; + unsigned char reservedSegment16Bits[2]; + + int qpYAc; + int qpYDc; + int qpChDc; + int qpChAc; + + unsigned int activeRefIdx[3]; + unsigned int resetFrameContext; + unsigned int mcomp_filter_type; + unsigned int mbRefLfDelta[4]; + unsigned int mbModeLfDelta[2]; + unsigned int frameTagSize; + unsigned int offsetToDctParts; + unsigned int reserved128Bits[4]; + +} CUVIDVP9PICPARAMS; + +/***********************************************************/ +//! \struct CUVIDAV1PICPARAMS +//! AV1 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/***********************************************************/ +typedef struct _CUVIDAV1PICPARAMS { + unsigned int width; // coded width, if superres enabled then it is upscaled width + unsigned int height; // coded height + unsigned int frame_offset; // defined as order_hint in AV1 specification + int decodePicIdx; // decoded output pic index, if film grain enabled, it will keep decoded (without film grain) + // output It can be used as reference frame for future frames + + // sequence header + unsigned int profile : 3; // 0 = profile0, 1 = profile1, 2 = profile2 + unsigned int use_128x128_superblock : 1; // superblock size 0:64x64, 1: 128x128 + unsigned int subsampling_x : 1; // (subsampling_x, _y) 1,1 = 420, 1,0 = 422, 0,0 = 444 + unsigned int subsampling_y : 1; + unsigned int mono_chrome : 1; // for monochrome content, mono_chrome = 1 and (subsampling_x, _y) should be 1,1 + unsigned int bit_depth_minus8 : 4; // bit depth minus 8 + unsigned int enable_filter_intra : 1; // tool enable in seq level, 0 : disable 1: frame header control + unsigned int enable_intra_edge_filter : 1; // intra edge filtering process, 0 : disable 1: enabled + unsigned int enable_interintra_compound : 1; // interintra, 0 : not present 1: present + unsigned int enable_masked_compound : 1; // 1: mode info for inter blocks may contain the syntax element + // compound_type. 
0: syntax element compound_type will not be present + unsigned int enable_dual_filter : 1; // vertical and horiz filter selection, 1: enable and 0: disable + unsigned int enable_order_hint : 1; // order hint, and related tools, 1: enable and 0: disable + unsigned int order_hint_bits_minus1 : 3; // is used to compute OrderHintBits + unsigned int enable_jnt_comp : 1; // joint compound modes, 1: enable and 0: disable + unsigned int enable_superres : 1; // superres in seq level, 0 : disable 1: frame level control + unsigned int enable_cdef : 1; // cdef filtering in seq level, 0 : disable 1: frame level control + unsigned int enable_restoration : 1; // loop restoration filtering in seq level, 0 : disable 1: frame level control + unsigned int enable_fgs : 1; // defined as film_grain_params_present in AV1 specification + unsigned int reserved0_7bits : 7; // reserved bits; must be set to 0 + + // frame header + unsigned int frame_type : 2; // 0:Key frame, 1:Inter frame, 2:intra only, 3:s-frame + unsigned int show_frame : 1; // show_frame = 1 implies that frame should be immediately output once decoded + unsigned int disable_cdf_update : 1; // CDF update during symbol decoding, 1: disabled, 0: enabled + unsigned int + allow_screen_content_tools : 1; // 1: intra blocks may use palette encoding, 0: palette encoding is never used + unsigned int force_integer_mv : 1; // 1: motion vectors will always be integers, 0: can contain fractional bits + unsigned int coded_denom : 3; // coded_denom of the superres scale as specified in AV1 specification + unsigned int allow_intrabc : 1; // 1: intra block copy may be used, 0: intra block copy is not allowed + unsigned int allow_high_precision_mv : 1; // 1/8 precision mv enable + unsigned int interp_filter : 3; // interpolation filter. Refer to section 6.8.9 of the AV1 specification + // Version 1.0.0 with Errata 1 + unsigned int switchable_motion_mode : 1; // defined as is_motion_mode_switchable in AV1 specification + unsigned int use_ref_frame_mvs : 1; // 1: current frame can use the previous frame mv information, 0: will not use. 
+ unsigned int disable_frame_end_update_cdf : 1; // 1: indicates that the end of frame CDF update is disabled + unsigned int delta_q_present : 1; // quantizer index delta values are present in the block level + unsigned int delta_q_res : 2; // left shift which should be applied to decoded quantizer index delta values + unsigned int using_qmatrix : 1; // 1: quantizer matrix will be used to compute quantizers + unsigned int coded_lossless : 1; // 1: all segments use lossless coding + unsigned int use_superres : 1; // 1: superres enabled for frame + unsigned int tx_mode : 2; // 0: ONLY4x4,1:LARGEST,2:SELECT + unsigned int reference_mode : 1; // 0: SINGLE, 1: SELECT + unsigned int + allow_warped_motion : 1; // 1: allow_warped_motion may be present, 0: allow_warped_motion will not be present + unsigned int reduced_tx_set : 1; // 1: frame is restricted to subset of the full set of transform types, 0: no such + // restriction + unsigned int skip_mode : 1; // 1: most of the mode info is skipped, 0: mode info is not skipped + unsigned int reserved1_3bits : 3; // reserved bits; must be set to 0 + + // tiling info + unsigned int num_tile_cols : 8; // number of tiles across the frame., max is 64 + unsigned int num_tile_rows : 8; // number of tiles down the frame., max is 64 + unsigned int context_update_tile_id : 16; // specifies which tile to use for the CDF update + unsigned short tile_widths[64]; // Width of each column in superblocks + unsigned short tile_heights[64]; // height of each row in superblocks + + // CDEF - refer to section 6.10.14 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char cdef_damping_minus_3 : 2; // controls the amount of damping in the deringing filter + unsigned char cdef_bits : 2; // the number of bits needed to specify which CDEF filter to apply + unsigned char reserved2_4bits : 4; // reserved bits; must be set to 0 + unsigned char cdef_y_strength[8]; // 0-3 bits: y_pri_strength, 4-7 bits y_sec_strength + unsigned char cdef_uv_strength[8]; // 0-3 bits: uv_pri_strength, 4-7 bits uv_sec_strength + + // SkipModeFrames + unsigned char + SkipModeFrame0 : 4; // specifies the frames to use for compound prediction when skip_mode is equal to 1. + unsigned char SkipModeFrame1 : 4; + + // qp information - refer to section 6.8.11 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char base_qindex; // indicates the base frame qindex. Defined as base_q_idx in AV1 specification + char qp_y_dc_delta_q; // indicates the Y DC quantizer relative to base_q_idx. Defined as DeltaQYDc in AV1 + // specification + char qp_u_dc_delta_q; // indicates the U DC quantizer relative to base_q_idx. Defined as DeltaQUDc in AV1 + // specification + char qp_v_dc_delta_q; // indicates the V DC quantizer relative to base_q_idx. Defined as DeltaQVDc in AV1 + // specification + char qp_u_ac_delta_q; // indicates the U AC quantizer relative to base_q_idx. Defined as DeltaQUAc in AV1 + // specification + char qp_v_ac_delta_q; // indicates the V AC quantizer relative to base_q_idx. 
Defined as DeltaQVAc in AV1 + // specification + unsigned char qm_y; // specifies the level in the quantizer matrix that should be used for luma plane decoding + unsigned char qm_u; // specifies the level in the quantizer matrix that should be used for chroma U plane decoding + unsigned char qm_v; // specifies the level in the quantizer matrix that should be used for chroma V plane decoding + + // segmentation - refer to section 6.8.13 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char segmentation_enabled : 1; // 1 indicates that this frame makes use of the segmentation tool + unsigned char segmentation_update_map : 1; // 1 indicates that the segmentation map are updated during the decoding + // of this frame + unsigned char + segmentation_update_data : 1; // 1 indicates that new parameters are about to be specified for each segment + unsigned char segmentation_temporal_update : 1; // 1 indicates that the updates to the segmentation map are coded + // relative to the existing segmentation map + unsigned char reserved3_4bits : 4; // reserved bits; must be set to 0 + short segmentation_feature_data[8][8]; // specifies the feature data for a segment feature + unsigned char + segmentation_feature_mask[8]; // indicates that the corresponding feature is unused or feature value is coded + + // loopfilter - refer to section 6.8.10 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char loop_filter_level[2]; // contains loop filter strength values + unsigned char loop_filter_level_u; // loop filter strength value of U plane + unsigned char loop_filter_level_v; // loop filter strength value of V plane + unsigned char loop_filter_sharpness; // indicates the sharpness level + char loop_filter_ref_deltas[8]; // contains the adjustment needed for the filter level based on the chosen reference + // frame + char loop_filter_mode_deltas[2]; // contains the adjustment needed for the filter level based on the chosen mode + unsigned char loop_filter_delta_enabled : 1; // indicates that the filter level depends on the mode and reference + // frame used to predict a block + unsigned char loop_filter_delta_update : 1; // indicates that additional syntax elements are present that specify + // which mode and reference frame deltas are to be updated + unsigned char delta_lf_present : 1; // specifies whether loop filter delta values are present in the block level + unsigned char delta_lf_res : 2; // specifies the left shift to apply to the decoded loop filter values + unsigned char delta_lf_multi : 1; // separate loop filter deltas for Hy,Vy,U,V edges + unsigned char reserved4_2bits : 2; // reserved bits; must be set to 0 + + // restoration - refer to section 6.10.15 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char lr_unit_size[3]; // specifies the size of loop restoration units: 0: 32, 1: 64, 2: 128, 3: 256 + unsigned char lr_type[3]; // used to compute FrameRestorationType + + // reference frames + unsigned char primary_ref_frame; // specifies which reference frame contains the CDF values and other state that + // should be loaded at the start of the frame + unsigned char ref_frame_map[8]; // frames in dpb that can be used as reference for current or future frames + + unsigned char temporal_layer_id : 4; // temporal layer id + unsigned char spatial_layer_id : 4; // spatial layer id + + unsigned char reserved5_32bits[4]; // reserved bits; must be set to 0 + + // ref frame list + struct { + unsigned int width; + unsigned int height; + unsigned char index; + 
unsigned char reserved24Bits[3]; // reserved bits; must be set to 0 + } ref_frame[7]; // frames used as reference frame for current frame. + + // global motion + struct { + unsigned char invalid : 1; + unsigned char wmtype : 2; // defined as GmType in AV1 specification + unsigned char reserved5Bits : 5; // reserved bits; must be set to 0 + char reserved24Bits[3]; // reserved bits; must be set to 0 + int wmmat[6]; // defined as gm_params[] in AV1 specification + } global_motion[7]; // global motion params for reference frames + + // film grain params - refer to section 6.8.20 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned short apply_grain : 1; + unsigned short overlap_flag : 1; + unsigned short scaling_shift_minus8 : 2; + unsigned short chroma_scaling_from_luma : 1; + unsigned short ar_coeff_lag : 2; + unsigned short ar_coeff_shift_minus6 : 2; + unsigned short grain_scale_shift : 2; + unsigned short clip_to_restricted_range : 1; + unsigned short reserved6_4bits : 4; // reserved bits; must be set to 0 + unsigned char num_y_points; + unsigned char scaling_points_y[14][2]; + unsigned char num_cb_points; + unsigned char scaling_points_cb[10][2]; + unsigned char num_cr_points; + unsigned char scaling_points_cr[10][2]; + unsigned char reserved7_8bits; // reserved bits; must be set to 0 + unsigned short random_seed; + short ar_coeffs_y[24]; + short ar_coeffs_cb[25]; + short ar_coeffs_cr[25]; + unsigned char cb_mult; + unsigned char cb_luma_mult; + short cb_offset; + unsigned char cr_mult; + unsigned char cr_luma_mult; + short cr_offset; + + int reserved[7]; // reserved bits; must be set to 0 +} CUVIDAV1PICPARAMS; + +/******************************************************************************************/ +//! \struct CUVIDPICPARAMS +//! Picture parameters for decoding +//! This structure is used in cuvidDecodePicture API +//! IN for cuvidDecodePicture +/******************************************************************************************/ +typedef struct _CUVIDPICPARAMS { + int PicWidthInMbs; /**< IN: Coded frame size in macroblocks */ + int FrameHeightInMbs; /**< IN: Coded frame height in macroblocks */ + int CurrPicIdx; /**< IN: Output index of the current picture */ + int field_pic_flag; /**< IN: 0=frame picture, 1=field picture */ + int bottom_field_flag; /**< IN: 0=top field, 1=bottom field (ignored if field_pic_flag=0) */ + int second_field; /**< IN: Second field of a complementary field pair */ + // Bitstream data + unsigned int nBitstreamDataLen; /**< IN: Number of bytes in bitstream data buffer */ + const unsigned char *pBitstreamData; /**< IN: Ptr to bitstream data for this picture (slice-layer) */ + unsigned int nNumSlices; /**< IN: Number of slices in this picture */ + const unsigned int *pSliceDataOffsets; /**< IN: nNumSlices entries, contains offset of each slice within + the bitstream data buffer */ + int ref_pic_flag; /**< IN: This picture is a reference picture */ + int intra_pic_flag; /**< IN: This picture is entirely intra coded */ + unsigned int Reserved[30]; /**< Reserved for future use */ + // IN: Codec-specific data + union { + CUVIDMPEG2PICPARAMS mpeg2; /**< Also used for MPEG-1 */ + CUVIDH264PICPARAMS h264; + CUVIDVC1PICPARAMS vc1; + CUVIDMPEG4PICPARAMS mpeg4; + CUVIDJPEGPICPARAMS jpeg; + CUVIDHEVCPICPARAMS hevc; + CUVIDVP8PICPARAMS vp8; + CUVIDVP9PICPARAMS vp9; + CUVIDAV1PICPARAMS av1; + unsigned int CodecReserved[1024]; + } CodecSpecific; +} CUVIDPICPARAMS; + +/******************************************************/ +//! 
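Illustrative usage sketch (not part of the SDK API): when decoding is driven
+//! through the built-in parser (see nvcuvid.h), the PFNVIDDECODECALLBACK receives
+//! a fully populated CUVIDPICPARAMS that can be forwarded to the decoder unchanged:
+//!   int CUDAAPI HandlePictureDecode(void *pUserData, CUVIDPICPARAMS *pPicParams) {
+//!       CUvideodecoder hDecoder = (CUvideodecoder)pUserData; // hypothetical user data
+//!       return (cuvidDecodePicture(hDecoder, pPicParams) == CUDA_SUCCESS) ? 1 : 0;
+//!   }
+/******************************************************/
+//! 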
\struct CUVIDPROCPARAMS +//! Picture parameters for postprocessing +//! This structure is used in cuvidMapVideoFrame API +/******************************************************/ +typedef struct _CUVIDPROCPARAMS { + int progressive_frame; /**< IN: Input is progressive (deinterlace_mode will be ignored) */ + int second_field; /**< IN: Output the second field (ignored if deinterlace mode is Weave) */ + int top_field_first; /**< IN: Input frame is top field first (1st field is top, 2nd field is bottom) */ + int unpaired_field; /**< IN: Input only contains one field (2nd field is invalid) */ + // The fields below are used for raw YUV input + unsigned int reserved_flags; /**< Reserved for future use (set to zero) */ + unsigned int reserved_zero; /**< Reserved (set to zero) */ + unsigned long long raw_input_dptr; /**< IN: Input CUdeviceptr for raw YUV extensions */ + unsigned int raw_input_pitch; /**< IN: pitch in bytes of raw YUV input (should be aligned appropriately) */ + unsigned int raw_input_format; /**< IN: Input YUV format (cudaVideoCodec_enum) */ + unsigned long long raw_output_dptr; /**< IN: Output CUdeviceptr for raw YUV extensions */ + unsigned int raw_output_pitch; /**< IN: pitch in bytes of raw YUV output (should be aligned appropriately) */ + unsigned int Reserved1; /**< Reserved for future use (set to zero) */ + CUstream output_stream; /**< IN: stream object used by cuvidMapVideoFrame */ + unsigned int Reserved[46]; /**< Reserved for future use (set to zero) */ + unsigned long long *histogram_dptr; /**< OUT: Output CUdeviceptr for histogram extensions */ + void *Reserved2[1]; /**< Reserved for future use (set to zero) */ +} CUVIDPROCPARAMS; + +/*********************************************************************************************************/ +//! \struct CUVIDGETDECODESTATUS +//! Struct for reporting decode status. +//! This structure is used in cuvidGetDecodeStatus API. +/*********************************************************************************************************/ +typedef struct _CUVIDGETDECODESTATUS { + cuvidDecodeStatus decodeStatus; + unsigned int reserved[31]; + void *pReserved[8]; +} CUVIDGETDECODESTATUS; + +/****************************************************/ +//! \struct CUVIDRECONFIGUREDECODERINFO +//! Struct for decoder reset +//! This structure is used in cuvidReconfigureDecoder() API +/****************************************************/ +typedef struct _CUVIDRECONFIGUREDECODERINFO { + unsigned int + ulWidth; /**< IN: Coded sequence width in pixels, MUST be < = ulMaxWidth defined at CUVIDDECODECREATEINFO */ + unsigned int + ulHeight; /**< IN: Coded sequence height in pixels, MUST be < = ulMaxHeight defined at CUVIDDECODECREATEINFO */ + unsigned int ulTargetWidth; /**< IN: Post processed output width */ + unsigned int ulTargetHeight; /**< IN: Post Processed output height */ + unsigned int ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode surfaces */ + unsigned int reserved1[12]; /**< Reserved for future use. Set to Zero */ + /** + * IN: Area of frame to be displayed. Use-case : Source Cropping + */ + struct { + short left; + short top; + short right; + short bottom; + } display_area; + /** + * IN: Target Rectangle in the OutputFrame. Use-case : Aspect ratio Conversion + */ + struct { + short left; + short top; + short right; + short bottom; + } target_rect; + unsigned int reserved2[11]; /**< Reserved for future use. 
Set to Zero */ +} CUVIDRECONFIGUREDECODERINFO; + +/***********************************************************************************************************/ +//! VIDEO_DECODER +//! +//! In order to minimize decode latencies, there should be always at least 2 pictures in the decode +//! queue at any time, in order to make sure that all decode engines are always busy. +//! +//! Overall data flow: +//! - cuvidGetDecoderCaps(...) +//! - cuvidCreateDecoder(...) +//! - For each picture: +//! + cuvidDecodePicture(N) +//! + cuvidMapVideoFrame(N-4) +//! + do some processing in cuda +//! + cuvidUnmapVideoFrame(N-4) +//! + cuvidDecodePicture(N+1) +//! + cuvidMapVideoFrame(N-3) +//! + ... +//! - cuvidDestroyDecoder(...) +//! +//! NOTE: +//! - When the cuda context is created from a D3D device, the D3D device must also be created +//! with the D3DCREATE_MULTITHREADED flag. +//! - There is a limit to how many pictures can be mapped simultaneously (ulNumOutputSurfaces) +//! - cuvidDecodePicture may block the calling thread if there are too many pictures pending +//! in the decode queue +/***********************************************************************************************************/ + +/**********************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc) +//! Queries decode capabilities of NVDEC-HW based on CodecType, ChromaFormat and BitDepthMinus8 parameters. +//! 1. Application fills IN parameters CodecType, ChromaFormat and BitDepthMinus8 of CUVIDDECODECAPS structure +//! 2. On calling cuvidGetDecoderCaps, driver fills OUT parameters if the IN parameters are supported +//! If IN parameters passed to the driver are not supported by NVDEC-HW, then all OUT params are set to 0. +//! E.g. on Geforce GTX 960: +//! App fills - eCodecType = cudaVideoCodec_H264; eChromaFormat = cudaVideoChromaFormat_420; nBitDepthMinus8 = 0; +//! Given IN parameters are supported, hence driver fills: bIsSupported = 1; nMinWidth = 48; nMinHeight = 16; +//! nMaxWidth = 4096; nMaxHeight = 4096; nMaxMBCount = 65536; +//! CodedWidth*CodedHeight/256 must be less than or equal to nMaxMBCount +/**********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc); + +/*****************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci) +//! Create the decoder object based on pdci. A handle to the created decoder is returned +/*****************************************************************************************************/ +extern CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci); + +/*****************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder) +//! Destroy the decoder object +/*****************************************************************************************************/ +extern CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder); + +/*****************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams) +//! Decode a single picture (field or frame) +//! 
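An illustrative pipelining loop based on the data-flow note above (hDecoder,
+//! picParams, vpp and nFrames are assumed to be prepared by the caller):
+//!   for (int N = 0; N < nFrames; N++) {
+//!       cuvidDecodePicture(hDecoder, &picParams[N]);          // queue picture N
+//!       if (N >= 4) {                                         // keep pictures queued ahead
+//!           unsigned long long dpFrame; unsigned int nPitch;
+//!           cuvidMapVideoFrame64(hDecoder, N - 4, &dpFrame, &nPitch, &vpp);
+//!           /* CUDA processing on dpFrame */
+//!           cuvidUnmapVideoFrame64(hDecoder, dpFrame);
+//!       }
+//!   }
+//! 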
Kicks off HW decoding +/*****************************************************************************************************/ +extern CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams); + +/************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidGetDecodeStatus(CUvideodecoder hDecoder, int nPicIdx); +//! Get the decode status for frame corresponding to nPicIdx +//! API is supported for Maxwell and above generation GPUs. +//! API is currently supported for HEVC, H264 and JPEG codecs. +//! API returns CUDA_ERROR_NOT_SUPPORTED error code for unsupported GPU or codec. +/************************************************************************************************************/ +extern CUresult CUDAAPI cuvidGetDecodeStatus(CUvideodecoder hDecoder, int nPicIdx, CUVIDGETDECODESTATUS *pDecodeStatus); + +/*********************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidReconfigureDecoder(CUvideodecoder hDecoder, CUVIDRECONFIGUREDECODERINFO +//! *pDecReconfigParams) Used to reuse single decoder for multiple clips. Currently supports resolution change, resize +//! params, display area params, target area params change for same codec. Must be called during +//! CUVIDPARSERPARAMS::pfnSequenceCallback +/*********************************************************************************************************/ +extern CUresult CUDAAPI cuvidReconfigureDecoder(CUvideodecoder hDecoder, + CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams); + +#if !defined(__CUVID_DEVPTR64) || defined(__CUVID_INTERNAL) +/************************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr, +//! unsigned int *pPitch, CUVIDPROCPARAMS *pVPP); +//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and +//! associated pitch of the video frame +/************************************************************************************************************************/ +extern CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr, + unsigned int *pPitch, CUVIDPROCPARAMS *pVPP); + +/*****************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr) +//! Unmap a previously mapped video frame +/*****************************************************************************************************/ +extern CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr); +#endif + +/****************************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr, +//! unsigned int * pPitch, CUVIDPROCPARAMS *pVPP); +//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and +//! 
associated pitch of the video frame +/****************************************************************************************************************************/ +extern CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr, + unsigned int *pPitch, CUVIDPROCPARAMS *pVPP); + +/**************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr); +//! Unmap a previously mapped video frame +/**************************************************************************************************/ +extern CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr); + +#if defined(__CUVID_DEVPTR64) && !defined(__CUVID_INTERNAL) +#define cuvidMapVideoFrame cuvidMapVideoFrame64 +#define cuvidUnmapVideoFrame cuvidUnmapVideoFrame64 +#endif + +/********************************************************************************************************************/ +//! +//! Context-locking: to facilitate multi-threaded implementations, the following 4 functions +//! provide a simple mutex-style host synchronization. If a non-NULL context is specified +//! in CUVIDDECODECREATEINFO, the codec library will acquire the mutex associated with the given +//! context before making any cuda calls. +//! A multi-threaded application could create a lock associated with a context handle so that +//! multiple threads can safely share the same cuda context: +//! - use cuCtxPopCurrent immediately after context creation in order to create a 'floating' context +//! that can be passed to cuvidCtxLockCreate. +//! - When using a floating context, all cuda calls should only be made within a cuvidCtxLock/cuvidCtxUnlock section. +//! +//! NOTE: This is a safer alternative to cuCtxPushCurrent and cuCtxPopCurrent, and is not related to video +//! decoder in any way (implemented as a critical section associated with cuCtx{Push|Pop}Current calls). +/********************************************************************************************************************/ + +/********************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx) +//! This API is used to create CtxLock object +/********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx); + +/********************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck) +//! This API is used to free CtxLock object +/********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck); + +/********************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags) +//! 
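Typical usage, following the context-locking note above (illustrative sketch;
+//! error checking omitted):
+//!   CUdevice dev;        cuDeviceGet(&dev, 0);
+//!   CUcontext ctx;       cuCtxCreate(&ctx, 0, dev);
+//!   cuCtxPopCurrent(NULL);                      // make the context 'floating'
+//!   CUvideoctxlock lock; cuvidCtxLockCreate(&lock, ctx);
+//!   /* then, in each worker thread: */
+//!   cuvidCtxLock(lock, 0);
+//!   /* CUDA and NVDEC calls while holding the lock */
+//!   cuvidCtxUnlock(lock, 0);
+//! 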
This API is used to acquire ctxlock +/********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags); + +/********************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags) +//! This API is used to release ctxlock +/********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags); + +/**********************************************************************************************/ + +#if defined(__cplusplus) +} +// Auto-lock helper for C++ applications +class CCtxAutoLock { + private: + CUvideoctxlock m_ctx; + + public: + CCtxAutoLock(CUvideoctxlock ctx) : m_ctx(ctx) { cuvidCtxLock(m_ctx, 0); } + ~CCtxAutoLock() { cuvidCtxUnlock(m_ctx, 0); } +}; +#endif /* __cplusplus */ + +#endif // __CUDA_VIDEO_H__ diff --git a/third_party/Video_Codec_SDK/Interface/nvcuvid.h b/third_party/Video_Codec_SDK/Interface/nvcuvid.h new file mode 100644 index 00000000..d4691672 --- /dev/null +++ b/third_party/Video_Codec_SDK/Interface/nvcuvid.h @@ -0,0 +1,486 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/********************************************************************************************************************/ +//! \file nvcuvid.h +//! NVDECODE API provides video decoding interface to NVIDIA GPU devices. +//! \date 2015-2022 +//! This file contains the interface constants, structure definitions and function prototypes. +/********************************************************************************************************************/ + +#if !defined(__NVCUVID_H__) +#define __NVCUVID_H__ + +#include "cuviddec.h" + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +#define MAX_CLOCK_TS 3 + +/***********************************************/ +//! +//! High-level helper APIs for video sources +//! 
+/***********************************************/ + +typedef void *CUvideosource; +typedef void *CUvideoparser; +typedef long long CUvideotimestamp; + +/************************************************************************/ +//! \enum cudaVideoState +//! Video source state enums +//! Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs +/************************************************************************/ +typedef enum { + cudaVideoState_Error = -1, /**< Error state (invalid source) */ + cudaVideoState_Stopped = 0, /**< Source is stopped (or reached end-of-stream) */ + cudaVideoState_Started = 1 /**< Source is running and delivering data */ +} cudaVideoState; + +/************************************************************************/ +//! \enum cudaAudioCodec +//! Audio compression enums +//! Used in CUAUDIOFORMAT structure +/************************************************************************/ +typedef enum { + cudaAudioCodec_MPEG1 = 0, /**< MPEG-1 Audio */ + cudaAudioCodec_MPEG2, /**< MPEG-2 Audio */ + cudaAudioCodec_MP3, /**< MPEG-1 Layer III Audio */ + cudaAudioCodec_AC3, /**< Dolby Digital (AC3) Audio */ + cudaAudioCodec_LPCM, /**< PCM Audio */ + cudaAudioCodec_AAC, /**< AAC Audio */ +} cudaAudioCodec; + +/************************************************************************/ +//! \ingroup STRUCTS +//! \struct HEVCTIMECODESET +//! Used to store Time code extracted from Time code SEI in HEVC codec +/************************************************************************/ +typedef struct _HEVCTIMECODESET { + unsigned int time_offset_value; + unsigned short n_frames; + unsigned char clock_timestamp_flag; + unsigned char units_field_based_flag; + unsigned char counting_type; + unsigned char full_timestamp_flag; + unsigned char discontinuity_flag; + unsigned char cnt_dropped_flag; + unsigned char seconds_value; + unsigned char minutes_value; + unsigned char hours_value; + unsigned char seconds_flag; + unsigned char minutes_flag; + unsigned char hours_flag; + unsigned char time_offset_length; + unsigned char reserved; +} HEVCTIMECODESET; + +/************************************************************************/ +//! \ingroup STRUCTS +//! \struct HEVCSEITIMECODE +//! Used to extract Time code SEI in HEVC codec +/************************************************************************/ +typedef struct _HEVCSEITIMECODE { + HEVCTIMECODESET time_code_set[MAX_CLOCK_TS]; + unsigned char num_clock_ts; +} HEVCSEITIMECODE; + +/**********************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUSEIMESSAGE; +//! Used in CUVIDSEIMESSAGEINFO structure +/**********************************************************************************/ +typedef struct _CUSEIMESSAGE { + unsigned char sei_message_type; /**< OUT: SEI Message Type */ + unsigned char reserved[3]; + unsigned int sei_message_size; /**< OUT: SEI Message Size */ +} CUSEIMESSAGE; + +/************************************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDEOFORMAT +//! Video format +//! 
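A sequence callback typically sizes the decoder from CUVIDEOFORMAT (illustrative
+//! sketch; pFmt is the CUVIDEOFORMAT* delivered to PFNVIDSEQUENCECALLBACK and
+//! hDecoder is assumed to be declared by the caller):
+//!   CUVIDDECODECREATEINFO ci = {};
+//!   ci.CodecType           = pFmt->codec;
+//!   ci.ChromaFormat        = pFmt->chroma_format;
+//!   ci.bitDepthMinus8      = pFmt->bit_depth_luma_minus8;
+//!   ci.ulWidth             = pFmt->coded_width;
+//!   ci.ulHeight            = pFmt->coded_height;
+//!   ci.ulNumDecodeSurfaces = pFmt->min_num_decode_surfaces;
+//!   /* OutputFormat, ulTargetWidth/Height etc. omitted for brevity */
+//!   cuvidCreateDecoder(&hDecoder, &ci);
+//! 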
Used in cuvidGetSourceVideoFormat API +/************************************************************************************************/ +typedef struct { + cudaVideoCodec codec; /**< OUT: Compression format */ + /** + * OUT: frame rate = numerator / denominator (for example: 30000/1001) + */ + struct { + /**< OUT: frame rate numerator (0 = unspecified or variable frame rate) */ + unsigned int numerator; + /**< OUT: frame rate denominator (0 = unspecified or variable frame rate) */ + unsigned int denominator; + } frame_rate; + unsigned char progressive_sequence; /**< OUT: 0=interlaced, 1=progressive */ + unsigned char bit_depth_luma_minus8; /**< OUT: high bit depth luma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */ + unsigned char bit_depth_chroma_minus8; /**< OUT: high bit depth chroma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */ + unsigned char min_num_decode_surfaces; /**< OUT: Minimum number of decode surfaces to be allocated for correct + decoding. The client can send this value in ulNumDecodeSurfaces + (in CUVIDDECODECREATEINFO structure). + This guarantees correct functionality and optimal video memory + usage but not necessarily the best performance, which depends on + the design of the overall application. The optimal number of + decode surfaces (in terms of performance and memory utilization) + should be decided by experimentation for each application, but it + cannot go below min_num_decode_surfaces. + If this value is used for ulNumDecodeSurfaces then it must be + returned to parser during sequence callback. */ + unsigned int coded_width; /**< OUT: coded frame width in pixels */ + unsigned int coded_height; /**< OUT: coded frame height in pixels */ + /** + * area of the frame that should be displayed + * typical example: + * coded_width = 1920, coded_height = 1088 + * display_area = { 0,0,1920,1080 } + */ + struct { + int left; /**< OUT: left position of display rect */ + int top; /**< OUT: top position of display rect */ + int right; /**< OUT: right position of display rect */ + int bottom; /**< OUT: bottom position of display rect */ + } display_area; + cudaVideoChromaFormat chroma_format; /**< OUT: Chroma format */ + unsigned int bitrate; /**< OUT: video bitrate (bps, 0=unknown) */ + /** + * OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc) + */ + struct { + int x; + int y; + } display_aspect_ratio; + /** + * Video Signal Description + * Refer section E.2.1 (VUI parameters semantics) of H264 spec file + */ + struct { + unsigned char video_format : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC, 3-SECAM, 4-MAC, 5-Unspecified */ + unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level and luma and chroma range */ + unsigned char reserved_zero_bits : 4; /**< Reserved bits */ + unsigned char color_primaries; /**< OUT: chromaticity coordinates of source primaries */ + unsigned char + transfer_characteristics; /**< OUT: opto-electronic transfer characteristic of the source picture */ + unsigned char matrix_coefficients; /**< OUT: used in deriving luma and chroma signals from RGB primaries */ + } video_signal_description; + unsigned int seqhdr_data_length; /**< OUT: Additional bytes following (CUVIDEOFORMATEX) */ +} CUVIDEOFORMAT; + +/****************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDOPERATINGPOINTINFO +//! 
Operating point information of scalable bitstream +/****************************************************************/ +typedef struct { + cudaVideoCodec codec; + union { + struct { + unsigned char operating_points_cnt; + unsigned char reserved24_bits[3]; + unsigned short operating_points_idc[32]; + } av1; + unsigned char CodecReserved[1024]; + }; +} CUVIDOPERATINGPOINTINFO; + +/**********************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDSEIMESSAGEINFO +//! Used in cuvidParseVideoData API with PFNVIDSEIMSGCALLBACK pfnGetSEIMsg +/**********************************************************************************/ +typedef struct _CUVIDSEIMESSAGEINFO { + void *pSEIData; /**< OUT: SEI Message Data */ + CUSEIMESSAGE *pSEIMessage; /**< OUT: SEI Message Info */ + unsigned int sei_message_count; /**< OUT: SEI Message Count */ + unsigned int picIdx; /**< OUT: SEI Message Pic Index */ +} CUVIDSEIMESSAGEINFO; + +/****************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDAV1SEQHDR +//! AV1 specific sequence header information +/****************************************************************/ +typedef struct { + unsigned int max_width; + unsigned int max_height; + unsigned char reserved[1016]; +} CUVIDAV1SEQHDR; + +/****************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDEOFORMATEX +//! Video format including raw sequence header information +//! Used in cuvidGetSourceVideoFormat API +/****************************************************************/ +typedef struct { + CUVIDEOFORMAT format; /**< OUT: CUVIDEOFORMAT structure */ + union { + CUVIDAV1SEQHDR av1; + unsigned char raw_seqhdr_data[1024]; /**< OUT: Sequence header data */ + }; +} CUVIDEOFORMATEX; + +/****************************************************************/ +//! \ingroup STRUCTS +//! \struct CUAUDIOFORMAT +//! Audio formats +//! Used in cuvidGetSourceAudioFormat API +/****************************************************************/ +typedef struct { + cudaAudioCodec codec; /**< OUT: Compression format */ + unsigned int channels; /**< OUT: number of audio channels */ + unsigned int samplespersec; /**< OUT: sampling frequency */ + unsigned int bitrate; /**< OUT: For uncompressed, can also be used to determine bits per sample */ + unsigned int reserved1; /**< Reserved for future use */ + unsigned int reserved2; /**< Reserved for future use */ +} CUAUDIOFORMAT; + +/***************************************************************/ +//! \enum CUvideopacketflags +//! Data packet flags +//! Used in CUVIDSOURCEDATAPACKET structure +/***************************************************************/ +typedef enum { + CUVID_PKT_ENDOFSTREAM = 0x01, /**< Set when this is the last packet for this stream */ + CUVID_PKT_TIMESTAMP = 0x02, /**< Timestamp is valid */ + CUVID_PKT_DISCONTINUITY = 0x04, /**< Set when a discontinuity has to be signalled */ + CUVID_PKT_ENDOFPICTURE = 0x08, /**< Set when the packet contains exactly one frame or one field */ + CUVID_PKT_NOTIFY_EOS = 0x10, /**< If this flag is set along with CUVID_PKT_ENDOFSTREAM, an additional (dummy) + display callback will be invoked with null value of CUVIDPARSERDISPINFO which + should be interpreted as end of the stream. */ +} CUvideopacketflags; + +/*****************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDSOURCEDATAPACKET +//! Data Packet +//! 
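A demuxed packet is typically filled and handed to the parser as follows
+//! (illustrative sketch; pParser and the demuxer outputs are assumed):
+//!   CUVIDSOURCEDATAPACKET pkt = {};
+//!   pkt.payload      = pVideo;                // compressed bitstream data
+//!   pkt.payload_size = nVideoBytes;
+//!   pkt.flags        = CUVID_PKT_TIMESTAMP;
+//!   pkt.timestamp    = pts;                   // 10MHz units by default
+//!   if (nVideoBytes == 0) pkt.flags |= CUVID_PKT_ENDOFSTREAM;
+//!   cuvidParseVideoData(pParser, &pkt);
+//! 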
Used in cuvidParseVideoData API +//! IN for cuvidParseVideoData +/*****************************************************************************/ +typedef struct _CUVIDSOURCEDATAPACKET { + unsigned long flags; /**< IN: Combination of CUVID_PKT_XXX flags */ + unsigned long payload_size; /**< IN: number of bytes in the payload (may be zero if EOS flag is set) */ + const unsigned char *payload; /**< IN: Pointer to packet payload data (may be NULL if EOS flag is set) */ + CUvideotimestamp timestamp; /**< IN: Presentation time stamp (10MHz clock), only valid if + CUVID_PKT_TIMESTAMP flag is set */ +} CUVIDSOURCEDATAPACKET; + +// Callback for packet delivery +typedef int(CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *); + +/**************************************************************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDSOURCEPARAMS +//! Describes parameters needed in cuvidCreateVideoSource API +//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all +//! supported containers. It's recommended to clients to use their own or third party demuxer if audio support is +//! needed. +/**************************************************************************************************************************/ +typedef struct _CUVIDSOURCEPARAMS { + unsigned int ulClockRate; /**< IN: Time stamp units in Hz (0=default=10000000Hz) */ + unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */ + unsigned int uReserved : 31; /**< Reserved for future use - set to zero */ + unsigned int uReserved1[6]; /**< Reserved for future use - set to zero */ + void *pUserData; /**< IN: User private data passed in to the data handlers */ + PFNVIDSOURCECALLBACK pfnVideoDataHandler; /**< IN: Called to deliver video packets */ + PFNVIDSOURCECALLBACK pfnAudioDataHandler; /**< IN: Called to deliver audio packets. */ + void *pvReserved2[8]; /**< Reserved for future use - set to NULL */ +} CUVIDSOURCEPARAMS; + +/**********************************************/ +//! \ingroup ENUMS +//! \enum CUvideosourceformat_flags +//! CUvideosourceformat_flags +//! Used in cuvidGetSourceVideoFormat API +/**********************************************/ +typedef enum { + CUVID_FMT_EXTFORMATINFO = 0x100 /**< Return extended format structure (CUVIDEOFORMATEX) */ +} CUvideosourceformat_flags; + +#if !defined(__APPLE__) +/***************************************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS +//! *pParams) Create CUvideosource object. CUvideosource spawns demultiplexer thread that provides two callbacks: +//! pfnVideoDataHandler() and pfnAudioDataHandler() +//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all +//! supported containers. It's recommended to clients to use their own or third party demuxer if audio support is +//! needed. +/***************************************************************************************************************************/ +CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams); + +/***************************************************************************************************************************/ +//! \ingroup FUNCTS +//! 
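A file-based source is created and started as follows (illustrative sketch;
+//! HandleVideoData is a user-defined PFNVIDSOURCECALLBACK and pCtx is a
+//! hypothetical user context):
+//!   CUVIDSOURCEPARAMS params = {};
+//!   params.pUserData           = pCtx;
+//!   params.pfnVideoDataHandler = HandleVideoData;
+//!   CUvideosource hSource;
+//!   cuvidCreateVideoSource(&hSource, "input.mp4", &params);
+//!   cuvidSetVideoSourceState(hSource, cudaVideoState_Started);
+//! 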
\fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS +//! *pParams) Create video source +/***************************************************************************************************************************/ +CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams); + +/********************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj) +//! Destroy video source +/********************************************************************/ +CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj); + +/******************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state) +//! Set video source state to: +//! cudaVideoState_Started - to signal the source to run and deliver data +//! cudaVideoState_Stopped - to stop the source from delivering the data +//! cudaVideoState_Error - invalid source +/******************************************************************************************/ +CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state); + +/******************************************************************************************/ +//! \ingroup FUNCTS +//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj) +//! Get video source state +//! Returns: +//! cudaVideoState_Started - if Source is running and delivering data +//! cudaVideoState_Stopped - if Source is stopped or reached end-of-stream +//! cudaVideoState_Error - if Source is in error state +/******************************************************************************************/ +cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj); + +/******************************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags) +//! Gets video source format in pvidfmt, flags is set to combination of CUvideosourceformat_flags as per requirement +/******************************************************************************************************************/ +CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags); + +/**************************************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags) +//! Get audio source format +//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all +//! supported containers. It's recommended to clients to use their own or third party demuxer if audio support is +//! needed. +/**************************************************************************************************************************/ +CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags); + +#endif +/**********************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDPARSERDISPINFO +//! 
Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture +/**********************************************************************************/ +typedef struct _CUVIDPARSERDISPINFO { + int picture_index; /**< OUT: Index of the current picture */ + int progressive_frame; /**< OUT: 1 if progressive frame; 0 otherwise */ + int top_field_first; /**< OUT: 1 if top field is displayed first; 0 otherwise */ + int repeat_first_field; /**< OUT: Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling, + -1=unpaired field) */ + CUvideotimestamp timestamp; /**< OUT: Presentation time stamp */ +} CUVIDPARSERDISPINFO; + +/***********************************************************************************************************************/ +//! Parser callbacks +//! The parser will call these synchronously from within cuvidParseVideoData(), whenever there is sequence change or a +//! picture is ready to be decoded and/or displayed. First argument in functions is "void *pUserData" member of +//! structure CUVIDSOURCEPARAMS Return values from these callbacks are interpreted as below. If the callbacks return +//! failure, it will be propagated by cuvidParseVideoData() to the application. Parser picks default operating point as +//! 0 and outputAllLayers flag as 0 if PFNVIDOPPOINTCALLBACK is not set or return value is -1 or invalid operating +//! point. PFNVIDSEQUENCECALLBACK : 0: fail, 1: succeeded, > 1: override dpb size of parser (set by +//! CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces while creating parser) PFNVIDDECODECALLBACK : 0: fail, >=1: succeeded +//! PFNVIDDISPLAYCALLBACK : 0: fail, >=1: succeeded +//! PFNVIDOPPOINTCALLBACK : <0: fail, >=0: succeeded (bit 0-9: OperatingPoint, bit 10-10: outputAllLayers, bit 11-30: +//! reserved) PFNVIDSEIMSGCALLBACK : 0: fail, >=1: succeeded +/***********************************************************************************************************************/ +typedef int(CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *); +typedef int(CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *); +typedef int(CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *); +typedef int(CUDAAPI *PFNVIDOPPOINTCALLBACK)(void *, CUVIDOPERATINGPOINTINFO *); +typedef int(CUDAAPI *PFNVIDSEIMSGCALLBACK)(void *, CUVIDSEIMESSAGEINFO *); + +/**************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDPARSERPARAMS +//! 
Used in cuvidCreateVideoParser API +/**************************************/ +typedef struct _CUVIDPARSERPARAMS { + cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */ + unsigned int ulMaxNumDecodeSurfaces; /**< IN: Max # of decode surfaces (parser will cycle through these) */ + unsigned int ulClockRate; /**< IN: Timestamp units in Hz (0=default=10000000Hz) */ + unsigned int ulErrorThreshold; /**< IN: % Error threshold (0-100) for calling pfnDecodePicture (100=always + IN: call pfnDecodePicture even if picture bitstream is fully corrupted) */ + unsigned int ulMaxDisplayDelay; /**< IN: Max display queue delay (improves pipelining of decode with display) + 0=no delay (recommended values: 2..4) */ + unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */ + unsigned int uReserved : 31; /**< Reserved for future use - set to zero */ + unsigned int uReserved1[4]; /**< IN: Reserved for future use - set to 0 */ + void *pUserData; /**< IN: User data for callbacks */ + PFNVIDSEQUENCECALLBACK + pfnSequenceCallback; /**< IN: Called before decoding frames and/or whenever there is a fmt change */ + PFNVIDDECODECALLBACK pfnDecodePicture; /**< IN: Called when a picture is ready to be decoded (decode order) */ + PFNVIDDISPLAYCALLBACK + pfnDisplayPicture; /**< IN: Called whenever a picture is ready to be displayed (display order) */ + PFNVIDOPPOINTCALLBACK pfnGetOperatingPoint; /**< IN: Called from AV1 sequence header to get operating point of a AV1 + scalable bitstream */ + PFNVIDSEIMSGCALLBACK pfnGetSEIMsg; /**< IN: Called when all SEI messages are parsed for particular frame */ + void *pvReserved2[5]; /**< Reserved for future use - set to NULL */ + CUVIDEOFORMATEX *pExtVideoInfo; /**< IN: [Optional] sequence header data from system layer */ +} CUVIDPARSERPARAMS; + +/************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams) +//! Create video parser object and initialize +/************************************************************************************************/ +CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams); + +/************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket) +//! Parse the video data from source data packet in pPacket +//! Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket and +//! calls back pfnDecodePicture with CUVIDPICPARAMS data for kicking of HW decoding +//! calls back pfnSequenceCallback with CUVIDEOFORMAT data for initial sequence header or when +//! the decoder encounters a video format change +//! calls back pfnDisplayPicture with CUVIDPARSERDISPINFO data to display a video frame +/************************************************************************************************/ +CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket); + +/************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj) +//! 
Destroy the video parser +/************************************************************************************************/ +CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj); + +/**********************************************************************************************/ + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#endif // __NVCUVID_H__ diff --git a/third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so b/third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so new file mode 100644 index 0000000000000000000000000000000000000000..f08a209545e076a835d11dcc24bd20d22088b1c5 GIT binary patch literal 3528 zcmd6qPiWLf6vt;1HBqCksam5^qWA|QEDE9$7Rf&~2kXBy7D8bA%kD49*6eQh{boZF zQhO;S^&sNGQ+g}n(MzF+P(t+(4}u2|J?z!uNvMbi8%@9OH}7pWlk5~hN*}zL+0XaQ z%$u3tZ+G^)a~IFo#9~UUR-IM!>CFa1+F4lG%nelS>Wo^4vR>6ivFn$mRjq|h21%_{ zO9W^RnnL)(Cc~wkhekFZl~lAuts0ac)2}}2rPS3II>Sq1;;y9Mc7*MRH^M)M$KiY7 zd*Ec-;2WcsDfXeFKD*$%;k5i__*S^fXkF{zk|XV@IdbVVDZF z7ige-MvZ0%^W)Lek_Zcl6L`-0dVYNIlPhJM7M&q8l-=yT9tzpuW3CFp-R^hM~uIrL_H2mkEQlhAKD z^a1E&4t*N>b%#C+J?YTrpdWPTCFu1IeG&Sr#p?UljEl|lD!SaSS}_J0-KEG{u%~2l zNBxxrd2e2_&ev4c#{K19MR&OW#1i2MeoM4Bs1hSyIHD4n$$U7OGqny>Li-b1CH#@p zQ1l&4x|hqQKT&eDUfLsGetF+y|LFcjCc4~BWuNKJOJj&G`zrM_7U*tDP3bTDOZQaK z&{T3xQp>rJM#jfxKA=3nOwLvA5mMhPN1BG@E%+$+m;CSWeM9Oqlw5^L{(|Uth31wp i`5zq3C&*|$qD6p<8GaDkDk0kT`u_pVP7(wF literal 0 HcmV?d00001 diff --git a/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp new file mode 100644 index 00000000..0fd61f44 --- /dev/null +++ b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp @@ -0,0 +1,709 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+
+#include "../../../Interface/nvcuvid.h"
+#include "NvDecoder/NvDecoder.h"
+
+std::map<int, int64_t> NvDecoder::sessionOverHead = {{0, 0}, {1, 0}};
+
+/**
+ * @brief This function is used to get codec string from codec id
+ */
+const char *NvDecoder::GetCodecString(cudaVideoCodec eCodec) { return GetVideoCodecString(eCodec); }
+
+/* Called when the parser encounters sequence header for AV1 SVC content
+ * return value interpretation:
+ * < 0 : fail, >=0: succeeded (bit 0-9: currOperatingPoint, bit 10-10: bDispAllLayer, bit 11-30: reserved, must be
+ * set 0)
+ */
+int NvDecoder::GetOperatingPoint(CUVIDOPERATINGPOINTINFO *pOPInfo) {
+    if (pOPInfo->codec == cudaVideoCodec_AV1) {
+        if (pOPInfo->av1.operating_points_cnt > 1) {
+            // clip has SVC enabled
+            if (m_nOperatingPoint >= pOPInfo->av1.operating_points_cnt)
+                m_nOperatingPoint = 0;
+
+            printf("AV1 SVC clip: operating point count %d ", pOPInfo->av1.operating_points_cnt);
+            printf("Selected operating point: %d, IDC 0x%x bOutputAllLayers %d\n", m_nOperatingPoint,
+                   pOPInfo->av1.operating_points_idc[m_nOperatingPoint], m_bDispAllLayers);
+            return (m_nOperatingPoint | (m_bDispAllLayers << 10));
+        }
+    }
+    return -1;
+}
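+
+/* Worked example (illustrative comment, not part of the original SDK sample):
+ * selecting operating point 2 with all layers output makes the callback above
+ * return (2 | (1 << 10)) = 0x402 -- bits 0-9 carry the operating point and
+ * bit 10 the output-all-layers flag, per the interpretation documented above. */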
"Progressive" : "Interlaced") + << std::endl + << "\tCoded size : [" << pVideoFormat->coded_width << ", " << pVideoFormat->coded_height << "]" + << std::endl + << "\tDisplay area : [" << pVideoFormat->display_area.left << ", " << pVideoFormat->display_area.top + << ", " << pVideoFormat->display_area.right << ", " << pVideoFormat->display_area.bottom << "]" + << std::endl + << "\tChroma : " << GetVideoChromaFormatString(pVideoFormat->chroma_format) << std::endl + << "\tBit depth : " << pVideoFormat->bit_depth_luma_minus8 + 8; + m_videoInfo << std::endl; + + int nDecodeSurface = pVideoFormat->min_num_decode_surfaces; + + CUVIDDECODECAPS decodecaps; + memset(&decodecaps, 0, sizeof(decodecaps)); + + decodecaps.eCodecType = pVideoFormat->codec; + decodecaps.eChromaFormat = pVideoFormat->chroma_format; + decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; + + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + + if (!decodecaps.bIsSupported) { + NVDEC_THROW_ERROR("Codec not supported on this GPU", CUDA_ERROR_NOT_SUPPORTED); + return nDecodeSurface; + } + + if ((pVideoFormat->coded_width > decodecaps.nMaxWidth) || (pVideoFormat->coded_height > decodecaps.nMaxHeight)) { + + std::ostringstream errorString; + errorString << std::endl + << "Resolution : " << pVideoFormat->coded_width << "x" << pVideoFormat->coded_height + << std::endl + << "Max Supported (wxh) : " << decodecaps.nMaxWidth << "x" << decodecaps.nMaxHeight << std::endl + << "Resolution not supported on this GPU"; + + const std::string cErr = errorString.str(); + NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED); + return nDecodeSurface; + } + + if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) > decodecaps.nMaxMBCount) { + + std::ostringstream errorString; + errorString << std::endl + << "MBCount : " << (pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) + << std::endl + << "Max Supported mbcnt : " << decodecaps.nMaxMBCount << std::endl + << "MBCount not supported on this GPU"; + + const std::string cErr = errorString.str(); + NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED); + return nDecodeSurface; + } + + if (m_nWidth && m_nLumaHeight && m_nChromaHeight) { + + // cuvidCreateDecoder() has been called before, and now there's possible config change + return ReconfigureDecoder(pVideoFormat); + } + + // eCodec has been set in the constructor (for parser). Here it's set again for potential correction + m_eCodec = pVideoFormat->codec; + m_eChromaFormat = pVideoFormat->chroma_format; + m_nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; + m_nBPP = m_nBitDepthMinus8 > 0 ? 2 : 1; + + // Set the output surface format same as chroma format + if (m_eChromaFormat == cudaVideoChromaFormat_420 || cudaVideoChromaFormat_Monochrome) + m_eOutputFormat = + pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12; + else if (m_eChromaFormat == cudaVideoChromaFormat_444) + m_eOutputFormat = + pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444; + else if (m_eChromaFormat == cudaVideoChromaFormat_422) + m_eOutputFormat = cudaVideoSurfaceFormat_NV12; // no 4:2:2 output format supported yet so make 420 default + + // Check if output format supported. 
+
+    // Check if output format supported. If not, check fallback options
+    if (!(decodecaps.nOutputFormatMask & (1 << m_eOutputFormat))) {
+        if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12))
+            m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+        else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016))
+            m_eOutputFormat = cudaVideoSurfaceFormat_P016;
+        else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444;
+        else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444_16Bit;
+        else
+            NVDEC_THROW_ERROR("No supported output format found", CUDA_ERROR_NOT_SUPPORTED);
+    }
+    m_videoFormat = *pVideoFormat;
+
+    CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0};
+    videoDecodeCreateInfo.CodecType = pVideoFormat->codec;
+    videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format;
+    videoDecodeCreateInfo.OutputFormat = m_eOutputFormat;
+    videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    if (pVideoFormat->progressive_sequence)
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
+    else
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive;
+    videoDecodeCreateInfo.ulNumOutputSurfaces = 2;
+    // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware
+    videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
+    videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface;
+    videoDecodeCreateInfo.vidLock = m_ctxLock;
+    videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width;
+    videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height;
+    // AV1 has max width/height of sequence in sequence header
+    if (pVideoFormat->codec == cudaVideoCodec_AV1 && pVideoFormat->seqhdr_data_length > 0) {
+        // don't overwrite if it is already set from cmdline or reconfig.txt
+        if (!(m_nMaxWidth > pVideoFormat->coded_width || m_nMaxHeight > pVideoFormat->coded_height)) {
+            CUVIDEOFORMATEX *vidFormatEx = (CUVIDEOFORMATEX *)pVideoFormat;
+            m_nMaxWidth = vidFormatEx->av1.max_width;
+            m_nMaxHeight = vidFormatEx->av1.max_height;
+        }
+    }
+    if (m_nMaxWidth < (int)pVideoFormat->coded_width)
+        m_nMaxWidth = pVideoFormat->coded_width;
+    if (m_nMaxHeight < (int)pVideoFormat->coded_height)
+        m_nMaxHeight = pVideoFormat->coded_height;
+    videoDecodeCreateInfo.ulMaxWidth = m_nMaxWidth;
+    videoDecodeCreateInfo.ulMaxHeight = m_nMaxHeight;
+
+    if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) {
+        m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+        m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+        videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width;
+        videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height;
+    } else {
+        if (m_resizeDim.w && m_resizeDim.h) {
+            videoDecodeCreateInfo.display_area.left = pVideoFormat->display_area.left;
+            videoDecodeCreateInfo.display_area.top = pVideoFormat->display_area.top;
+            videoDecodeCreateInfo.display_area.right = pVideoFormat->display_area.right;
+            videoDecodeCreateInfo.display_area.bottom = pVideoFormat->display_area.bottom;
+            m_nWidth = m_resizeDim.w;
+            m_nLumaHeight = m_resizeDim.h;
+        }
+
+        if (m_cropRect.r && m_cropRect.b) {
+            videoDecodeCreateInfo.display_area.left = m_cropRect.l;
+            videoDecodeCreateInfo.display_area.top = m_cropRect.t;
+            videoDecodeCreateInfo.display_area.right = m_cropRect.r;
+            videoDecodeCreateInfo.display_area.bottom = m_cropRect.b;
+            m_nWidth = m_cropRect.r - m_cropRect.l;
+            m_nLumaHeight = m_cropRect.b - m_cropRect.t;
+        }
+        videoDecodeCreateInfo.ulTargetWidth = m_nWidth;
+        videoDecodeCreateInfo.ulTargetHeight = m_nLumaHeight;
+    }
+
+    m_nChromaHeight = (int)(ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)));
+    m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+    m_nSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight;
+    m_nSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth;
+    m_displayRect.b = videoDecodeCreateInfo.display_area.bottom;
+    m_displayRect.t = videoDecodeCreateInfo.display_area.top;
+    m_displayRect.l = videoDecodeCreateInfo.display_area.left;
+    m_displayRect.r = videoDecodeCreateInfo.display_area.right;
+
+    m_videoInfo << "Video Decoding Params:" << std::endl
+                << "\tNum Surfaces : " << videoDecodeCreateInfo.ulNumDecodeSurfaces << std::endl
+                << "\tCrop         : [" << videoDecodeCreateInfo.display_area.left << ", "
+                << videoDecodeCreateInfo.display_area.top << ", " << videoDecodeCreateInfo.display_area.right << ", "
+                << videoDecodeCreateInfo.display_area.bottom << "]" << std::endl
+                << "\tResize       : " << videoDecodeCreateInfo.ulTargetWidth << "x"
+                << videoDecodeCreateInfo.ulTargetHeight << std::endl
+                << "\tDeinterlace  : "
+                << std::vector<const char *>{"Weave", "Bob", "Adaptive"}[videoDecodeCreateInfo.DeinterlaceMode];
+    m_videoInfo << std::endl;
+
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidCreateDecoder(&m_hDecoder, &videoDecodeCreateInfo));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    STOP_TIMER("Session Initialization Time: ");
+    NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime);
+    return nDecodeSurface;
+}
+
+int NvDecoder::ReconfigureDecoder(CUVIDEOFORMAT *pVideoFormat) {
+    if (pVideoFormat->bit_depth_luma_minus8 != m_videoFormat.bit_depth_luma_minus8 ||
+        pVideoFormat->bit_depth_chroma_minus8 != m_videoFormat.bit_depth_chroma_minus8) {
+
+        NVDEC_THROW_ERROR("Reconfigure Not supported for bit depth change", CUDA_ERROR_NOT_SUPPORTED);
+    }
+
+    if (pVideoFormat->chroma_format != m_videoFormat.chroma_format) {
+
+        NVDEC_THROW_ERROR("Reconfigure Not supported for chroma format change", CUDA_ERROR_NOT_SUPPORTED);
+    }
+
+    bool bDecodeResChange = !(pVideoFormat->coded_width == m_videoFormat.coded_width &&
+                              pVideoFormat->coded_height == m_videoFormat.coded_height);
+    bool bDisplayRectChange = !(pVideoFormat->display_area.bottom == m_videoFormat.display_area.bottom &&
+                                pVideoFormat->display_area.top == m_videoFormat.display_area.top &&
+                                pVideoFormat->display_area.left == m_videoFormat.display_area.left &&
+                                pVideoFormat->display_area.right == m_videoFormat.display_area.right);
+
+    int nDecodeSurface = pVideoFormat->min_num_decode_surfaces;
+
+    if ((pVideoFormat->coded_width > m_nMaxWidth) || (pVideoFormat->coded_height > m_nMaxHeight)) {
+        // For VP9, let driver handle the change if new width/height > maxwidth/maxheight
+        if ((m_eCodec != cudaVideoCodec_VP9) || m_bReconfigExternal) {
+            NVDEC_THROW_ERROR("Reconfigure Not supported when width/height > maxwidth/maxheight",
+                              CUDA_ERROR_NOT_SUPPORTED);
+        }
+        return 1;
+    }
+
+    if (!bDecodeResChange && !m_bReconfigExtPPChange) {
+        // if the coded_width/coded_height hasn't changed but display resolution has changed, then need to update
+        // width/height for correct output without cropping. 
Example : 1920x1080 vs 1920x1088 + if (bDisplayRectChange) { + m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left; + m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top; + m_nChromaHeight = (int)ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)); + m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat); + } + + // no need for reconfigureDecoder(). Just return + return 1; + } + + CUVIDRECONFIGUREDECODERINFO reconfigParams = {0}; + + reconfigParams.ulWidth = m_videoFormat.coded_width = pVideoFormat->coded_width; + reconfigParams.ulHeight = m_videoFormat.coded_height = pVideoFormat->coded_height; + + // Dont change display rect and get scaled output from decoder. This will help display app to present apps smoothly + reconfigParams.display_area.bottom = m_displayRect.b; + reconfigParams.display_area.top = m_displayRect.t; + reconfigParams.display_area.left = m_displayRect.l; + reconfigParams.display_area.right = m_displayRect.r; + reconfigParams.ulTargetWidth = m_nSurfaceWidth; + reconfigParams.ulTargetHeight = m_nSurfaceHeight; + + // If external reconfigure is called along with resolution change even if post processing params is not changed, + // do full reconfigure params update + if ((m_bReconfigExternal && bDecodeResChange) || m_bReconfigExtPPChange) { + // update display rect and target resolution if requested explicitly + m_bReconfigExternal = false; + m_bReconfigExtPPChange = false; + m_videoFormat = *pVideoFormat; + if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) { + m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left; + m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top; + reconfigParams.ulTargetWidth = pVideoFormat->coded_width; + reconfigParams.ulTargetHeight = pVideoFormat->coded_height; + } else { + if (m_resizeDim.w && m_resizeDim.h) { + reconfigParams.display_area.left = pVideoFormat->display_area.left; + reconfigParams.display_area.top = pVideoFormat->display_area.top; + reconfigParams.display_area.right = pVideoFormat->display_area.right; + reconfigParams.display_area.bottom = pVideoFormat->display_area.bottom; + m_nWidth = m_resizeDim.w; + m_nLumaHeight = m_resizeDim.h; + } + + if (m_cropRect.r && m_cropRect.b) { + reconfigParams.display_area.left = m_cropRect.l; + reconfigParams.display_area.top = m_cropRect.t; + reconfigParams.display_area.right = m_cropRect.r; + reconfigParams.display_area.bottom = m_cropRect.b; + m_nWidth = m_cropRect.r - m_cropRect.l; + m_nLumaHeight = m_cropRect.b - m_cropRect.t; + } + reconfigParams.ulTargetWidth = m_nWidth; + reconfigParams.ulTargetHeight = m_nLumaHeight; + } + + m_nChromaHeight = (int)ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)); + m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat); + m_nSurfaceHeight = reconfigParams.ulTargetHeight; + m_nSurfaceWidth = reconfigParams.ulTargetWidth; + m_displayRect.b = reconfigParams.display_area.bottom; + m_displayRect.t = reconfigParams.display_area.top; + m_displayRect.l = reconfigParams.display_area.left; + m_displayRect.r = reconfigParams.display_area.right; + } + + reconfigParams.ulNumDecodeSurfaces = nDecodeSurface; + + START_TIMER + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL(cuvidReconfigureDecoder(m_hDecoder, &reconfigParams)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + STOP_TIMER("Session Reconfigure Time: "); + + return nDecodeSurface; +} + +int NvDecoder::setReconfigParams(const Rect 
*pCropRect, const Dim *pResizeDim) {
+    m_bReconfigExternal = true;
+    m_bReconfigExtPPChange = false;
+    if (pCropRect) {
+        if (!((pCropRect->t == m_cropRect.t) && (pCropRect->l == m_cropRect.l) && (pCropRect->b == m_cropRect.b) &&
+              (pCropRect->r == m_cropRect.r))) {
+            m_bReconfigExtPPChange = true;
+            m_cropRect = *pCropRect;
+        }
+    }
+    if (pResizeDim) {
+        if (!((pResizeDim->w == m_resizeDim.w) && (pResizeDim->h == m_resizeDim.h))) {
+            m_bReconfigExtPPChange = true;
+            m_resizeDim = *pResizeDim;
+        }
+    }
+
+    // Clear existing output buffers of different size
+    uint8_t *pFrame = NULL;
+    while (!m_vpFrame.empty()) {
+        pFrame = m_vpFrame.back();
+        m_vpFrame.pop_back();
+        if (m_bUseDeviceFrame) {
+            CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+            CUDA_DRVAPI_CALL(cuMemFree((CUdeviceptr)pFrame));
+            CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+        } else {
+            delete[] pFrame;
+        }
+    }
+
+    return 1;
+}
+
+/* Return values from HandlePictureDecode() are interpreted as:
+ *  0: fail, >=1: succeeded
+ */
+int NvDecoder::HandlePictureDecode(CUVIDPICPARAMS *pPicParams) {
+    if (!m_hDecoder) {
+        NVDEC_THROW_ERROR("Decoder not initialized.", CUDA_ERROR_NOT_INITIALIZED);
+        return false;
+    }
+    m_nPicNumInDecodeOrder[pPicParams->CurrPicIdx] = m_nDecodePicCnt++;
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidDecodePicture(m_hDecoder, pPicParams));
+    if (m_bForce_zero_latency && ((!pPicParams->field_pic_flag) || (pPicParams->second_field))) {
+        CUVIDPARSERDISPINFO dispInfo;
+        memset(&dispInfo, 0, sizeof(dispInfo));
+        dispInfo.picture_index = pPicParams->CurrPicIdx;
+        dispInfo.progressive_frame = !pPicParams->field_pic_flag;
+        dispInfo.top_field_first = pPicParams->bottom_field_flag ^ 1;
+        HandlePictureDisplay(&dispInfo);
+    }
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    return 1;
+}
+
+/* Return values from HandlePictureDisplay() are interpreted as:
+ *  0: fail, >=1: succeeded
+ */
+int NvDecoder::HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo) {
+    CUVIDPROCPARAMS videoProcessingParameters = {};
+    videoProcessingParameters.progressive_frame = pDispInfo->progressive_frame;
+    videoProcessingParameters.second_field = pDispInfo->repeat_first_field + 1;
+    videoProcessingParameters.top_field_first = pDispInfo->top_field_first;
+    videoProcessingParameters.unpaired_field = pDispInfo->repeat_first_field < 0;
+    videoProcessingParameters.output_stream = m_cuvidStream;
+
+    if (m_bExtractSEIMessage) {
+        if (m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData) {
+            // Write SEI Message
+            uint8_t *seiBuffer = (uint8_t *)(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData);
+            uint32_t seiNumMessages = m_SEIMessagesDisplayOrder[pDispInfo->picture_index].sei_message_count;
+            CUSEIMESSAGE *seiMessagesInfo = m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIMessage;
+            if (m_fpSEI) {
+                for (uint32_t i = 0; i < seiNumMessages; i++) {
+                    if (m_eCodec == cudaVideoCodec_H264 || m_eCodec == cudaVideoCodec_H264_SVC ||
+                        m_eCodec == cudaVideoCodec_H264_MVC || m_eCodec == cudaVideoCodec_HEVC) {
+                        switch (seiMessagesInfo[i].sei_message_type) {
+                        case SEI_TYPE_TIME_CODE: {
+                            HEVCSEITIMECODE *timecode = (HEVCSEITIMECODE *)seiBuffer;
+                            fwrite(timecode, sizeof(HEVCSEITIMECODE), 1, m_fpSEI);
+                        } break;
+                        case SEI_TYPE_USER_DATA_UNREGISTERED: {
+                            fwrite(seiBuffer, seiMessagesInfo[i].sei_message_size, 1, m_fpSEI);
+                        } break;
+                        }
+                    }
+                    if (m_eCodec == cudaVideoCodec_AV1) {
+                        fwrite(seiBuffer, seiMessagesInfo[i].sei_message_size, 1, m_fpSEI);
+                    }
+                    seiBuffer += seiMessagesInfo[i].sei_message_size;
+                }
+            }
+            free(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData);
+            free(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIMessage);
+        }
+    }
+
+    CUdeviceptr dpSrcFrame = 0;
+    unsigned int nSrcPitch = 0;
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(
+        cuvidMapVideoFrame(m_hDecoder, pDispInfo->picture_index, &dpSrcFrame, &nSrcPitch, &videoProcessingParameters));
+
+    CUVIDGETDECODESTATUS DecodeStatus;
+    memset(&DecodeStatus, 0, sizeof(DecodeStatus));
+    CUresult result = cuvidGetDecodeStatus(m_hDecoder, pDispInfo->picture_index, &DecodeStatus);
+    if (result == CUDA_SUCCESS && (DecodeStatus.decodeStatus == cuvidDecodeStatus_Error ||
+                                   DecodeStatus.decodeStatus == cuvidDecodeStatus_Error_Concealed)) {
+        printf("Decode Error occurred for picture %d\n", m_nPicNumInDecodeOrder[pDispInfo->picture_index]);
+    }
+
+    uint8_t *pDecodedFrame = nullptr;
+    {
+        std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+        if ((unsigned)++m_nDecodedFrame > m_vpFrame.size()) {
+            // Not enough frames in stock
+            m_nFrameAlloc++;
+            uint8_t *pFrame = NULL;
+            if (m_bUseDeviceFrame) {
+                if (m_bDeviceFramePitched) {
+                    CUDA_DRVAPI_CALL(cuMemAllocPitch((CUdeviceptr *)&pFrame, &m_nDeviceFramePitch, GetWidth() * m_nBPP,
+                                                     m_nLumaHeight + (m_nChromaHeight * m_nNumChromaPlanes), 16));
+                } else {
+                    CUDA_DRVAPI_CALL(cuMemAlloc((CUdeviceptr *)&pFrame, GetFrameSize()));
+                }
+            } else {
+                pFrame = new uint8_t[GetFrameSize()];
+            }
+            m_vpFrame.push_back(pFrame);
+        }
+        pDecodedFrame = m_vpFrame[m_nDecodedFrame - 1];
+    }
+
+    // Copy luma plane
+    CUDA_MEMCPY2D m = {0};
+    m.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    m.srcDevice = dpSrcFrame;
+    m.srcPitch = nSrcPitch;
+    m.dstMemoryType = m_bUseDeviceFrame ? CU_MEMORYTYPE_DEVICE : CU_MEMORYTYPE_HOST;
+    m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame);
+    m.dstPitch = m_nDeviceFramePitch ? m_nDeviceFramePitch : GetWidth() * m_nBPP;
+    m.WidthInBytes = GetWidth() * m_nBPP;
+    m.Height = m_nLumaHeight;
+    CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream));
+
+    // Copy chroma plane
+    // NVDEC output has luma height aligned by 2. 
Adjust chroma offset by aligning height + m.srcDevice = (CUdeviceptr)((uint8_t *)dpSrcFrame + m.srcPitch * ((m_nSurfaceHeight + 1) & ~1)); + m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame + m.dstPitch * m_nLumaHeight); + m.Height = m_nChromaHeight; + CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream)); + + if (m_nNumChromaPlanes == 2) { + m.srcDevice = (CUdeviceptr)((uint8_t *)dpSrcFrame + m.srcPitch * ((m_nSurfaceHeight + 1) & ~1) * 2); + m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame + m.dstPitch * m_nLumaHeight * 2); + m.Height = m_nChromaHeight; + CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream)); + } + CUDA_DRVAPI_CALL(cuStreamSynchronize(m_cuvidStream)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + + if ((int)m_vTimestamp.size() < m_nDecodedFrame) { + m_vTimestamp.resize(m_vpFrame.size()); + } + m_vTimestamp[m_nDecodedFrame - 1] = pDispInfo->timestamp; + + NVDEC_API_CALL(cuvidUnmapVideoFrame(m_hDecoder, dpSrcFrame)); + return 1; +} + +int NvDecoder::GetSEIMessage(CUVIDSEIMESSAGEINFO *pSEIMessageInfo) { + uint32_t seiNumMessages = pSEIMessageInfo->sei_message_count; + CUSEIMESSAGE *seiMessagesInfo = pSEIMessageInfo->pSEIMessage; + size_t totalSEIBufferSize = 0; + if ((pSEIMessageInfo->picIdx < 0) || (pSEIMessageInfo->picIdx >= MAX_FRM_CNT)) { + printf("Invalid picture index (%d)\n", pSEIMessageInfo->picIdx); + return 0; + } + for (uint32_t i = 0; i < seiNumMessages; i++) { + totalSEIBufferSize += seiMessagesInfo[i].sei_message_size; + } + if (!m_pCurrSEIMessage) { + printf("Out of Memory, Allocation failed for m_pCurrSEIMessage\n"); + return 0; + } + m_pCurrSEIMessage->pSEIData = malloc(totalSEIBufferSize); + if (!m_pCurrSEIMessage->pSEIData) { + printf("Out of Memory, Allocation failed for SEI Buffer\n"); + return 0; + } + memcpy(m_pCurrSEIMessage->pSEIData, pSEIMessageInfo->pSEIData, totalSEIBufferSize); + m_pCurrSEIMessage->pSEIMessage = (CUSEIMESSAGE *)malloc(sizeof(CUSEIMESSAGE) * seiNumMessages); + if (!m_pCurrSEIMessage->pSEIMessage) { + free(m_pCurrSEIMessage->pSEIData); + m_pCurrSEIMessage->pSEIData = NULL; + return 0; + } + memcpy(m_pCurrSEIMessage->pSEIMessage, pSEIMessageInfo->pSEIMessage, sizeof(CUSEIMESSAGE) * seiNumMessages); + m_pCurrSEIMessage->sei_message_count = pSEIMessageInfo->sei_message_count; + m_SEIMessagesDisplayOrder[pSEIMessageInfo->picIdx] = *m_pCurrSEIMessage; + return 1; +} + +NvDecoder::NvDecoder(CUcontext cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, bool bLowLatency, + bool bDeviceFramePitched, const Rect *pCropRect, const Dim *pResizeDim, + bool extract_user_SEI_Message, int maxWidth, int maxHeight, unsigned int clkRate, + bool force_zero_latency) + : m_cuContext(cuContext), m_bUseDeviceFrame(bUseDeviceFrame), m_eCodec(eCodec), + m_bDeviceFramePitched(bDeviceFramePitched), m_bExtractSEIMessage(extract_user_SEI_Message), m_nMaxWidth(maxWidth), + m_nMaxHeight(maxHeight), m_bForce_zero_latency(force_zero_latency) { + if (pCropRect) + m_cropRect = *pCropRect; + if (pResizeDim) + m_resizeDim = *pResizeDim; + + NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext)); + + ck(cuStreamCreate(&m_cuvidStream, CU_STREAM_DEFAULT)); + + decoderSessionID = 0; + + if (m_bExtractSEIMessage) { + m_fpSEI = fopen("sei_message.txt", "wb"); + m_pCurrSEIMessage = new CUVIDSEIMESSAGEINFO; + memset(&m_SEIMessagesDisplayOrder, 0, sizeof(m_SEIMessagesDisplayOrder)); + } + CUVIDPARSERPARAMS videoParserParameters = {}; + videoParserParameters.CodecType = eCodec; + videoParserParameters.ulMaxNumDecodeSurfaces = 1; + videoParserParameters.ulClockRate 
= clkRate;
+    videoParserParameters.ulMaxDisplayDelay = bLowLatency ? 0 : 1;
+    videoParserParameters.pUserData = this;
+    videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc;
+    videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc;
+    videoParserParameters.pfnDisplayPicture = m_bForce_zero_latency ? NULL : HandlePictureDisplayProc;
+    videoParserParameters.pfnGetOperatingPoint = HandleOperatingPointProc;
+    videoParserParameters.pfnGetSEIMsg = m_bExtractSEIMessage ? HandleSEIMessagesProc : NULL;
+    NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &videoParserParameters));
+}
+
+NvDecoder::~NvDecoder() {
+
+    START_TIMER
+
+    if (m_pCurrSEIMessage) {
+        delete m_pCurrSEIMessage;
+        m_pCurrSEIMessage = NULL;
+    }
+
+    if (m_fpSEI) {
+        fclose(m_fpSEI);
+        m_fpSEI = NULL;
+    }
+
+    if (m_hParser) {
+        cuvidDestroyVideoParser(m_hParser);
+    }
+    cuCtxPushCurrent(m_cuContext);
+    if (m_hDecoder) {
+        cuvidDestroyDecoder(m_hDecoder);
+    }
+
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+
+    for (uint8_t *pFrame : m_vpFrame) {
+        if (m_bUseDeviceFrame) {
+            cuMemFree((CUdeviceptr)pFrame);
+        } else {
+            delete[] pFrame;
+        }
+    }
+    cuCtxPopCurrent(NULL);
+
+    cuvidCtxLockDestroy(m_ctxLock);
+
+    STOP_TIMER("Session Deinitialization Time: ");
+
+    NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime);
+}
+
+int NvDecoder::Decode(const uint8_t *pData, int nSize, int nFlags, int64_t nTimestamp) {
+    m_nDecodedFrame = 0;
+    m_nDecodedFrameReturned = 0;
+    CUVIDSOURCEDATAPACKET packet = {0};
+    packet.payload = pData;
+    packet.payload_size = nSize;
+    packet.flags = nFlags | CUVID_PKT_TIMESTAMP;
+    packet.timestamp = nTimestamp;
+    if (!pData || nSize == 0) {
+        packet.flags |= CUVID_PKT_ENDOFSTREAM;
+    }
+    NVDEC_API_CALL(cuvidParseVideoData(m_hParser, &packet));
+
+    return m_nDecodedFrame;
+}
+
+uint8_t *NvDecoder::GetFrame(int64_t *pTimestamp) {
+    if (m_nDecodedFrame > 0) {
+        std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+        m_nDecodedFrame--;
+        if (pTimestamp)
+            *pTimestamp = m_vTimestamp[m_nDecodedFrameReturned];
+        return m_vpFrame[m_nDecodedFrameReturned++];
+    }
+
+    return NULL;
+}
+
+uint8_t *NvDecoder::GetLockedFrame(int64_t *pTimestamp) {
+    uint8_t *pFrame;
+    uint64_t timestamp;
+    if (m_nDecodedFrame > 0) {
+        std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+        m_nDecodedFrame--;
+        pFrame = m_vpFrame[0];
+        m_vpFrame.erase(m_vpFrame.begin(), m_vpFrame.begin() + 1);
+
+        timestamp = m_vTimestamp[0];
+        m_vTimestamp.erase(m_vTimestamp.begin(), m_vTimestamp.begin() + 1);
+
+        if (pTimestamp)
+            *pTimestamp = timestamp;
+
+        return pFrame;
+    }
+
+    return NULL;
+}
+
+void NvDecoder::UnlockFrame(uint8_t **pFrame) {
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+    m_vpFrame.insert(m_vpFrame.end(), &pFrame[0], &pFrame[1]);
+
+    // add a dummy entry for timestamp
+    uint64_t timestamp[2] = {0};
+    m_vTimestamp.insert(m_vTimestamp.end(), &timestamp[0], &timestamp[1]);
+}
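+
+/*
+ * Usage sketch (illustrative comment, not part of the original SDK sample):
+ * a minimal decode loop over demuxed packets, assuming `demuxer` is an
+ * FFmpegDemuxer and `dec` is an NvDecoder created on a valid CUDA context:
+ *
+ *     uint8_t *pVideo = NULL, *pFrame = NULL;
+ *     int nVideoBytes = 0, nFrameReturned = 0;
+ *     do {
+ *         demuxer.Demux(&pVideo, &nVideoBytes);
+ *         nFrameReturned = dec.Decode(pVideo, nVideoBytes);
+ *         for (int i = 0; i < nFrameReturned; i++) {
+ *             pFrame = dec.GetFrame();
+ *             // consume pFrame: GetFrameSize() bytes, in device or host
+ *             // memory depending on the bUseDeviceFrame constructor argument
+ *         }
+ *     } while (nVideoBytes); // a zero-size packet flushes the decoder (EOS)
+ *
+ * All frames returned by one Decode() call should be fetched via GetFrame()
+ * before the next Decode() call, as documented in NvDecoder.h.
+ */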
diff --git a/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h
new file mode 100644
index 00000000..886202bf
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h
@@ -0,0 +1,528 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "../../../Interface/nvcuvid.h"
+#include "../Utils/NvCodecUtils.h"
+#include <assert.h>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <stdint.h>
+#include <string>
+#include <string.h>
+#include <vector>
+
+#define MAX_FRM_CNT 32
+
+typedef enum { SEI_TYPE_TIME_CODE = 136, SEI_TYPE_USER_DATA_UNREGISTERED = 5 } SEI_H264_HEVC_PAYLOAD_TYPE;
+
+/**
+ * @brief Exception class for error reporting from the decode API.
+ */
+class NVDECException : public std::exception {
+  public:
+    NVDECException(const std::string &errorStr, const CUresult errorCode)
+        : m_errorString(errorStr), m_errorCode(errorCode) {}
+
+    virtual ~NVDECException() throw() {}
+    virtual const char *what() const throw() { return m_errorString.c_str(); }
+    CUresult getErrorCode() const { return m_errorCode; }
+    const std::string &getErrorString() const { return m_errorString; }
+    static NVDECException makeNVDECException(const std::string &errorStr, const CUresult errorCode,
+                                             const std::string &functionName, const std::string &fileName, int lineNo);
+
+  private:
+    std::string m_errorString;
+    CUresult m_errorCode;
+};
+
+inline NVDECException NVDECException::makeNVDECException(const std::string &errorStr, const CUresult errorCode,
+                                                         const std::string &functionName, const std::string &fileName,
+                                                         int lineNo) {
+    std::ostringstream errorLog;
+    errorLog << functionName << " : " << errorStr << " at " << fileName << ":" << lineNo << std::endl;
+    NVDECException exception(errorLog.str(), errorCode);
+    return exception;
+}
+
+#define NVDEC_THROW_ERROR(errorStr, errorCode)                                                                     \
+    do {                                                                                                           \
+        throw NVDECException::makeNVDECException(errorStr, errorCode, __FUNCTION__, __FILE__, __LINE__);           \
+    } while (0)
+
+#define NVDEC_API_CALL(cuvidAPI)                                                                                   \
+    do {                                                                                                           \
+        CUresult errorCode = cuvidAPI;                                                                             \
+        if (errorCode != CUDA_SUCCESS) {                                                                           \
+            std::ostringstream errorLog;                                                                           \
+            errorLog << #cuvidAPI << " returned error " << errorCode;                                              \
+            throw NVDECException::makeNVDECException(errorLog.str(), errorCode, __FUNCTION__, __FILE__, __LINE__); \
+        }                                                                                                          \
+    } while (0)
+
+struct Rect {
+    int l, t, r, b;
+};
+
+struct Dim {
+    int w, h;
+};
+
+#define START_TIMER auto start = std::chrono::high_resolution_clock::now();
+
+#define STOP_TIMER(print_message)                                                                                  \
+    int64_t elapsedTime =                                                                                          \
+        std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start)  \
+            .count();                                                                                              \
+    std::cout << print_message << elapsedTime << " ms " << std::endl;
+
+#define CUDA_DRVAPI_CALL(call)                                                                                     \
+    do {                                                                                                           \
+        CUresult err__ = call;                                                                                     \
+        if (err__ != CUDA_SUCCESS) {                                                                               \
+            const char *szErrName = NULL;                                                                          \
+            cuGetErrorName(err__, &szErrName);                                                                     \
+            std::ostringstream errorLog;                                                                           \
+            errorLog << "CUDA driver API error " << szErrName;                                                     \
+            throw NVDECException::makeNVDECException(errorLog.str(), err__, __FUNCTION__, 
__FILE__, __LINE__); \ + } \ + } while (0) + +static const char *GetVideoCodecString(cudaVideoCodec eCodec) { + static struct { + cudaVideoCodec eCodec; + const char *name; + } aCodecName[] = { + {cudaVideoCodec_MPEG1, "MPEG-1"}, + {cudaVideoCodec_MPEG2, "MPEG-2"}, + {cudaVideoCodec_MPEG4, "MPEG-4 (ASP)"}, + {cudaVideoCodec_VC1, "VC-1/WMV"}, + {cudaVideoCodec_H264, "AVC/H.264"}, + {cudaVideoCodec_JPEG, "M-JPEG"}, + {cudaVideoCodec_H264_SVC, "H.264/SVC"}, + {cudaVideoCodec_H264_MVC, "H.264/MVC"}, + {cudaVideoCodec_HEVC, "H.265/HEVC"}, + {cudaVideoCodec_VP8, "VP8"}, + {cudaVideoCodec_VP9, "VP9"}, + {cudaVideoCodec_AV1, "AV1"}, + {cudaVideoCodec_NumCodecs, "Invalid"}, + {cudaVideoCodec_YUV420, "YUV 4:2:0"}, + {cudaVideoCodec_YV12, "YV12 4:2:0"}, + {cudaVideoCodec_NV12, "NV12 4:2:0"}, + {cudaVideoCodec_YUYV, "YUYV 4:2:2"}, + {cudaVideoCodec_UYVY, "UYVY 4:2:2"}, + }; + + if (eCodec >= 0 && eCodec <= cudaVideoCodec_NumCodecs) { + return aCodecName[eCodec].name; + } + for (int i = cudaVideoCodec_NumCodecs + 1; i < sizeof(aCodecName) / sizeof(aCodecName[0]); i++) { + if (eCodec == aCodecName[i].eCodec) { + return aCodecName[eCodec].name; + } + } + return "Unknown"; +} + +static const char *GetVideoChromaFormatString(cudaVideoChromaFormat eChromaFormat) { + static struct { + cudaVideoChromaFormat eChromaFormat; + const char *name; + } aChromaFormatName[] = { + {cudaVideoChromaFormat_Monochrome, "YUV 400 (Monochrome)"}, + {cudaVideoChromaFormat_420, "YUV 420"}, + {cudaVideoChromaFormat_422, "YUV 422"}, + {cudaVideoChromaFormat_444, "YUV 444"}, + }; + + if (eChromaFormat >= 0 && eChromaFormat < sizeof(aChromaFormatName) / sizeof(aChromaFormatName[0])) { + return aChromaFormatName[eChromaFormat].name; + } + return "Unknown"; +} + +static float GetChromaHeightFactor(cudaVideoSurfaceFormat eSurfaceFormat) { + float factor = 0.5; + switch (eSurfaceFormat) { + case cudaVideoSurfaceFormat_NV12: + case cudaVideoSurfaceFormat_P016: + factor = 0.5; + break; + case cudaVideoSurfaceFormat_YUV444: + case cudaVideoSurfaceFormat_YUV444_16Bit: + factor = 1.0; + break; + } + + return factor; +} + +static int GetChromaPlaneCount(cudaVideoSurfaceFormat eSurfaceFormat) { + int numPlane = 1; + switch (eSurfaceFormat) { + case cudaVideoSurfaceFormat_NV12: + case cudaVideoSurfaceFormat_P016: + numPlane = 1; + break; + case cudaVideoSurfaceFormat_YUV444: + case cudaVideoSurfaceFormat_YUV444_16Bit: + numPlane = 2; + break; + } + + return numPlane; +} + +/** + * @brief Base class for decoder interface. + */ +class NvDecoder { + + public: + NvDecoder() {} + /** + * @brief This function is used to initialize the decoder session. + * Application must call this function to initialize the decoder, before + * starting to decode any frames. + */ + NvDecoder(CUcontext cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, bool bLowLatency = false, + bool bDeviceFramePitched = false, const Rect *pCropRect = NULL, const Dim *pResizeDim = NULL, + bool extract_user_SEI_Message = false, int maxWidth = 0, int maxHeight = 0, unsigned int clkRate = 1000, + bool force_zero_latency = false); + ~NvDecoder(); + + /** + * @brief This function is used to get the current CUDA context. + */ + CUcontext GetContext() { return m_cuContext; } + + /** + * @brief This function is used to get the output frame width. + * NV12/P016 output format width is 2 byte aligned because of U and V interleave + */ + int GetWidth() { + assert(m_nWidth); + return (m_eOutputFormat == cudaVideoSurfaceFormat_NV12 || m_eOutputFormat == cudaVideoSurfaceFormat_P016) + ? 
(m_nWidth + 1) & ~1 + : m_nWidth; + } + + /** + * @brief This function is used to get the actual decode width + */ + int GetDecodeWidth() { + assert(m_nWidth); + return m_nWidth; + } + + /** + * @brief This function is used to get the output frame height (Luma height). + */ + int GetHeight() { + assert(m_nLumaHeight); + return m_nLumaHeight; + } + + /** + * @brief This function is used to get the current chroma height. + */ + int GetChromaHeight() { + assert(m_nChromaHeight); + return m_nChromaHeight; + } + + /** + * @brief This function is used to get the number of chroma planes. + */ + int GetNumChromaPlanes() { + assert(m_nNumChromaPlanes); + return m_nNumChromaPlanes; + } + + /** + * @brief This function is used to get the current frame size based on pixel format. + */ + int GetFrameSize() { + assert(m_nWidth); + return GetWidth() * (m_nLumaHeight + (m_nChromaHeight * m_nNumChromaPlanes)) * m_nBPP; + } + + /** + * @brief This function is used to get the current frame Luma plane size. + */ + int GetLumaPlaneSize() { + assert(m_nWidth); + return GetWidth() * m_nLumaHeight * m_nBPP; + } + + /** + * @brief This function is used to get the current frame chroma plane size. + */ + int GetChromaPlaneSize() { + assert(m_nWidth); + return GetWidth() * (m_nChromaHeight * m_nNumChromaPlanes) * m_nBPP; + } + + /** + * @brief This function is used to get the pitch of the device buffer holding the decoded frame. + */ + int GetDeviceFramePitch() { + assert(m_nWidth); + return m_nDeviceFramePitch ? (int)m_nDeviceFramePitch : GetWidth() * m_nBPP; + } + + /** + * @brief This function is used to get the bit depth associated with the pixel format. + */ + int GetBitDepth() { + assert(m_nWidth); + return m_nBitDepthMinus8 + 8; + } + + /** + * @brief This function is used to get the bytes used per pixel. + */ + int GetBPP() { + assert(m_nWidth); + return m_nBPP; + } + + /** + * @brief This function is used to get the YUV chroma format + */ + cudaVideoSurfaceFormat GetOutputFormat() { return m_eOutputFormat; } + + /** + * @brief This function is used to get information about the video stream (codec, display parameters etc) + */ + CUVIDEOFORMAT GetVideoFormatInfo() { + assert(m_nWidth); + return m_videoFormat; + } + + /** + * @brief This function is used to get codec string from codec id + */ + const char *GetCodecString(cudaVideoCodec eCodec); + + /** + * @brief This function is used to print information about the video stream + */ + std::string GetVideoInfo() const { return m_videoInfo.str(); } + + /** + * @brief This function decodes a frame and returns the number of frames that are available for + * display. All frames that are available for display should be read before making a subsequent decode call. + * @param pData - pointer to the data buffer that is to be decoded + * @param nSize - size of the data buffer in bytes + * @param nFlags - CUvideopacketflags for setting decode options + * @param nTimestamp - presentation timestamp + */ + int Decode(const uint8_t *pData, int nSize, int nFlags = 0, int64_t nTimestamp = 0); + + /** + * @brief This function returns a decoded frame and timestamp. This function should be called in a loop for + * fetching all the frames that are available for display. + */ + uint8_t *GetFrame(int64_t *pTimestamp = nullptr); + + /** + * @brief This function decodes a frame and returns the locked frame buffers + * This makes the buffers available for use by the application without the buffers + * getting overwritten, even if subsequent decode calls are made. 
The frame buffers
+     * remain locked, until UnlockFrame() is called
+     */
+    uint8_t *GetLockedFrame(int64_t *pTimestamp = nullptr);
+
+    /**
+     * @brief This function unlocks the frame buffer and makes the frame buffers available for write again
+     * @param ppFrame - pointer to array of frames that are to be unlocked
+     * @param nFrame - number of frames to be unlocked
+     */
+    void UnlockFrame(uint8_t **pFrame);
+
+    /**
+     * @brief This function allows app to set decoder reconfig params
+     * @param pCropRect - cropping rectangle coordinates
+     * @param pResizeDim - width and height of resized output
+     */
+    int setReconfigParams(const Rect *pCropRect, const Dim *pResizeDim);
+
+    /**
+     * @brief This function allows app to set operating point for AV1 SVC clips
+     * @param opPoint - operating point of an AV1 scalable bitstream
+     * @param bDispAllLayers - Output all decoded frames of an AV1 scalable bitstream
+     */
+    void SetOperatingPoint(const uint32_t opPoint, const bool bDispAllLayers) {
+        m_nOperatingPoint = opPoint;
+        m_bDispAllLayers = bDispAllLayers;
+    }
+
+    // start a timer
+    void startTimer() { m_stDecode_time.Start(); }
+
+    // stop the timer
+    double stopTimer() { return m_stDecode_time.Stop(); }
+
+    void setDecoderSessionID(int sessionID) { decoderSessionID = sessionID; }
+    int getDecoderSessionID() { return decoderSessionID; }
+
+    // Session overhead refers to decoder initialization and deinitialization time
+    static void addDecoderSessionOverHead(int sessionID, int64_t duration) { sessionOverHead[sessionID] += duration; }
+    static int64_t getDecoderSessionOverHead(int sessionID) { return sessionOverHead[sessionID]; }
+
+  protected:
+    int decoderSessionID; // Decoder session identifier. Used to gather session level stats.
+    static std::map<int, int64_t> sessionOverHead; // Records session overhead of initialization+deinitialization time.
+                                                   // Format is (thread id, duration)
+
+    /**
+     * @brief Callback function to be registered for getting a callback when decoding of sequence starts
+     */
+    static int CUDAAPI HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat) {
+        return ((NvDecoder *)pUserData)->HandleVideoSequence(pVideoFormat);
+    }
+
+    /**
+     * @brief Callback function to be registered for getting a callback when a frame is ready to be decoded
+     */
+    static int CUDAAPI HandlePictureDecodeProc(void *pUserData, CUVIDPICPARAMS *pPicParams) {
+        return ((NvDecoder *)pUserData)->HandlePictureDecode(pPicParams);
+    }
+
+    /**
+     * @brief Callback function to be registered for getting a callback when a decoded frame is available for display
+     */
+    static int CUDAAPI HandlePictureDisplayProc(void *pUserData, CUVIDPARSERDISPINFO *pDispInfo) {
+        return ((NvDecoder *)pUserData)->HandlePictureDisplay(pDispInfo);
+    }
+
+    /**
+     * @brief Callback function to be registered for getting a callback to get operating point when AV1 SVC sequence
+     * header start.
+     */
+    static int CUDAAPI HandleOperatingPointProc(void *pUserData, CUVIDOPERATINGPOINTINFO *pOPInfo) {
+        return ((NvDecoder *)pUserData)->GetOperatingPoint(pOPInfo);
+    }
+
+    /**
+     * @brief Callback function to be registered for getting a callback when all the unregistered user SEI Messages
+     * are parsed for a frame.
+     */
+    static int CUDAAPI HandleSEIMessagesProc(void *pUserData, CUVIDSEIMESSAGEINFO *pSEIMessageInfo) {
+        return ((NvDecoder *)pUserData)->GetSEIMessage(pSEIMessageInfo);
+    }
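+
+    /*
+     * Illustrative note (not part of the original SDK sample): for a packet
+     * containing complete frames, cuvidParseVideoData() drives the thunks
+     * above roughly as
+     *     HandleVideoSequenceProc  - once at start and on every format change
+     *     HandlePictureDecodeProc  - once per picture, in decode order
+     *     HandlePictureDisplayProc - once per picture, in display order
+     * so Decode() returns however many display callbacks fired for the packet.
+     */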
+
+    /**
+     * @brief This function gets called when a sequence is ready to be decoded. The function also gets called
+     * when there is format change
+     */
+    int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat);
+
+    /**
+     * @brief This function gets called when a picture is ready to be decoded. cuvidDecodePicture is called from this
+     * function to decode the picture
+     */
+    int HandlePictureDecode(CUVIDPICPARAMS *pPicParams);
+
+    /**
+     * @brief This function gets called after a picture is decoded and available for display. Frames are fetched and
+     * stored in internal buffer
+     */
+    int HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo);
+
+    /**
+     * @brief This function gets called when an AV1 sequence encounters more than one operating point
+     */
+    int GetOperatingPoint(CUVIDOPERATINGPOINTINFO *pOPInfo);
+
+    /**
+     * @brief This function gets called when all unregistered user SEI messages are parsed for a frame
+     */
+    int GetSEIMessage(CUVIDSEIMESSAGEINFO *pSEIMessageInfo);
+
+    /**
+     * @brief This function reconfigures the decoder if there is a change in sequence params.
+     */
+    int ReconfigureDecoder(CUVIDEOFORMAT *pVideoFormat);
+
+  public:
+    CUcontext m_cuContext = NULL;
+    CUvideoctxlock m_ctxLock;
+    CUvideoparser m_hParser = NULL;
+    CUvideodecoder m_hDecoder = NULL;
+    bool m_bUseDeviceFrame;
+    // dimension of the output
+    unsigned int m_nWidth = 0, m_nLumaHeight = 0, m_nChromaHeight = 0;
+    unsigned int m_nNumChromaPlanes = 0;
+    // height of the mapped surface
+    int m_nSurfaceHeight = 0;
+    int m_nSurfaceWidth = 0;
+    cudaVideoCodec m_eCodec = cudaVideoCodec_NumCodecs;
+    cudaVideoChromaFormat m_eChromaFormat = cudaVideoChromaFormat_420;
+    cudaVideoSurfaceFormat m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+    int m_nBitDepthMinus8 = 0;
+    int m_nBPP = 1;
+    CUVIDEOFORMAT m_videoFormat = {};
+    Rect m_displayRect = {};
+    // stock of frames
+    std::vector<uint8_t *> m_vpFrame;
+    // timestamps of decoded frames
+    std::vector<int64_t> m_vTimestamp;
+    int m_nDecodedFrame = 0, m_nDecodedFrameReturned = 0;
+    int m_nDecodePicCnt = 0, m_nPicNumInDecodeOrder[MAX_FRM_CNT];
+    CUVIDSEIMESSAGEINFO *m_pCurrSEIMessage = NULL;
+    CUVIDSEIMESSAGEINFO m_SEIMessagesDisplayOrder[MAX_FRM_CNT];
+    FILE *m_fpSEI = NULL;
+    bool m_bEndDecodeDone = false;
+    std::mutex m_mtxVPFrame;
+    int m_nFrameAlloc = 0;
+    CUstream m_cuvidStream = 0;
+    bool m_bDeviceFramePitched = false;
+    size_t m_nDeviceFramePitch = 0;
+    Rect m_cropRect = {};
+    Dim m_resizeDim = {};
+
+    std::ostringstream m_videoInfo;
+    unsigned int m_nMaxWidth = 0, m_nMaxHeight = 0;
+    bool m_bReconfigExternal = false;
+    bool m_bReconfigExtPPChange = false;
+    StopWatch m_stDecode_time;
+
+    unsigned int m_nOperatingPoint = 0;
+    bool m_bDispAllLayers = false;
+    // In H.264, there is an inherent display latency for video contents
+    // which do not have num_reorder_frames=0 in the VUI. This applies to
+    // All-Intra and IPPP sequences as well. If the user wants zero display
+    // latency for All-Intra and IPPP sequences, the below flag will enable
+    // the display callback immediately after the decode callback.
+    bool m_bForce_zero_latency = false;
+    bool m_bExtractSEIMessage = false;
+};
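+
+/*
+ * Worked example (illustrative comment, not part of the original SDK sample):
+ * for a 1920x1080 8-bit 4:2:0 stream decoded to NV12, GetWidth() = 1920,
+ * GetHeight() = 1080, GetChromaHeight() = ceil(1080 * 0.5) = 540,
+ * GetNumChromaPlanes() = 1 and GetBPP() = 1, so
+ * GetFrameSize() = 1920 * (1080 + 540) * 1 = 3110400 bytes per decoded frame.
+ */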
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h
new file mode 100644
index 00000000..bd1881db
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h
@@ -0,0 +1,379 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+
+extern "C" {
+#include <libavformat/avformat.h>
+#include <libavformat/avio.h>
+#include <libavcodec/avcodec.h>
+/* Explicitly include bsf.h when building against FFmpeg 4.3 (libavcodec 58.45.100) or later for backward compatibility
+ */
+#if LIBAVCODEC_VERSION_INT >= 3824484
+#include <libavcodec/bsf.h>
+#endif
+}
+#include "NvCodecUtils.h"
+#include "nvcuvid.h"
+
+//---------------------------------------------------------------------------
+//! \file FFmpegDemuxer.h
+//! \brief Provides functionality for stream demuxing
+//!
+//! This header file is used by Decode/Transcode apps to demux input video clips before decoding frames from them.
+//---------------------------------------------------------------------------
+
+/**
+ * @brief libavformat wrapper class. Retrieves the elementary encoded stream from the container format.
+ */
+class FFmpegDemuxer {
+  private:
+    AVFormatContext *fmtc = NULL;
+    AVIOContext *avioc = NULL;
+    AVPacket *pkt = NULL; /*!< AVPacket stores compressed data typically exported by demuxers and then passed as input
+                             to decoders */
+    AVPacket *pktFiltered = NULL;
+    AVBSFContext *bsfc = NULL;
+
+    int iVideoStream;
+    bool bMp4H264, bMp4HEVC, bMp4MPEG4;
+    AVCodecID eVideoCodec;
+    AVPixelFormat eChromaFormat;
+    int nWidth, nHeight, nBitDepth, nBPP, nChromaHeight;
+    double timeBase = 0.0;
+    int64_t userTimeScale = 0;
+
+    uint8_t *pDataWithHeader = NULL;
+
+    unsigned int frameCount = 0;
+
+  public:
+    class DataProvider {
+      public:
+        virtual ~DataProvider() {}
+        virtual int GetData(uint8_t *pBuf, int nBuf) = 0;
+    };
+
+  private:
+    /**
+     *   @brief  Private constructor to initialize libavformat resources.
+ * @param fmtc - Pointer to AVFormatContext allocated inside avformat_open_input() + */ + FFmpegDemuxer(AVFormatContext *fmtc, int64_t timeScale = 1000 /*Hz*/) : fmtc(fmtc) { + if (!fmtc) { + LOG(ERROR) << "No AVFormatContext provided."; + return; + } + + // Allocate the AVPackets and initialize to default values + pkt = av_packet_alloc(); + pktFiltered = av_packet_alloc(); + if (!pkt || !pktFiltered) { + LOG(ERROR) << "AVPacket allocation failed"; + return; + } + + LOG(INFO) << "Media format: " << fmtc->iformat->long_name << " (" << fmtc->iformat->name << ")"; + + ck(avformat_find_stream_info(fmtc, NULL)); + iVideoStream = av_find_best_stream(fmtc, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0); + if (iVideoStream < 0) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " " + << "Could not find stream in input file"; + av_packet_free(&pkt); + av_packet_free(&pktFiltered); + return; + } + + // fmtc->streams[iVideoStream]->need_parsing = AVSTREAM_PARSE_NONE; + eVideoCodec = fmtc->streams[iVideoStream]->codecpar->codec_id; + nWidth = fmtc->streams[iVideoStream]->codecpar->width; + nHeight = fmtc->streams[iVideoStream]->codecpar->height; + eChromaFormat = (AVPixelFormat)fmtc->streams[iVideoStream]->codecpar->format; + AVRational rTimeBase = fmtc->streams[iVideoStream]->time_base; + timeBase = av_q2d(rTimeBase); + userTimeScale = timeScale; + + // Set bit depth, chroma height, bits per pixel based on eChromaFormat of input + switch (eChromaFormat) { + case AV_PIX_FMT_YUV420P10LE: + case AV_PIX_FMT_GRAY10LE: // monochrome is treated as 420 with chroma filled with 0x0 + nBitDepth = 10; + nChromaHeight = (nHeight + 1) >> 1; + nBPP = 2; + break; + case AV_PIX_FMT_YUV420P12LE: + nBitDepth = 12; + nChromaHeight = (nHeight + 1) >> 1; + nBPP = 2; + break; + case AV_PIX_FMT_YUV444P10LE: + nBitDepth = 10; + nChromaHeight = nHeight << 1; + nBPP = 2; + break; + case AV_PIX_FMT_YUV444P12LE: + nBitDepth = 12; + nChromaHeight = nHeight << 1; + nBPP = 2; + break; + case AV_PIX_FMT_YUV444P: + nBitDepth = 8; + nChromaHeight = nHeight << 1; + nBPP = 1; + break; + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: + case AV_PIX_FMT_YUVJ422P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420 + case AV_PIX_FMT_YUVJ444P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420 + case AV_PIX_FMT_GRAY8: // monochrome is treated as 420 with chroma filled with 0x0 + nBitDepth = 8; + nChromaHeight = (nHeight + 1) >> 1; + nBPP = 1; + break; + default: + LOG(WARNING) << "ChromaFormat not recognized. 
Assuming 420"; + eChromaFormat = AV_PIX_FMT_YUV420P; + nBitDepth = 8; + nChromaHeight = (nHeight + 1) >> 1; + nBPP = 1; + } + + bMp4H264 = eVideoCodec == AV_CODEC_ID_H264 && (!strcmp(fmtc->iformat->long_name, "QuickTime / MOV") || + !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)") || + !strcmp(fmtc->iformat->long_name, "Matroska / WebM")); + bMp4HEVC = eVideoCodec == AV_CODEC_ID_HEVC && (!strcmp(fmtc->iformat->long_name, "QuickTime / MOV") || + !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)") || + !strcmp(fmtc->iformat->long_name, "Matroska / WebM")); + + bMp4MPEG4 = eVideoCodec == AV_CODEC_ID_MPEG4 && (!strcmp(fmtc->iformat->long_name, "QuickTime / MOV") || + !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)") || + !strcmp(fmtc->iformat->long_name, "Matroska / WebM")); + + // Initialize bitstream filter and its required resources + if (bMp4H264) { + const AVBitStreamFilter *bsf = av_bsf_get_by_name("h264_mp4toannexb"); + if (!bsf) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " " + << "av_bsf_get_by_name() failed"; + av_packet_free(&pkt); + av_packet_free(&pktFiltered); + return; + } + ck(av_bsf_alloc(bsf, &bsfc)); + avcodec_parameters_copy(bsfc->par_in, fmtc->streams[iVideoStream]->codecpar); + ck(av_bsf_init(bsfc)); + } + if (bMp4HEVC) { + const AVBitStreamFilter *bsf = av_bsf_get_by_name("hevc_mp4toannexb"); + if (!bsf) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " " + << "av_bsf_get_by_name() failed"; + av_packet_free(&pkt); + av_packet_free(&pktFiltered); + return; + } + ck(av_bsf_alloc(bsf, &bsfc)); + avcodec_parameters_copy(bsfc->par_in, fmtc->streams[iVideoStream]->codecpar); + ck(av_bsf_init(bsfc)); + } + } + + AVFormatContext *CreateFormatContext(DataProvider *pDataProvider) { + + AVFormatContext *ctx = NULL; + if (!(ctx = avformat_alloc_context())) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__; + return NULL; + } + + uint8_t *avioc_buffer = NULL; + int avioc_buffer_size = 8 * 1024 * 1024; + avioc_buffer = (uint8_t *)av_malloc(avioc_buffer_size); + if (!avioc_buffer) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__; + return NULL; + } + avioc = avio_alloc_context(avioc_buffer, avioc_buffer_size, 0, pDataProvider, &ReadPacket, NULL, NULL); + if (!avioc) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__; + return NULL; + } + ctx->pb = avioc; + + ck(avformat_open_input(&ctx, NULL, NULL, NULL)); + return ctx; + } + + /** + * @brief Allocate and return AVFormatContext*. + * @param szFilePath - Filepath pointing to input stream. 
+ * @return Pointer to AVFormatContext + */ + AVFormatContext *CreateFormatContext(const char *szFilePath) { + avformat_network_init(); + + AVFormatContext *ctx = NULL; + ck(avformat_open_input(&ctx, szFilePath, NULL, NULL)); + return ctx; + } + + public: + FFmpegDemuxer(const char *szFilePath, int64_t timescale = 1000 /*Hz*/) + : FFmpegDemuxer(CreateFormatContext(szFilePath), timescale) {} + FFmpegDemuxer(DataProvider *pDataProvider) : FFmpegDemuxer(CreateFormatContext(pDataProvider)) { avioc = fmtc->pb; } + ~FFmpegDemuxer() { + + if (!fmtc) { + return; + } + + if (pkt) { + av_packet_free(&pkt); + } + if (pktFiltered) { + av_packet_free(&pktFiltered); + } + + if (bsfc) { + av_bsf_free(&bsfc); + } + + avformat_close_input(&fmtc); + + if (avioc) { + av_freep(&avioc->buffer); + av_freep(&avioc); + } + + if (pDataWithHeader) { + av_free(pDataWithHeader); + } + } + AVCodecID GetVideoCodec() { return eVideoCodec; } + AVPixelFormat GetChromaFormat() { return eChromaFormat; } + int GetWidth() { return nWidth; } + int GetHeight() { return nHeight; } + int GetBitDepth() { return nBitDepth; } + int GetFrameSize() { return nWidth * (nHeight + nChromaHeight) * nBPP; } + bool Demux(uint8_t **ppVideo, int *pnVideoBytes, int64_t *pts = NULL) { + if (!fmtc) { + return false; + } + + *pnVideoBytes = 0; + + if (pkt->data) { + av_packet_unref(pkt); + } + + int e = 0; + while ((e = av_read_frame(fmtc, pkt)) >= 0 && pkt->stream_index != iVideoStream) { + av_packet_unref(pkt); + } + if (e < 0) { + return false; + } + + if (bMp4H264 || bMp4HEVC) { + if (pktFiltered->data) { + av_packet_unref(pktFiltered); + } + ck(av_bsf_send_packet(bsfc, pkt)); + ck(av_bsf_receive_packet(bsfc, pktFiltered)); + *ppVideo = pktFiltered->data; + *pnVideoBytes = pktFiltered->size; + if (pts) + *pts = (int64_t)(pktFiltered->pts * userTimeScale * timeBase); + } else { + + if (bMp4MPEG4 && (frameCount == 0)) { + + int extraDataSize = fmtc->streams[iVideoStream]->codecpar->extradata_size; + + if (extraDataSize > 0) { + + // extradata contains start codes 00 00 01. 
Subtract its size + pDataWithHeader = (uint8_t *)av_malloc(extraDataSize + pkt->size - 3 * sizeof(uint8_t)); + + if (!pDataWithHeader) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__; + return false; + } + + memcpy(pDataWithHeader, fmtc->streams[iVideoStream]->codecpar->extradata, extraDataSize); + memcpy(pDataWithHeader + extraDataSize, pkt->data + 3, pkt->size - 3 * sizeof(uint8_t)); + + *ppVideo = pDataWithHeader; + *pnVideoBytes = extraDataSize + pkt->size - 3 * sizeof(uint8_t); + } + + } else { + *ppVideo = pkt->data; + *pnVideoBytes = pkt->size; + } + + if (pts) + *pts = (int64_t)(pkt->pts * userTimeScale * timeBase); + } + + frameCount++; + + return true; + } + + static int ReadPacket(void *opaque, uint8_t *pBuf, int nBuf) { + return ((DataProvider *)opaque)->GetData(pBuf, nBuf); + } +}; + +inline cudaVideoCodec FFmpeg2NvCodecId(AVCodecID id) { + switch (id) { + case AV_CODEC_ID_MPEG1VIDEO: + return cudaVideoCodec_MPEG1; + case AV_CODEC_ID_MPEG2VIDEO: + return cudaVideoCodec_MPEG2; + case AV_CODEC_ID_MPEG4: + return cudaVideoCodec_MPEG4; + case AV_CODEC_ID_WMV3: + case AV_CODEC_ID_VC1: + return cudaVideoCodec_VC1; + case AV_CODEC_ID_H264: + return cudaVideoCodec_H264; + case AV_CODEC_ID_HEVC: + return cudaVideoCodec_HEVC; + case AV_CODEC_ID_VP8: + return cudaVideoCodec_VP8; + case AV_CODEC_ID_VP9: + return cudaVideoCodec_VP9; + case AV_CODEC_ID_MJPEG: + return cudaVideoCodec_JPEG; + case AV_CODEC_ID_AV1: + return cudaVideoCodec_AV1; + default: + return cudaVideoCodec_NumCodecs; + } +} diff --git a/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h new file mode 100644 index 00000000..08e43e60 --- /dev/null +++ b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h @@ -0,0 +1,148 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */
+#pragma once
+
+#include <iostream>
+#include <sstream>
+extern "C" {
+#include <libavformat/avformat.h>
+#include <libavutil/opt.h>
+#include <libswresample/swresample.h>
+};
+#include "Logger.h"
+
+using namespace std;
+
+extern simplelogger::Logger *logger;
+
+static string AvErrorToString(int av_error_code) {
+    const auto buf_size = 1024U;
+    char *err_string = (char *)calloc(buf_size, sizeof(*err_string));
+    if (!err_string) {
+        return string();
+    }
+
+    if (0 != av_strerror(av_error_code, err_string, buf_size - 1)) {
+        free(err_string);
+        stringstream ss;
+        ss << "Unknown error with code " << av_error_code;
+        return ss.str();
+    }
+
+    string str(err_string);
+    free(err_string);
+    return str;
+}
+
+class FFmpegStreamer {
+  private:
+    AVFormatContext *oc = NULL;
+    AVStream *vs = NULL;
+    int nFps = 0;
+
+  public:
+    FFmpegStreamer(AVCodecID eCodecId, int nWidth, int nHeight, int nFps, const char *szInFilePath) : nFps(nFps) {
+        avformat_network_init();
+
+        int ret = 0;
+
+        if ((eCodecId == AV_CODEC_ID_H264) || (eCodecId == AV_CODEC_ID_HEVC))
+            ret = avformat_alloc_output_context2(&oc, NULL, "mpegts", NULL);
+        else if (eCodecId == AV_CODEC_ID_AV1)
+            ret = avformat_alloc_output_context2(&oc, NULL, "ivf", NULL);
+
+        if (ret < 0) {
+            LOG(ERROR) << "FFmpeg: failed to allocate an AVFormatContext. Error message: " << AvErrorToString(ret);
+            return;
+        }
+
+        oc->url = av_strdup(szInFilePath);
+        LOG(INFO) << "Streaming destination: " << oc->url;
+
+        // Add video stream to oc
+        vs = avformat_new_stream(oc, NULL);
+        if (!vs) {
+            LOG(ERROR) << "FFMPEG: Could not alloc video stream";
+            return;
+        }
+        vs->id = 0;
+
+        // Set video parameters
+        AVCodecParameters *vpar = vs->codecpar;
+        vpar->codec_id = eCodecId;
+        vpar->codec_type = AVMEDIA_TYPE_VIDEO;
+        vpar->width = nWidth;
+        vpar->height = nHeight;
+
+        // Everything is ready. Now open the output stream.
+        if (avio_open(&oc->pb, oc->url, AVIO_FLAG_WRITE) < 0) {
+            LOG(ERROR) << "FFMPEG: Could not open " << oc->url;
+            return;
+        }
+
+        // Write the container header
+        if (avformat_write_header(oc, NULL)) {
+            LOG(ERROR) << "FFMPEG: avformat_write_header error!";
+            return;
+        }
+    }
+    ~FFmpegStreamer() {
+        if (oc) {
+            av_write_trailer(oc);
+            avio_close(oc->pb);
+            avformat_free_context(oc);
+        }
+    }
+
+    bool Stream(uint8_t *pData, int nBytes, int nPts) {
+        AVPacket *pkt = av_packet_alloc();
+        if (!pkt) {
+            LOG(ERROR) << "AVPacket allocation failed!";
+            return false;
+        }
+        pkt->pts = av_rescale_q(nPts++, AVRational{1, nFps}, vs->time_base);
+        // No B-frames
+        pkt->dts = pkt->pts;
+        pkt->stream_index = vs->index;
+        pkt->data = pData;
+        pkt->size = nBytes;
+
+        if (!memcmp(pData, "\x00\x00\x00\x01\x67", 5)) {
+            pkt->flags |= AV_PKT_FLAG_KEY;
+        }
+
+        // Write the compressed frame into the output
+        int ret = av_write_frame(oc, pkt);
+        av_write_frame(oc, NULL);
+        if (ret < 0) {
+            LOG(ERROR) << "FFMPEG: Error while writing video frame";
+        }
+
+        av_packet_free(&pkt);
+        return true;
+    }
+};
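FFmpegStreamer is the muxing counterpart to the demuxer above: one Stream() call per encoded access unit, with pts expressed in frame numbers at the constructor's frame rate. A sketch of how it might be driven (buffer names and counts are hypothetical):

    // Sketch only: wrap pre-encoded H.264 Annex-B access units into an MPEG-TS file.
    FFmpegStreamer streamer(AV_CODEC_ID_H264, 1920, 1080, 30, "out.ts");
    for (int i = 0; i < nPackets; i++) {
        streamer.Stream(pPacketData[i], nPacketSize[i], i); // pts ticks at 1/30 s
    }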
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/Logger.h b/third_party/Video_Codec_SDK/Samples/Utils/Logger.h
new file mode 100644
index 00000000..5d2f069c
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/Logger.h
@@ -0,0 +1,235 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <fstream>
+#include <iostream>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <time.h>
+
+#ifdef _WIN32
+#include <winsock.h>
+#include <windows.h>
+
+#pragma comment(lib, "ws2_32.lib")
+#undef ERROR
+#else
+#include <unistd.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#define SOCKET int
+#define INVALID_SOCKET -1
+#endif
+
+enum LogLevel { TRACE, INFO, WARNING, ERROR, FATAL };
+
+namespace simplelogger {
+class Logger {
+  public:
+    Logger(LogLevel level, bool bPrintTimeStamp) : level(level), bPrintTimeStamp(bPrintTimeStamp) {}
+    virtual ~Logger() {}
+    virtual std::ostream &GetStream() = 0;
+    virtual void FlushStream() {}
+    bool ShouldLogFor(LogLevel l) { return l >= level; }
+    char *GetLead(LogLevel l, const char *szFile, int nLine, const char *szFunc) {
+        if (l < TRACE || l > FATAL) {
+            sprintf(szLead, "[?????] 
"); + return szLead; + } + const char *szLevels[] = {"TRACE", "INFO", "WARN", "ERROR", "FATAL"}; + if (bPrintTimeStamp) { + time_t t = time(NULL); + struct tm *ptm = localtime(&t); + sprintf(szLead, "[%-5s][%02d:%02d:%02d] ", szLevels[l], ptm->tm_hour, ptm->tm_min, ptm->tm_sec); + } else { + sprintf(szLead, "[%-5s] ", szLevels[l]); + } + return szLead; + } + void EnterCriticalSection() { mtx.lock(); } + void LeaveCriticalSection() { mtx.unlock(); } + + private: + LogLevel level; + char szLead[80]; + bool bPrintTimeStamp; + std::mutex mtx; +}; + +class LoggerFactory { + public: + static Logger *CreateFileLogger(std::string strFilePath, LogLevel level = INFO, bool bPrintTimeStamp = true) { + return new FileLogger(strFilePath, level, bPrintTimeStamp); + } + static Logger *CreateConsoleLogger(LogLevel level = INFO, bool bPrintTimeStamp = true) { + return new ConsoleLogger(level, bPrintTimeStamp); + } + static Logger *CreateUdpLogger(char *szHost, unsigned uPort, LogLevel level = INFO, bool bPrintTimeStamp = true) { + return new UdpLogger(szHost, uPort, level, bPrintTimeStamp); + } + + private: + LoggerFactory() {} + + class FileLogger : public Logger { + public: + FileLogger(std::string strFilePath, LogLevel level, bool bPrintTimeStamp) : Logger(level, bPrintTimeStamp) { + pFileOut = new std::ofstream(); + pFileOut->open(strFilePath.c_str()); + } + ~FileLogger() { pFileOut->close(); } + std::ostream &GetStream() { return *pFileOut; } + + private: + std::ofstream *pFileOut; + }; + + class ConsoleLogger : public Logger { + public: + ConsoleLogger(LogLevel level, bool bPrintTimeStamp) : Logger(level, bPrintTimeStamp) {} + std::ostream &GetStream() { return std::cout; } + }; + + class UdpLogger : public Logger { + private: + class UdpOstream : public std::ostream { + public: + UdpOstream(char *szHost, unsigned short uPort) : std::ostream(&sb), socket(INVALID_SOCKET) { +#ifdef _WIN32 + WSADATA w; + if (WSAStartup(0x0101, &w) != 0) { + fprintf(stderr, "WSAStartup() failed.\n"); + return; + } +#endif + socket = ::socket(AF_INET, SOCK_DGRAM, 0); + if (socket == INVALID_SOCKET) { +#ifdef _WIN32 + WSACleanup(); +#endif + fprintf(stderr, "socket() failed.\n"); + return; + } +#ifdef _WIN32 + unsigned int b1, b2, b3, b4; + sscanf(szHost, "%u.%u.%u.%u", &b1, &b2, &b3, &b4); + struct in_addr addr = {(unsigned char)b1, (unsigned char)b2, (unsigned char)b3, (unsigned char)b4}; +#else + struct in_addr addr = {inet_addr(szHost)}; +#endif + struct sockaddr_in s = {AF_INET, htons(uPort), addr}; + server = s; + } + ~UdpOstream() throw() { + if (socket == INVALID_SOCKET) { + return; + } +#ifdef _WIN32 + closesocket(socket); + WSACleanup(); +#else + close(socket); +#endif + } + void Flush() { + if (sendto(socket, sb.str().c_str(), (int)sb.str().length() + 1, 0, (struct sockaddr *)&server, + (int)sizeof(sockaddr_in)) == -1) { + fprintf(stderr, "sendto() failed.\n"); + } + sb.str(""); + } + + private: + std::stringbuf sb; + SOCKET socket; + struct sockaddr_in server; + }; + + public: + UdpLogger(char *szHost, unsigned uPort, LogLevel level, bool bPrintTimeStamp) + : Logger(level, bPrintTimeStamp), udpOut(szHost, (unsigned short)uPort) {} + UdpOstream &GetStream() { return udpOut; } + virtual void FlushStream() { udpOut.Flush(); } + + private: + UdpOstream udpOut; + }; +}; + +class LogTransaction { + public: + LogTransaction(Logger *pLogger, LogLevel level, const char *szFile, const int nLine, const char *szFunc) + : pLogger(pLogger), level(level) { + if (!pLogger) { + std::cout << "[-----] "; + return; + } + if 
(!pLogger->ShouldLogFor(level)) {
+            return;
+        }
+        pLogger->EnterCriticalSection();
+        pLogger->GetStream() << pLogger->GetLead(level, szFile, nLine, szFunc);
+    }
+    ~LogTransaction() {
+        if (!pLogger) {
+            std::cout << std::endl;
+            return;
+        }
+        if (!pLogger->ShouldLogFor(level)) {
+            return;
+        }
+        pLogger->GetStream() << std::endl;
+        pLogger->FlushStream();
+        pLogger->LeaveCriticalSection();
+        if (level == FATAL) {
+            exit(1);
+        }
+    }
+    std::ostream &GetStream() {
+        if (!pLogger) {
+            return std::cout;
+        }
+        if (!pLogger->ShouldLogFor(level)) {
+            return ossNull;
+        }
+        return pLogger->GetStream();
+    }
+
+  private:
+    Logger *pLogger;
+    LogLevel level;
+    std::ostringstream ossNull;
+};
+
+} // namespace simplelogger
+
+extern simplelogger::Logger *logger;
+#define LOG(level) simplelogger::LogTransaction(logger, level, __FILE__, __LINE__, __FUNCTION__).GetStream()
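Every binary that uses LOG needs the single global logger the macro refers to; the samples typically create a console logger at startup. A minimal sketch:

    #include "Logger.h"

    // One global instance per binary; LOG(level) writes through it.
    simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();

    int main() {
        LOG(INFO) << "decode session starting";
        LOG(WARNING) << "only " << 3 << " of 4 sessions created";
        return 0; // a FATAL message would exit(1) from ~LogTransaction
    }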
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h b/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h
new file mode 100644
index 00000000..065a7cd9
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h
@@ -0,0 +1,547 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+//---------------------------------------------------------------------------
+//! \file NvCodecUtils.h
+//! \brief Miscellaneous classes and error checking functions.
+//!
+//! Used by Transcode/Encode sample apps for reading input files, multithreading, performance measurement, and
+//! colorspace conversion while decoding.
+//---------------------------------------------------------------------------
+
+#pragma once
+#include "Logger.h"
+#include <assert.h>
+#include <chrono>
+#include <condition_variable>
+#include <iomanip>
+#include <ios>
+#include <list>
+#include <sstream>
+#include <stdint.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <thread>
+#include <vector>
+
+extern simplelogger::Logger *logger;
+
+#ifdef __cuda_cuda_h__
+inline bool check(CUresult e, int iLine, const char *szFile) {
+    if (e != CUDA_SUCCESS) {
+        const char *szErrName = NULL;
+        cuGetErrorName(e, &szErrName);
+        LOG(FATAL) << "CUDA driver API error " << szErrName << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+#endif
+
+#ifdef __CUDA_RUNTIME_H__
+inline bool check(cudaError_t e, int iLine, const char *szFile) {
+    if (e != cudaSuccess) {
+        LOG(FATAL) << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+#endif
+
+#ifdef _NV_ENCODEAPI_H_
+inline bool check(NVENCSTATUS e, int iLine, const char *szFile) {
+    const char *aszErrName[] = {
+        "NV_ENC_SUCCESS",
+        "NV_ENC_ERR_NO_ENCODE_DEVICE",
+        "NV_ENC_ERR_UNSUPPORTED_DEVICE",
+        "NV_ENC_ERR_INVALID_ENCODERDEVICE",
+        "NV_ENC_ERR_INVALID_DEVICE",
+        "NV_ENC_ERR_DEVICE_NOT_EXIST",
+        "NV_ENC_ERR_INVALID_PTR",
+        "NV_ENC_ERR_INVALID_EVENT",
+        "NV_ENC_ERR_INVALID_PARAM",
+        "NV_ENC_ERR_INVALID_CALL",
+        "NV_ENC_ERR_OUT_OF_MEMORY",
+        "NV_ENC_ERR_ENCODER_NOT_INITIALIZED",
+        "NV_ENC_ERR_UNSUPPORTED_PARAM",
+        "NV_ENC_ERR_LOCK_BUSY",
+        "NV_ENC_ERR_NOT_ENOUGH_BUFFER",
+        "NV_ENC_ERR_INVALID_VERSION",
+        "NV_ENC_ERR_MAP_FAILED",
+        "NV_ENC_ERR_NEED_MORE_INPUT",
+        "NV_ENC_ERR_ENCODER_BUSY",
+        "NV_ENC_ERR_EVENT_NOT_REGISTERED",
+        "NV_ENC_ERR_GENERIC",
+        "NV_ENC_ERR_INCOMPATIBLE_CLIENT_KEY",
+        "NV_ENC_ERR_UNIMPLEMENTED",
+        "NV_ENC_ERR_RESOURCE_REGISTER_FAILED",
+        "NV_ENC_ERR_RESOURCE_NOT_REGISTERED",
+        "NV_ENC_ERR_RESOURCE_NOT_MAPPED",
+    };
+    if (e != NV_ENC_SUCCESS) {
+        LOG(FATAL) << "NVENC error " << aszErrName[e] << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+#endif
+
+#ifdef _WINERROR_
+inline bool check(HRESULT e, int iLine, const char *szFile) {
+    if (e != S_OK) {
+        std::stringstream stream;
+        stream << std::hex << std::uppercase << e;
+        LOG(FATAL) << "HRESULT error 0x" << stream.str() << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+#endif
+
+#if defined(__gl_h_) || defined(__GL_H__)
+inline bool check(GLenum e, int iLine, const char *szFile) {
+    if (e != 0) {
+        LOG(ERROR) << "GLenum error " << e << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+#endif
+
+inline bool check(int e, int iLine, const char *szFile) {
+    if (e < 0) {
+        LOG(ERROR) << "General error " << e << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+
+#define ck(call) check(call, __LINE__, __FILE__)
+#define MAKE_FOURCC(ch0, ch1, ch2, ch3) \
+    ((uint32_t)(uint8_t)(ch0) | ((uint32_t)(uint8_t)(ch1) << 8) | ((uint32_t)(uint8_t)(ch2) << 16) | \
+     ((uint32_t)(uint8_t)(ch3) << 24))
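Because check() is overloaded on the return type, the single ck() macro can wrap CUDA driver, CUDA runtime, NVENC, and plain-int calls alike; which overloads are compiled in depends on the headers included beforehand. A sketch with driver-API calls (any CUresult-returning call works the same way):

    #include <cuda.h> // defines __cuda_cuda_h__, enabling the CUresult overload
    #include "NvCodecUtils.h"

    simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();

    int main() {
        ck(cuInit(0)); // on failure, logs FATAL with file and line, then exits
        CUdevice dev;
        ck(cuDeviceGet(&dev, 0));
        CUcontext ctx;
        ck(cuCtxCreate(&ctx, 0, dev));
        ck(cuCtxDestroy(ctx));
        return 0;
    }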
+
+/**
+ * @brief Wrapper class around std::thread
+ */
+class NvThread {
+  public:
+    NvThread() = default;
+    NvThread(const NvThread &) = delete;
+    NvThread &operator=(const NvThread &other) = delete;
+
+    NvThread(std::thread &&thread) : t(std::move(thread)) {}
+
+    NvThread(NvThread &&thread) : t(std::move(thread.t)) {}
+
+    NvThread &operator=(NvThread &&other) {
+        t = std::move(other.t);
+        return *this;
+    }
+
+    ~NvThread() { join(); }
+
+    void join() {
+        if (t.joinable()) {
+            t.join();
+        }
+    }
+
+  private:
+    std::thread t;
+};
+
+#ifndef _WIN32
+#define _stricmp strcasecmp
+#define _stat64 stat64
+#endif
+
+/**
+ * @brief Utility class to allocate buffer memory. Helps avoid I/O during the encode/decode loop in case of performance
+ * tests.
+ */
+class BufferedFileReader {
+  public:
+    /**
+     *   @brief  Constructor function to allocate appropriate memory and copy file contents into it
+     */
+    BufferedFileReader(const char *szFileName, bool bPartial = false) {
+        struct _stat64 st;
+
+        if (_stat64(szFileName, &st) != 0) {
+            return;
+        }
+
+        nSize = st.st_size;
+        while (nSize) {
+            try {
+                pBuf = new uint8_t[(size_t)nSize];
+                if (nSize != st.st_size) {
+                    LOG(WARNING) << "File is too large - only " << std::setprecision(4) << 100.0 * nSize / st.st_size
+                                 << "% is loaded";
+                }
+                break;
+            } catch (std::bad_alloc) {
+                if (!bPartial) {
+                    LOG(ERROR) << "Failed to allocate memory in BufferedFileReader";
+                    return;
+                }
+                nSize = (uint32_t)(nSize * 0.9);
+            }
+        }
+
+        std::ifstream fpIn(szFileName, std::ifstream::in | std::ifstream::binary);
+        if (!fpIn) {
+            LOG(ERROR) << "Unable to open input file: " << szFileName;
+            return;
+        }
+
+        std::streamsize nRead = fpIn.read(reinterpret_cast<char *>(pBuf), nSize).gcount();
+        fpIn.close();
+
+        assert(nRead == nSize);
+    }
+    ~BufferedFileReader() {
+        if (pBuf) {
+            delete[] pBuf;
+        }
+    }
+    bool GetBuffer(uint8_t **ppBuf, uint64_t *pnSize) {
+        if (!pBuf) {
+            return false;
+        }
+
+        *ppBuf = pBuf;
+        *pnSize = nSize;
+        return true;
+    }
+
+  private:
+    uint8_t *pBuf = NULL;
+    uint64_t nSize = 0;
+};
+
+/**
+ * @brief Template class to facilitate color space conversion
+ */
+template <typename T> class YuvConverter {
+  public:
+    YuvConverter(int nWidth, int nHeight) : nWidth(nWidth), nHeight(nHeight) {
+        pQuad = new T[((nWidth + 1) / 2) * ((nHeight + 1) / 2)];
+    }
+    ~YuvConverter() { delete[] pQuad; }
+    void PlanarToUVInterleaved(T *pFrame, int nPitch = 0) {
+        if (nPitch == 0) {
+            nPitch = nWidth;
+        }
+
+        // sizes of source surface plane
+        int nSizePlaneY = nPitch * nHeight;
+        int nSizePlaneU = ((nPitch + 1) / 2) * ((nHeight + 1) / 2);
+        int nSizePlaneV = nSizePlaneU;
+
+        T *puv = pFrame + nSizePlaneY;
+        if (nPitch == nWidth) {
+            memcpy(pQuad, puv, nSizePlaneU * sizeof(T));
+        } else {
+            for (int i = 0; i < (nHeight + 1) / 2; i++) {
+                memcpy(pQuad + ((nWidth + 1) / 2) * i, puv + ((nPitch + 1) / 2) * i, ((nWidth + 1) / 2) * sizeof(T));
+            }
+        }
+        T *pv = puv + nSizePlaneU;
+        for (int y = 0; y < (nHeight + 1) / 2; y++) {
+            for (int x = 0; x < (nWidth + 1) / 2; x++) {
+                puv[y * nPitch + x * 2] = pQuad[y * ((nWidth + 1) / 2) + x];
+                puv[y * nPitch + x * 2 + 1] = pv[y * ((nPitch + 1) / 2) + x];
+            }
+        }
+    }
+    void UVInterleavedToPlanar(T *pFrame, int nPitch = 0) {
+        if (nPitch == 0) {
+            nPitch = nWidth;
+        }
+
+        // sizes of source surface plane
+        int nSizePlaneY = nPitch * nHeight;
+        int nSizePlaneU = ((nPitch + 1) / 2) * ((nHeight + 1) / 2);
+        int nSizePlaneV = nSizePlaneU;
+
+        T *puv = pFrame + nSizePlaneY, *pu = puv, *pv = puv + nSizePlaneU;
+
+        // split chroma from interleave to planar
+        for (int y = 0; y < (nHeight + 1) / 2; y++) {
+            for (int x = 0; x < (nWidth + 1) / 2; x++) {
+                pu[y * ((nPitch + 1) / 2) + x] = puv[y * nPitch + x * 2];
+                pQuad[y * ((nWidth + 1) / 2) + x] = puv[y * nPitch + x * 2 + 1];
+            }
+        }
+        if (nPitch == nWidth) {
+            memcpy(pv, pQuad, nSizePlaneV * sizeof(T));
+        } else {
+            for (int i = 0; i < (nHeight + 1) / 2; i++) {
+                memcpy(pv + ((nPitch + 1) / 2) * i, pQuad + ((nWidth + 1) / 2) * i, ((nWidth + 1) / 2) * sizeof(T));
+            }
+        }
+    }
+
+  private:
+    T *pQuad;
+    int nWidth, nHeight;
+};
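YuvConverter reorders chroma in place between planar I420-style layout and the semi-planar NV12 layout NVDEC consumes and produces, using pQuad as scratch space for one chroma plane. A sketch with an 8-bit host-memory frame (dimensions are arbitrary):

    // Sketch only: round-trip a host NV12 buffer through planar layout.
    int w = 1920, h = 1080;
    std::vector<uint8_t> frame(w * h * 3 / 2); // Y plane followed by chroma
    YuvConverter<uint8_t> conv(w, h);
    conv.UVInterleavedToPlanar(frame.data()); // NV12 -> I420-style planes
    conv.PlanarToUVInterleaved(frame.data()); // and back to NV12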
+
+/**
+ * @brief Class for writing IVF format header for AV1 codec
+ */
+class IVFUtils {
+  public:
+    void WriteFileHeader(std::vector<uint8_t> &vPacket, uint32_t nFourCC, uint32_t nWidth, uint32_t nHeight,
+                         uint32_t nFrameRateNum, uint32_t nFrameRateDen, uint32_t nFrameCnt) {
+        char header[32];
+
+        header[0] = 'D';
+        header[1] = 'K';
+        header[2] = 'I';
+        header[3] = 'F';
+        mem_put_le16(header + 4, 0);              // version
+        mem_put_le16(header + 6, 32);             // header size
+        mem_put_le32(header + 8, nFourCC);        // fourcc
+        mem_put_le16(header + 12, nWidth);        // width
+        mem_put_le16(header + 14, nHeight);       // height
+        mem_put_le32(header + 16, nFrameRateNum); // rate
+        mem_put_le32(header + 20, nFrameRateDen); // scale
+        mem_put_le32(header + 24, nFrameCnt);     // length
+        mem_put_le32(header + 28, 0);             // unused
+
+        vPacket.insert(vPacket.end(), &header[0], &header[32]);
+    }
+
+    void WriteFrameHeader(std::vector<uint8_t> &vPacket, size_t nFrameSize, int64_t pts) {
+        char header[12];
+        mem_put_le32(header, (int)nFrameSize);
+        mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF));
+        mem_put_le32(header + 8, (int)(pts >> 32));
+
+        vPacket.insert(vPacket.end(), &header[0], &header[12]);
+    }
+
+  private:
+    static inline void mem_put_le32(void *vmem, int val) {
+        unsigned char *mem = (unsigned char *)vmem;
+        mem[0] = (unsigned char)((val >> 0) & 0xff);
+        mem[1] = (unsigned char)((val >> 8) & 0xff);
+        mem[2] = (unsigned char)((val >> 16) & 0xff);
+        mem[3] = (unsigned char)((val >> 24) & 0xff);
+    }
+
+    static inline void mem_put_le16(void *vmem, int val) {
+        unsigned char *mem = (unsigned char *)vmem;
+        mem[0] = (unsigned char)((val >> 0) & 0xff);
+        mem[1] = (unsigned char)((val >> 8) & 0xff);
+    }
+};
+
+/**
+ * @brief Utility class to measure elapsed time in seconds between the block of executed code
+ */
+class StopWatch {
+  public:
+    void Start() { t0 = std::chrono::high_resolution_clock::now(); }
+    double Stop() {
+        return std::chrono::duration_cast<std::chrono::nanoseconds>(
+                   std::chrono::high_resolution_clock::now().time_since_epoch() - t0.time_since_epoch())
+                   .count() /
+               1.0e9;
+    }
+
+  private:
+    std::chrono::high_resolution_clock::time_point t0;
+};
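StopWatch is the timing primitive behind NvDecoder's startTimer()/stopTimer() above; Stop() returns the seconds elapsed since Start(). A sketch of the FPS arithmetic a decode-perf run boils down to (DecodeAllFrames is a hypothetical helper):

    StopWatch watch;
    watch.Start();
    int nFrames = DecodeAllFrames(); // hypothetical: returns frames decoded
    double seconds = watch.Stop();
    std::cout << "decode rate: " << nFrames / seconds << " FPS" << std::endl;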
+
+template <typename T> class ConcurrentQueue {
+  public:
+    ConcurrentQueue() {}
+    ConcurrentQueue(size_t size) : maxSize(size) {}
+    ConcurrentQueue(const ConcurrentQueue &) = delete;
+    ConcurrentQueue &operator=(const ConcurrentQueue &) = delete;
+
+    void setSize(size_t s) { maxSize = s; }
+
+    void push_back(const T &value) {
+        // Do not use a std::lock_guard here. We will need to explicitly
+        // unlock before notify_one as the other waiting thread will
+        // automatically try to acquire mutex once it wakes up
+        // (which will happen on notify_one)
+        std::unique_lock<std::mutex> lock(m_mutex);
+        auto wasEmpty = m_List.empty();
+
+        while (full()) {
+            m_cond.wait(lock);
+        }
+
+        m_List.push_back(value);
+        if (wasEmpty && !m_List.empty()) {
+            lock.unlock();
+            m_cond.notify_one();
+        }
+    }
+
+    T pop_front() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+
+        while (m_List.empty()) {
+            m_cond.wait(lock);
+        }
+        auto wasFull = full();
+        T data = std::move(m_List.front());
+        m_List.pop_front();
+
+        if (wasFull && !full()) {
+            lock.unlock();
+            m_cond.notify_one();
+        }
+
+        return data;
+    }
+
+    T front() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+
+        while (m_List.empty()) {
+            m_cond.wait(lock);
+        }
+
+        return m_List.front();
+    }
+
+    size_t size() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        return m_List.size();
+    }
+
+    bool empty() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        return m_List.empty();
+    }
+    void clear() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        m_List.clear();
+    }
+
+  private:
+    bool full() {
+        if (maxSize > 0 && m_List.size() == maxSize)
+            return true;
+        return false;
+    }
+
+  private:
+    std::list<T> m_List;
+    std::mutex m_mutex;
+    std::condition_variable m_cond;
+    size_t maxSize = 0;
+};
+
+inline void CheckInputFile(const char *szInFilePath) {
+    std::ifstream fpIn(szInFilePath, std::ios::in | std::ios::binary);
+    if (fpIn.fail()) {
+        std::ostringstream err;
+        err << "Unable to open input file: " << szInFilePath << std::endl;
+        throw std::invalid_argument(err.str());
+    }
+}
+
+inline void ValidateResolution(int nWidth, int nHeight) {
+
+    if (nWidth <= 0 || nHeight <= 0) {
+        std::ostringstream err;
+        err << "Please specify a positive, non-zero resolution as -s WxH. Current resolution is " << nWidth << "x"
+            << nHeight << std::endl;
+        throw std::invalid_argument(err.str());
+    }
+}
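ConcurrentQueue above is a bounded producer/consumer queue: push_back blocks while the queue is full, pop_front blocks while it is empty, and each side wakes the other through the shared condition variable. A minimal single-producer, single-consumer sketch:

    // Sketch only: hand integers from one thread to another through the queue.
    ConcurrentQueue<int> q(4); // the producer blocks once 4 items are pending
    std::thread producer([&q] {
        for (int i = 0; i < 16; i++)
            q.push_back(i);
        q.push_back(-1); // sentinel so the consumer can stop
    });
    std::thread consumer([&q] {
        while (q.pop_front() != -1) {
            // consume the value here
        }
    });
    producer.join();
    consumer.join();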
+
+template <class COLOR32>
+void Nv12ToColor32(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 0);
+template <class COLOR64>
+void Nv12ToColor64(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 0);
+
+template <class COLOR32>
+void P016ToColor32(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 4);
+template <class COLOR64>
+void P016ToColor64(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 4);
+
+template <class COLOR32>
+void YUV444ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                     int iMatrix = 0);
+template <class COLOR64>
+void YUV444ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                     int iMatrix = 0);
+
+template <class COLOR32>
+void YUV444P16ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                        int iMatrix = 4);
+template <class COLOR64>
+void YUV444P16ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                        int iMatrix = 4);
+
+template <class COLOR32>
+void Nv12ToColorPlanar(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                       int iMatrix = 0);
+template <class COLOR32>
+void P016ToColorPlanar(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                       int iMatrix = 4);
+
+template <class COLOR32>
+void YUV444ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                         int iMatrix = 0);
+template <class COLOR32>
+void YUV444P16ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                            int iMatrix = 4);
+
+void Bgra64ToP016(uint8_t *dpBgra, int nBgraPitch, uint8_t *dpP016, int nP016Pitch, int nWidth, int nHeight,
+                  int iMatrix = 4);
+
+void ConvertUInt8ToUInt16(uint8_t *dpUInt8, uint16_t *dpUInt16, int nSrcPitch, int nDestPitch, int nWidth, int nHeight);
+void ConvertUInt16ToUInt8(uint16_t *dpUInt16, uint8_t *dpUInt8, int nSrcPitch, int nDestPitch, int nWidth, int nHeight);
+
+void ResizeNv12(unsigned char *dpDstNv12, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcNv12,
+                int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char *dpDstNv12UV = nullptr);
+void ResizeP016(unsigned char *dpDstP016, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcP016,
+                int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char *dpDstP016UV = nullptr);
+
+void ScaleYUV420(unsigned char *dpDstY, unsigned char *dpDstU, unsigned char *dpDstV, int nDstPitch,
+                 int nDstChromaPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcY, unsigned char *dpSrcU,
+                 unsigned char *dpSrcV, int nSrcPitch, int nSrcChromaPitch, int nSrcWidth, int nSrcHeight,
+                 bool bSemiplanar);
+
+#ifdef __cuda_cuda_h__
+void ComputeCRC(uint8_t *pBuffer, uint32_t *crcValue, CUstream_st *outputCUStream);
+#endif
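Taken together, these utilities outline how the NVDEC decode-performance benchmark in this patch is structured: one demuxer/decoder pair per session, several sessions decoding in parallel, and FPS computed from the aggregate frame count over wall-clock time. A simplified sketch of that shape, assuming the headers above are included (the real AppDecPerf adds thread pooling via ThreadPoolUtils.h and the session-overhead accounting shown in NvDecoder; names here are illustrative):

    // Sketch only: N parallel decode sessions, aggregate FPS.
    std::vector<NvThread> threads;
    std::atomic<int> totalFrames{0};
    StopWatch watch;
    watch.Start();
    for (int i = 0; i < nSessions; i++) {
        threads.push_back(NvThread(std::thread([&] {
            FFmpegDemuxer demuxer(szInFilePath); // one demuxer per session
            NvDecoder dec(cuContext, true, FFmpeg2NvCodecId(demuxer.GetVideoCodec()));
            uint8_t *pVideo = nullptr;
            int nVideoBytes = 0;
            do {
                demuxer.Demux(&pVideo, &nVideoBytes);
                totalFrames += dec.Decode(pVideo, nVideoBytes); // frames that became ready
            } while (nVideoBytes);
        })));
    }
    for (auto &t : threads)
        t.join(); // NvThread also joins on destruction
    std::cout << totalFrames / watch.Stop() << " FPS" << std::endl;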