diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml
index 3afcd49f..2d953d65 100644
--- a/.azure-pipelines/cuda-unit-test.yml
+++ b/.azure-pipelines/cuda-unit-test.yml
@@ -11,7 +11,7 @@ pool:
 
 container:
   image: nvcr.io/nvidia/pytorch:20.12-py3
-  options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker'
+  options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/'
 
 steps:
   - script: |
@@ -21,6 +21,8 @@ steps:
       python3 -m pip install --upgrade pip setuptools==65.7
       python3 -m pip install .[test,nvworker]
       make postinstall
+      sudo DEBIAN_FRONTEND=noninteractive apt-get update
+      sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev
     displayName: Install dependencies
   - script: |
       python3 setup.py lint
@@ -31,7 +33,7 @@ steps:
   - script: |
       SB_MICRO_PATH=$PWD python3 setup.py test
     displayName: Run unit tests
-    timeoutInMinutes: 15
+    timeoutInMinutes: 30
   - script: |
       bash <(curl -s https://codecov.io/bash) -cF cuda-unit-test
     displayName: Report coverage results
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index ef9f652b..e53acebf 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -49,6 +49,10 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v3
+      - name: Install Dependency
+        run: |
+          DEBIAN_FRONTEND=noninteractive apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v2
         with:
diff --git a/.gitignore b/.gitignore
index e1ab18ca..5888455a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,9 +9,6 @@ __pycache__/
 *.py[cod]
 *$py.class
 
-# C extensions
-*.so
-
 # Distribution / packaging
 .Python
 build/
diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
index 8b92c546..d7feb2ba 100644
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -26,13 +26,18 @@ RUN apt-get update && \
     build-essential \
     curl \
     dmidecode \
+    ffmpeg \
     git \
     iproute2 \
     jq \
     libaio-dev \
+    libavcodec-dev \
+    libavformat-dev \
+    libavutil-dev \
     libcap2 \
     libnuma-dev \
     libpci-dev \
+    libswresample-dev \
     libtinfo5 \
     libtool \
     lshw \
diff --git a/dockerfile/cuda12.1.dockerfile b/dockerfile/cuda12.1.dockerfile
index 4a257bf4..2f9e430f 100644
--- a/dockerfile/cuda12.1.dockerfile
+++ b/dockerfile/cuda12.1.dockerfile
@@ -25,14 +25,19 @@ RUN apt-get update && \
     build-essential \
     curl \
     dmidecode \
+    ffmpeg \
     git \
     iproute2 \
     jq \
     libaio-dev \
+    libavcodec-dev \
+    libavformat-dev \
+    libavutil-dev \
     libboost-program-options-dev \
     libcap2 \
     libnuma-dev \
     libpci-dev \
+    libswresample-dev \
     libtinfo5 \
     libtool \
     lshw \
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp
new file mode 100644
index 00000000..1ae5ae12
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp
@@ -0,0 +1,454 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <algorithm>
+#include <chrono>
+#include <cuda.h>
+#include <cudaProfiler.h>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <stdio.h>
+#include <string.h>
+#include <string>
+#include <thread>
+
+#include "../Utils/FFmpegDemuxer.h"
+#include "../Utils/NvCodecUtils.h"
+#include "OptimizedNvDecoder.h"
+#include "ThreadPoolUtils.h"
+
+// Console logger object; defined here because the third-party utility code requires it
+simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();
+
+// Codec names accepted by the -codec option (looked up in lower case), listed alphabetically
+std::map<std::string, cudaVideoCodec_enum> codecMap = {
+    {"av1", cudaVideoCodec_AV1},           {"h264", cudaVideoCodec_H264},   {"h264_mvc", cudaVideoCodec_H264_MVC},
+    {"h264_svc", cudaVideoCodec_H264_SVC}, {"hevc", cudaVideoCodec_HEVC},   {"jpeg", cudaVideoCodec_JPEG},
+    {"mpeg1", cudaVideoCodec_MPEG1},       {"mpeg2", cudaVideoCodec_MPEG2}, {"mpeg4", cudaVideoCodec_MPEG4},
+    {"vc1", cudaVideoCodec_VC1},           {"vp8", cudaVideoCodec_VP8},     {"vp9", cudaVideoCodec_VP9}};
+
+/**
+ *   @brief  Function to demux one video file with FFmpegDemuxer and decode every packet with pDec
+ *   @param  pDec         - Handle to OptimizedNvDecoder
+ *   @param  szInFilePath - Path of the video file to demux and decode
+ *   @param  pnFrame      - Receives the number of frames decoded; not written when an exception occurs
+ *   @param  ex           - Stores current exception in case of failure
+ */
+void DecProc(OptimizedNvDecoder *pDec, const char *szInFilePath, int *pnFrame, std::exception_ptr &ex) {
+    try {
+        std::unique_ptr<FFmpegDemuxer> demuxer(new FFmpegDemuxer(szInFilePath));
+        int nVideoBytes = 0, nFrameReturned = 0, nFrame = 0;
+        uint8_t *pVideo = NULL;
+        do {
+            // Demux video from file using FFmpegDemuxer
+            demuxer->Demux(&pVideo, &nVideoBytes);
+            // The final zero-byte packet is passed on as well: Decode() turns it into the end-of-stream marker
+            nFrameReturned = pDec->Decode(pVideo, nVideoBytes);
+            if (!nFrame && nFrameReturned)
+                LOG(INFO) << pDec->GetVideoInfo();
+            nFrame += nFrameReturned;
+        } while (nVideoBytes);
+        *pnFrame = nFrame;
+    } catch (std::exception &) {
+        ex = std::current_exception();
+    }
+}
+
+/**
+ *   @brief  Function to show help message and exit with code 0
+ *   @param  szBadOption - Offending option; when set, the text is thrown as std::invalid_argument instead
+ */
+void ShowHelpAndExit(const char *szBadOption = NULL) {
+    std::ostringstream oss;
+    if (szBadOption) {
+        oss << "Error parsing \"" << szBadOption << "\"" << std::endl;
+    }
+    oss << "Options:" << std::endl
+        << "-i           Input file path. No default value. One of -i and -multi_input is required." << std::endl
+        << "-o           Output file path of raw data. No default value. Optional." << std::endl
+        << "-gpu         Ordinal of GPU to use. Default 0. Optional." << std::endl
+        << "-thread      Number of decoding thread. Default 5. Optional." << std::endl
+        << "-total       Number of total video to test. Default 100. Optional." << std::endl
+        << "-single      (No value) Use single cuda context for every thread. Default is multi-context, one context "
+           "per thread."
+        << std::endl
+        << "-host        (No value) Copy frame to host memory. Default is device memory." << std::endl
+        << "-multi_input The file path which lists the path of multiple video in each line." << std::endl
+        << "-codec       The codec of video to test. Default H264." << std::endl;
+    if (szBadOption) {
+        // Parse failure: the caller decides how to report it (main prints what() and exits with 1)
+        throw std::invalid_argument(oss.str());
+    }
+    // Plain help request: print the usage text and terminate successfully
+    std::cout << oss.str();
+    exit(0);
+}
+
+/**
+ *   @brief  Function to parse commandline arguments; option names are matched with _stricmp
+ *   @param  argc, argv         - Command line as received by main
+ *   @param  szInputFileName    - Receives the -i path; must have room for 256 bytes (main passes char[256])
+ *   @param  iGpu               - Receives the -gpu value
+ *   @param  nThread            - Receives the -thread value (must be positive)
+ *   @param  nTotalVideo        - Receives the -total value (must be positive)
+ *   @param  bSingle, bHost     - Set to true by -single / -host
+ *   @param  inputFilesListPath - Receives the -multi_input path
+ *   @param  outputFile         - Receives the -o path
+ *   @param  codec              - Receives the codecMap entry named by -codec
+ *   @note   Unknown options, missing values, an over-long -i path and non-positive -thread/-total values throw
+ *           std::invalid_argument through ShowHelpAndExit; an unknown codec name exits with code 1
+ */
+void ParseCommandLine(int argc, char *argv[], char *szInputFileName, int &iGpu, int &nThread, int &nTotalVideo,
+                      bool &bSingle, bool &bHost, std::string &inputFilesListPath, std::string &outputFile,
+                      cudaVideoCodec &codec) {
+    // Size of the caller's input-path buffer (char szInFilePath[256] in main)
+    const size_t kMaxInputPath = 256;
+    for (int i = 1; i < argc; i++) {
+        // Step to the value that follows option szName; a missing value is a parse error
+        auto nextValue = [&](const char *szName) -> const char * {
+            if (++i == argc) {
+                ShowHelpAndExit(szName);
+            }
+            return argv[i];
+        };
+        if (!_stricmp(argv[i], "-h")) {
+            // Prints the usage text and exits
+            ShowHelpAndExit();
+        } else if (!_stricmp(argv[i], "-i")) {
+            const char *szPath = nextValue("-i");
+            // Reject a path that does not fit instead of writing past the end of the buffer
+            if (strlen(szPath) >= kMaxInputPath) {
+                ShowHelpAndExit("-i");
+            }
+            snprintf(szInputFileName, kMaxInputPath, "%s", szPath);
+        } else if (!_stricmp(argv[i], "-o")) {
+            outputFile = nextValue("-o");
+        } else if (!_stricmp(argv[i], "-gpu")) {
+            iGpu = atoi(nextValue("-gpu"));
+        } else if (!_stricmp(argv[i], "-thread")) {
+            nThread = atoi(nextValue("-thread"));
+            // One decoder is created per thread, so at least one is needed
+            if (nThread <= 0) {
+                ShowHelpAndExit("-thread");
+            }
+        } else if (!_stricmp(argv[i], "-total")) {
+            nTotalVideo = atoi(nextValue("-total"));
+            // main sizes its per-video vectors with this value
+            if (nTotalVideo <= 0) {
+                ShowHelpAndExit("-total");
+            }
+        } else if (!_stricmp(argv[i], "-multi_input")) {
+            inputFilesListPath = nextValue("-multi_input");
+        } else if (!_stricmp(argv[i], "-single")) {
+            // Flag options take no value
+            bSingle = true;
+        } else if (!_stricmp(argv[i], "-host")) {
+            bHost = true;
+        } else if (!_stricmp(argv[i], "-codec")) {
+            std::string codecName = nextValue("-codec");
+            // codecMap keys are lower case
+            std::transform(codecName.begin(), codecName.end(), codecName.begin(),
+                           [](unsigned char c) { return std::tolower(c); });
+            // Look the name up once and reuse the iterator
+            auto it = codecMap.find(codecName);
+            if (it == codecMap.end()) {
+                std::cout << "Codec name not found in the map." << std::endl;
+                exit(1);
+            }
+            codec = it->second;
+        } else {
+            // Anything else is an unknown option
+            ShowHelpAndExit(argv[i]);
+        }
+    }
+}
+
+/**
+ *  @brief  Create decoder session i (the caller deletes it); a new CUDA context is created first unless bSingle
+ */
+OptimizedNvDecoder *InitOptimizedNvDecoder(int i, const CUdevice &cuDevice, CUcontext &cuContext, bool bSingle,
+                                           bool bHost, cudaVideoCodec codec, CUVIDDECODECAPS decodecaps) {
+    if (bSingle == false) {
+        ck(cuCtxCreate(&cuContext, 0, cuDevice));
+    }
+    OptimizedNvDecoder *pSession = new OptimizedNvDecoder(cuContext, /*bUseDeviceFrame=*/!bHost, codec, decodecaps);
+    pSession->setDecoderSessionID(i);
+    return pSession;
+}
+
+/**
+ *  @brief  Decode one video on decoder vDec[i]; returns the latency in seconds, or 0 if an exception gets here
+ */
+double DecodeVideo(size_t i, const std::vector<OptimizedNvDecoder *> &vDec, const char *szInFilePath, int *pnFrame,
+                   std::exception_ptr &ex) {
+    try {
+        // at() turns a bad decoder index into std::out_of_range, which the handler below reports
+        OptimizedNvDecoder *pDec = vDec.at(i);
+        auto start = std::chrono::high_resolution_clock::now();
+        DecProc(pDec, szInFilePath, pnFrame, ex);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto elapsedTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+        std::cout << "Decode finished --" << " duration:" << elapsedTime << " frames:" << *pnFrame << std::endl;
+        return elapsedTime / 1000.0f;
+    } catch (const std::exception &e) {
+        std::cerr << "Exception in decoding: " << e.what() << std::endl;
+        return 0;
+    }
+}
+
+/**
+ *  @brief  Read one video path per line (blank lines skipped, trailing '\r' dropped); exits with 1 if open fails
+ */
+std::vector<std::string> ReadMultipleVideoFiles(const std::string &filepath) {
+    std::ifstream file(filepath);
+    if (!file) {
+        std::cerr << "Error opening the file." << std::endl;
+        exit(1);
+    }
+    std::vector<std::string> tokens;
+    for (std::string line; std::getline(file, line);) {
+        if (!line.empty() && line.back() == '\r')
+            line.pop_back(); // list file saved with CRLF line endings
+        if (!line.empty())
+            tokens.push_back(line); // an empty entry would later be handed to the decoder as a path
+    }
+    return tokens; // the stream is closed by its destructor
+}
+
+/**
+ * @brief  Zero decodecaps, request codec with 4:2:0 chroma and nBitDepthMinus8 = 0, then query the decoder caps
+ */
+void GetDefaultDecoderCaps(CUVIDDECODECAPS &decodecaps, cudaVideoCodec codec) {
+    memset(&decodecaps, 0, sizeof(CUVIDDECODECAPS));
+    decodecaps.nBitDepthMinus8 = 0;
+    decodecaps.eChromaFormat = cudaVideoChromaFormat_420;
+    decodecaps.eCodecType = codec;
+    NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps));
+}
+
+/**
+ * @brief  Initialize the CUDA driver, select GPU iGpu, create a CUDA context and query the decoder capability,
+ * then create nThread decoders through a thread pool and append them to vDec
+ * @note   Exits with code 1 when iGpu is not a valid device ordinal
+ */
+void InitializeContext(std::vector<OptimizedNvDecoder *> &vDec, int iGpu, int nThread, bool bSingle, bool bHost,
+                       cudaVideoCodec codec) {
+    ck(cuInit(0));
+    int nDevices = 0;
+    ck(cuDeviceGetCount(&nDevices));
+    if (!(iGpu >= 0 && iGpu < nDevices)) {
+        std::cout << "GPU ordinal out of range. Should be within [" << 0 << ", " << nDevices - 1 << "]" << std::endl;
+        exit(1);
+    }
+    CUdevice cuDevice = 0;
+    ck(cuDeviceGet(&cuDevice, iGpu));
+    char szDeviceName[80];
+    ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
+    std::cout << "GPU in use: " << szDeviceName << std::endl;
+    // Handle passed to every decoder task; unless bSingle, InitOptimizedNvDecoder creates a context per task
+    CUcontext cuContext = NULL;
+    ck(cuCtxCreate(&cuContext, 0, cuDevice));
+    CUVIDDECODECAPS decodecaps;
+    GetDefaultDecoderCaps(decodecaps, codec);
+    // NOTE(review): the leading int of InitOptimizedNvDecoder is not passed here; presumably the pool supplies it
+    ThreadPool threadPool(nThread);
+    std::vector<std::future<OptimizedNvDecoder *>> pending;
+    for (int n = 0; n < nThread; ++n) {
+        pending.push_back(
+            threadPool.enqueue(InitOptimizedNvDecoder, cuDevice, cuContext, bSingle, bHost, codec, decodecaps));
+    }
+    for (size_t k = 0; k < pending.size(); ++k) {
+        vDec.push_back(pending[k].get()); // blocks until decoder k has been constructed
+    }
+}
+
+/**
+ * @brief  Function to write the frame latency of each decode call and the latency and FPS of each video to a file
+ * @note   Reports on stderr and writes nothing when the file cannot be opened
+ */
+void WriteRawData(std::vector<OptimizedNvDecoder *> &vDec, int nThread, const std::vector<double> &data,
+                  std::vector<int> &frames, std::string filename) {
+    std::ofstream outputFile(filename);
+    if (!outputFile) {
+        std::cerr << "Error opening the output file: " << filename << std::endl;
+        return;
+    }
+    outputFile << "Frame Latency" << std::endl;
+    for (int i = 0; i < nThread; i++) {
+        for (const auto &tuple : vDec[i]->GetFrameLatency()) {
+            int frame = std::get<0>(tuple);
+            double latency = std::get<1>(tuple);
+            outputFile << "Frame: " << frame << ", Latency: " << latency << std::endl;
+        }
+    }
+    outputFile << "Video Latency" << std::endl;
+    for (double latency : data)
+        outputFile << latency << std::endl;
+    outputFile << "Video FPS" << std::endl;
+    for (size_t i = 0; i < data.size(); i++) {
+        outputFile << frames[i] / data[i] << std::endl; // a latency of 0 prints as inf or nan
+    }
+}
+
+/**
+ * @brief  Function to calculate the statistical metrics (sum, mean, min, max, p50, p90, p95, p99) of originData.
+ *         pXX is the element of the sorted data at index size_t(size * 0.XX); empty input returns all zeros.
+ */
+std::tuple<double, double, double, double, double, double, double, double>
+CalMetrics(const std::vector<double> &originData) {
+    if (originData.empty()) // nothing to divide by or to index
+        return std::make_tuple(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
+    std::vector<double> data = originData;
+    const size_t n = data.size();
+    double sum = std::accumulate(data.begin(), data.end(), 0.0);
+    std::sort(data.begin(), data.end());
+    // Once sorted, the minimum and maximum are the first and last elements; every percentile index is below n
+    // because n * 0.99 < n
+    return std::make_tuple(sum, sum / n, data.front(), data.back(), data[n / 2], data[static_cast<size_t>(n * 0.9)],
+                           data[static_cast<size_t>(n * 0.95)], data[static_cast<size_t>(n * 0.99)]);
+}
+
+/**
+ * @brief  Function to generate the list of the nTotalVideo file paths to decode.
+ *        With a list file: its entries are taken in order and repeated cyclically when nTotalVideo is larger
+ *        than the list, or truncated when it is smaller. An empty list file exits with code 1.
+ *        Without a list file: szInFilePath is repeated nTotalVideo times.
+ */
+std::vector<std::string> GenerateTotalFileList(const std::string &inputFilesListPath, int nTotalVideo,
+                                               const char *szInFilePath) {
+    std::vector<std::string> files;
+    if (inputFilesListPath.empty()) {
+        // Single-input mode: decode the same file over and over
+        for (int i = 0; i < nTotalVideo; i++) {
+            files.push_back(std::string(szInFilePath));
+        }
+        return files;
+    }
+
+    auto videofiles = ReadMultipleVideoFiles(inputFilesListPath);
+    if (videofiles.empty()) {
+        // There is nothing to cycle through; the index computation below would divide by zero
+        std::cerr << "No video path found in the input list file." << std::endl;
+        exit(1);
+    }
+    // i % size walks the list in order and wraps around, covering both the repeat and the truncate case;
+    // a non-positive nTotalVideo simply yields an empty list
+    const size_t nListed = videofiles.size();
+    for (int i = 0; i < nTotalVideo; i++) {
+        files.push_back(videofiles[i % nListed]);
+    }
+
+    std::cout << "Multifile mode - " << nTotalVideo << " videos will be decoded" << std::endl;
+    return files;
+}
+
+/**
+ * @brief  Decode all files on a pool of nThread workers. Fills vnFrame, vExceptionPtrs, vnLatency (seconds per
+ * video), frLatency (seconds per frame of each decode call) and vnFPS, and adds the frames to *nTotalFrames.
+ * @return Wall-clock seconds from the first enqueue until the last task has finished
+ */
+float run(std::vector<OptimizedNvDecoder *> &vDec, int nThread, std::vector<std::string> &files,
+          std::vector<int> &vnFrame, std::vector<std::exception_ptr> &vExceptionPtrs, int *nTotalFrames,
+          std::vector<double> &vnLatency, std::vector<double> &frLatency, std::vector<double> &vnFPS) {
+    // One slot per video is indexed below; grow the caller's vectors first if they were sized too small
+    vnFrame.resize(std::max(vnFrame.size(), files.size()));
+    vExceptionPtrs.resize(std::max(vExceptionPtrs.size(), files.size()));
+    std::vector<std::future<double>> decodeLatencyFutures;
+    ThreadPool threadPool(nThread);
+    auto start = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < files.size(); i++) {
+        auto filePath = files[i].c_str();
+        CheckInputFile(filePath);
+        decodeLatencyFutures.push_back(
+            threadPool.enqueue(DecodeVideo, vDec, filePath, &vnFrame[i], std::ref(vExceptionPtrs[i])));
+    }
+    // Wait until decoding tasks finished
+    for (size_t i = 0; i < files.size(); i++) {
+        vnLatency.push_back(decodeLatencyFutures[i].get());
+        *nTotalFrames += vnFrame[i];
+    }
+    auto elapsedTime =
+        (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start)
+             .count()) /
+        1000.0f;
+    // Per-frame latency: time of each decode call divided by the frames it returned (calls with none are skipped)
+    for (int i = 0; i < nThread; i++) {
+        for (const auto &tuple : vDec[i]->GetFrameLatency()) {
+            int frame = std::get<0>(tuple);
+            double latency = std::get<1>(tuple);
+            if (frame > 0)
+                frLatency.push_back(latency / frame);
+        }
+    }
+    // Per-video FPS; a latency of exactly 0 (DecodeVideo's failure value) is left out
+    for (size_t i = 0; i < vnLatency.size(); i++) {
+        if (vnLatency[i] != 0)
+            vnFPS.push_back(vnFrame[i] / vnLatency[i]);
+    }
+    return elapsedTime;
+}
+
+/**
+ * @brief  Parse the options, decode all videos and print the metrics; exits with 1 if any std::exception occurred
+ */
+int main(int argc, char **argv) {
+    char szInFilePath[256] = "";
+    int iGpu = 0;
+    int nThread = 5;
+    int nTotalVideo = 100;
+    bool bSingle = false;
+    bool bHost = false;
+    std::string inputFilesListPath = "";
+    std::string outputFilePath = "";
+    cudaVideoCodec codec = cudaVideoCodec_H264;
+    try {
+        // Parse the command line arguments
+        ParseCommandLine(argc, argv, szInFilePath, iGpu, nThread, nTotalVideo, bSingle, bHost, inputFilesListPath,
+                         outputFilePath, codec);
+        auto files = GenerateTotalFileList(inputFilesListPath, nTotalVideo, szInFilePath);
+        // Initialize and prepare the decoder context for each thread
+        std::vector<OptimizedNvDecoder *> vDec;
+        InitializeContext(vDec, iGpu, nThread, bSingle, bHost, codec);
+        // Per-video slots are sized only now that -total is known, so run() cannot index past their end
+        std::vector<int> vnFrame(files.size());
+        std::vector<std::exception_ptr> vExceptionPtrs(files.size());
+        int nTotalFrames = 0;
+        std::vector<double> vnLatency;
+        std::vector<double> frLatency;
+        std::vector<double> videoFPS;
+        auto elapsedTime =
+            run(vDec, nThread, files, vnFrame, vExceptionPtrs, &nTotalFrames, vnLatency, frLatency, videoFPS);
+        // Calculate and output the raw data into file and metrics into stdout
+        double sum, mean, min, max, p50, p90, p95, p99;
+        std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(vnLatency);
+        std::cout << "Total Frames Decoded=" << nTotalFrames << " FPS=" << nTotalFrames / elapsedTime << std::endl;
+        std::cout << "Mean Latency for each video=" << mean * 1000 << " P50 Latency=" << p50 * 1000
+                  << " P90 Latency=" << p90 * 1000 << " P95 Latency=" << p95 * 1000 << " P99 Latency=" << p99 * 1000
+                  << "ms" << std::endl;
+        std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(videoFPS);
+        std::cout << "Mean FPS for each video=" << mean << " P50 FPS=" << p50 << " P90 FPS=" << p90
+                  << " P95 FPS=" << p95 << " P99 FPS=" << p99 << std::endl;
+        std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(frLatency);
+        std::cout << "Mean Latency for each frame=" << mean * 1000 << " P50 Latency=" << p50 * 1000
+                  << " P90 Latency=" << p90 * 1000 << " P95 Latency=" << p95 * 1000 << " P99 Latency=" << p99 * 1000
+                  << "ms" << std::endl;
+        if (outputFilePath.size() != 0) {
+            WriteRawData(vDec, nThread, vnLatency, vnFrame, outputFilePath);
+        }
+        // Deinitialization
+        for (OptimizedNvDecoder *pDec : vDec) {
+            delete pDec;
+        }
+        // Surface the first failure recorded for any video, not only for the first nThread of them
+        for (const auto &exPtr : vExceptionPtrs) {
+            if (exPtr) {
+                std::rethrow_exception(exPtr);
+            }
+        }
+    } catch (const std::exception &ex) {
+        std::cout << ex.what();
+        exit(1);
+    }
+    return 0;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
new file mode 100644
index 00000000..83cb1506
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
@@ -0,0 +1,117 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.18)
+project(cuda_decode_performance)
+
+find_package(CUDA QUIET)  # everything below is skipped when CUDA is not found
+if(CUDA_FOUND)
+  set(CMAKE_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+  set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples)
+  set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface)
+  set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils)
+  set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
+  set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder)
+
+  if(CMAKE_SYSTEM_NAME STREQUAL "Linux")  # the FFmpeg libraries are only looked up on Linux
+    find_package(PkgConfig REQUIRED)
+    pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
+    pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
+    pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
+    pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
+
+    set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})  # only libavcodec's include dirs are used for the headers
+    find_library(AVCODEC_LIBRARY NAMES avcodec
+    HINTS
+    ${PC_AVCODEC_LIBDIR}
+    ${PC_AVCODEC_LIBRARY_DIRS}
+    )
+    find_library(AVFORMAT_LIBRARY NAMES avformat
+    HINTS
+    ${PC_AVFORMAT_LIBDIR}
+    ${PC_AVFORMAT_LIBRARY_DIRS}
+    )
+    find_library(AVUTIL_LIBRARY NAMES avutil
+    HINTS
+    ${PC_AVUTIL_LIBDIR}
+    ${PC_AVUTIL_LIBRARY_DIRS}
+    )
+    find_library(SWRESAMPLE_LIBRARY NAMES swresample
+    HINTS
+    ${PC_SWRESAMPLE_LIBDIR}
+    ${PC_SWRESAMPLE_LIBRARY_DIRS}
+    )
+    set(AVCODEC_LIB ${AVCODEC_LIBRARY})
+    set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
+    set(AVUTIL_LIB ${AVUTIL_LIBRARY})
+    set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
+  endif()
+
+  set(APP_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp
+  )
+
+  set(NV_DEC_SOURCES
+  ${NV_DEC_DIR}/NvDecoder.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp
+  )
+
+  set(NV_DEC_HDRS
+  ${NV_DEC_DIR}/NvDecoder.h
+  ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h
+  ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
+  ${NVCODEC_UTILS_DIR}/NvCodecUtils.h
+  ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h
+  )
+
+  source_group( "headers" FILES ${NV_DEC_HDRS} )
+  source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES})
+  set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
+  find_package(CUDA)  # repeated after CMAKE_LIBRARY_PATH was extended with the CUDA lib and stub directories
+  set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\")
+  if ( CMAKE_COMPILER_IS_GNUCC )
+    if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
+      list(APPEND CUDA_NVCC_FLAGS -std=c++11)  # NOTE(review): nvcc gets c++11 while CMAKE_CXX_STANDARD is 17 - confirm
+    endif()
+  endif()
+
+  # Make sure /usr/local/lib/libnvcuvid.so exists; if it does not, symlink it with sudo while CMake configures
+  if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" )
+      execute_process(
+        COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
+        RESULT_VARIABLE result
+      )  # result is 0 on success, otherwise the exit code or an error description
+      if(result)
+        message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}")
+      endif()
+  endif ()
+
+  find_library(CUVID_LIB nvcuvid
+  HINTS
+  "/usr/local/lib/"
+  "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
+  )
+
+  cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS})
+
+  set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+  target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}
+  ${NVCODEC_PUBLIC_INTERFACE_DIR}
+  ${NVCODEC_UTILS_DIR}
+  ${NV_CODEC_DIR}
+  ${NV_APPDEC_COMMON_DIR}  # NOTE(review): not set anywhere in this file - confirm it is still needed
+  ${NV_FFMPEG_HDRS}
+  ${THIRD_PARTY_SAMPLE_DIR}
+  )
+
+  target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
+  ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
+
+  install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
+endif()
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp
new file mode 100644
index 00000000..ee23391b
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp
@@ -0,0 +1,263 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <cmath>
+
+#include "OptimizedNvDecoder.h"
+
+/**
+ * @brief Feed one packet to the video parser and record how long the parse call took.
+ *
+ * Resets the per-call frame counters, wraps the input in a CUVIDSOURCEDATAPACKET and hands it to
+ * cuvidParseVideoData. A null or empty payload is flagged as end of stream.
+ * One (m_nDecodedFrame, seconds) entry is appended to frameLatency per call.
+ *
+ * @param pData      Pointer to the packet payload, may be null to signal end of stream.
+ * @param nSize      Payload size in bytes, 0 also signals end of stream.
+ * @param nFlags     Extra packet flags; CUVID_PKT_TIMESTAMP is always added.
+ * @param nTimestamp Timestamp forwarded to the parser.
+ * @return The value of m_nDecodedFrame after the parse call
+ *         (presumably updated by the parser callbacks - defined outside this file section).
+ */
+int OptimizedNvDecoder::Decode(const uint8_t *pData, int nSize, int nFlags, int64_t nTimestamp) {
+    m_nDecodedFrame = 0;
+    m_nDecodedFrameReturned = 0;
+    CUVIDSOURCEDATAPACKET packet = {0};
+    packet.payload = pData;
+    packet.payload_size = nSize;
+    packet.flags = nFlags | CUVID_PKT_TIMESTAMP;
+    packet.timestamp = nTimestamp;
+    if (!pData || nSize == 0) {
+        packet.flags |= CUVID_PKT_ENDOFSTREAM;
+    }
+    auto start = std::chrono::high_resolution_clock::now();
+    NVDEC_API_CALL(cuvidParseVideoData(m_hParser, &packet));
+    int64_t elapsedTime =
+        std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - start)
+            .count();
+    // Convert microseconds to seconds in double precision: frameLatency stores doubles, and dividing
+    // by float literals would first round the value to single precision.
+    frameLatency.push_back(std::make_tuple(m_nDecodedFrame, elapsedTime / 1.0e6));
+    return m_nDecodedFrame;
+}
+
+/**
+ * @brief Set up a decoder session that reuses decode capabilities already queried by the caller.
+ *
+ * Stores the configuration, creates the context lock and the CUDA stream while cuContext is
+ * current, optionally prepares SEI message extraction, and creates the video parser with this
+ * object as callback user data.
+ *
+ * @param decodecaps               Capabilities queried earlier; kept in m_decodecaps.
+ * @param bLowLatency              When true the parser's max display delay is 0, otherwise 1.
+ * @param pCropRect / pResizeDim   Optional; copied only when non-null.
+ * @param extract_user_SEI_Message When true, "sei_message.txt" is opened and the SEI callback registered.
+ * @param force_zero_latency       When true, no display callback is registered with the parser.
+ */
+OptimizedNvDecoder::OptimizedNvDecoder(CUcontext &cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec,
+                                       CUVIDDECODECAPS decodecaps, bool bLowLatency, bool bDeviceFramePitched,
+                                       const Rect *pCropRect, const Dim *pResizeDim, bool extract_user_SEI_Message,
+                                       int maxWidth, int maxHeight, unsigned int clkRate, bool force_zero_latency) {
+    // Plain state taken directly from the arguments.
+    m_cuContext = cuContext;
+    m_eCodec = eCodec;
+    m_bUseDeviceFrame = bUseDeviceFrame;
+    m_bDeviceFramePitched = bDeviceFramePitched;
+    m_bExtractSEIMessage = extract_user_SEI_Message;
+    m_bForce_zero_latency = force_zero_latency;
+    m_nMaxWidth = maxWidth;
+    m_nMaxHeight = maxHeight;
+    // Reuse the decodecaps queried before.
+    m_decodecaps = decodecaps;
+    decoderSessionID = 0;
+
+    if (pCropRect != nullptr) {
+        m_cropRect = *pCropRect;
+    }
+    if (pResizeDim != nullptr) {
+        m_resizeDim = *pResizeDim;
+    }
+
+    // Everything below needs the context to be current on this thread.
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext));
+
+    ck(cuStreamCreate(&m_cuvidStream, CU_STREAM_DEFAULT));
+
+    if (m_bExtractSEIMessage) {
+        m_fpSEI = fopen("sei_message.txt", "wb");
+        m_pCurrSEIMessage = new CUVIDSEIMESSAGEINFO;
+        memset(&m_SEIMessagesDisplayOrder, 0, sizeof(m_SEIMessagesDisplayOrder));
+    }
+
+    // Zero-initialised, so any callback that is not assigned below stays null.
+    CUVIDPARSERPARAMS parserParams = {};
+    parserParams.CodecType = eCodec;
+    parserParams.ulMaxNumDecodeSurfaces = 1;
+    parserParams.ulClockRate = clkRate;
+    if (bLowLatency) {
+        parserParams.ulMaxDisplayDelay = 0;
+    } else {
+        parserParams.ulMaxDisplayDelay = 1;
+    }
+    parserParams.pUserData = this;
+    parserParams.pfnSequenceCallback = HandleVideoSequenceProc;
+    parserParams.pfnDecodePicture = HandlePictureDecodeProc;
+    if (!m_bForce_zero_latency) {
+        parserParams.pfnDisplayPicture = HandlePictureDisplayProc;
+    }
+    parserParams.pfnGetOperatingPoint = HandleOperatingPointProc;
+    if (m_bExtractSEIMessage) {
+        parserParams.pfnGetSEIMsg = HandleSEIMessagesProc;
+    }
+    NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &parserParams));
+
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+}
+
+/**
+ * @brief Sequence callback body: validate the stream against the decoder capabilities and create
+ *        (or reconfigure) the hardware decoder.
+ *
+ * Steps, in order:
+ *  1. Record a textual description of the stream in m_videoInfo.
+ *  2. Re-query cuvidGetDecoderCaps only when codec, chroma format or bit depth differ from the
+ *     cached m_decodecaps.
+ *  3. Throw (via NVDEC_THROW_ERROR) when the codec, resolution or macroblock count is unsupported.
+ *  4. If a decoder was already created (m_nWidth, m_nLumaHeight and m_nChromaHeight are all
+ *     non-zero), delegate to ReconfigureDecoder.
+ *  5. Otherwise choose the output surface format, fill CUVIDDECODECREATEINFO, create the decoder
+ *     and report the elapsed initialization time.
+ *
+ * @param pVideoFormat Stream format reported by the parser.
+ * @return pVideoFormat->min_num_decode_surfaces, or the result of ReconfigureDecoder on the
+ *         reconfigure path.
+ */
+int OptimizedNvDecoder::HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat) {
+    START_TIMER
+    m_videoInfo.str("");
+    m_videoInfo.clear();
+    m_videoInfo << "Video Input Information" << std::endl
+                << "\tCodec        : " << GetVideoCodecString(pVideoFormat->codec) << std::endl
+                << "\tFrame rate   : " << pVideoFormat->frame_rate.numerator << "/"
+                << pVideoFormat->frame_rate.denominator << " = "
+                << 1.0 * pVideoFormat->frame_rate.numerator / pVideoFormat->frame_rate.denominator << " fps"
+                << std::endl
+                << "\tSequence     : " << (pVideoFormat->progressive_sequence ? "Progressive" : "Interlaced")
+                << std::endl
+                << "\tCoded size   : [" << pVideoFormat->coded_width << ", " << pVideoFormat->coded_height << "]"
+                << std::endl
+                << "\tDisplay area : [" << pVideoFormat->display_area.left << ", " << pVideoFormat->display_area.top
+                << ", " << pVideoFormat->display_area.right << ", " << pVideoFormat->display_area.bottom << "]"
+                << std::endl
+                << "\tChroma       : " << GetVideoChromaFormatString(pVideoFormat->chroma_format) << std::endl
+                << "\tBit depth    : " << pVideoFormat->bit_depth_luma_minus8 + 8;
+    m_videoInfo << std::endl;
+
+    int nDecodeSurface = pVideoFormat->min_num_decode_surfaces;
+
+    // re-call the cuvidGetDecoderCaps when the video codec and format change
+    if (m_decodecaps.eCodecType != pVideoFormat->codec || m_decodecaps.eChromaFormat != pVideoFormat->chroma_format ||
+        m_decodecaps.nBitDepthMinus8 != pVideoFormat->bit_depth_luma_minus8) {
+        m_decodecaps.eCodecType = pVideoFormat->codec;
+        m_decodecaps.eChromaFormat = pVideoFormat->chroma_format;
+        m_decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+
+        CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+        NVDEC_API_CALL(cuvidGetDecoderCaps(&m_decodecaps));
+        CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    }
+
+    if (!m_decodecaps.bIsSupported) {
+        NVDEC_THROW_ERROR("Codec not supported on this GPU", CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if ((pVideoFormat->coded_width > m_decodecaps.nMaxWidth) ||
+        (pVideoFormat->coded_height > m_decodecaps.nMaxHeight)) {
+
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "Resolution          : " << pVideoFormat->coded_width << "x" << pVideoFormat->coded_height
+                    << std::endl
+                    << "Max Supported (wxh) : " << m_decodecaps.nMaxWidth << "x" << m_decodecaps.nMaxHeight << std::endl
+                    << "Resolution not supported on this GPU";
+
+        const std::string cErr = errorString.str();
+        NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+    // Macroblock count is computed from 16x16 blocks, hence the shifts by 4.
+    if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) > m_decodecaps.nMaxMBCount) {
+
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "MBCount             : " << (pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4)
+                    << std::endl
+                    << "Max Supported mbcnt : " << m_decodecaps.nMaxMBCount << std::endl
+                    << "MBCount not supported on this GPU";
+        NVDEC_THROW_ERROR(errorString.str(), CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if (m_nWidth && m_nLumaHeight && m_nChromaHeight) {
+
+        // cuvidCreateDecoder() has been called before, and now there's possible config change
+        return ReconfigureDecoder(pVideoFormat);
+    }
+
+    // eCodec has been set in the constructor (for parser). Here it's set again for potential correction
+    m_eCodec = pVideoFormat->codec;
+    m_eChromaFormat = pVideoFormat->chroma_format;
+    m_nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    m_nBPP = m_nBitDepthMinus8 > 0 ? 2 : 1;
+
+    // Set the output surface format same as chroma format.
+    // Both operands must be compared against m_eChromaFormat: cudaVideoChromaFormat_Monochrome is 0,
+    // so using the bare enumerator as a condition is always false and monochrome would be skipped.
+    if (m_eChromaFormat == cudaVideoChromaFormat_420 || m_eChromaFormat == cudaVideoChromaFormat_Monochrome)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_444)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_422)
+        m_eOutputFormat = cudaVideoSurfaceFormat_NV12; // no 4:2:2 output format supported yet so make 420 default
+
+    // Check if output format supported. If not, check fallback options
+    if (!(m_decodecaps.nOutputFormatMask & (1 << m_eOutputFormat))) {
+        if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12))
+            m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+        else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016))
+            m_eOutputFormat = cudaVideoSurfaceFormat_P016;
+        else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444;
+        else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444_16Bit;
+        else
+            NVDEC_THROW_ERROR("No supported output format found", CUDA_ERROR_NOT_SUPPORTED);
+    }
+    m_videoFormat = *pVideoFormat;
+
+    CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0};
+    videoDecodeCreateInfo.CodecType = pVideoFormat->codec;
+    videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format;
+    videoDecodeCreateInfo.OutputFormat = m_eOutputFormat;
+    videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    if (pVideoFormat->progressive_sequence)
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
+    else
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive;
+    videoDecodeCreateInfo.ulNumOutputSurfaces = 2;
+    // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware
+    videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
+    videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface;
+    videoDecodeCreateInfo.vidLock = m_ctxLock;
+    videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width;
+    videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height;
+    // AV1 has max width/height of sequence in sequence header
+    if (pVideoFormat->codec == cudaVideoCodec_AV1 && pVideoFormat->seqhdr_data_length > 0) {
+        CUVIDEOFORMATEX *vidFormatEx = (CUVIDEOFORMATEX *)pVideoFormat;
+        // Cast to int so the comparison is signed, matching the checks that follow this block.
+        if (m_nMaxWidth < (int)pVideoFormat->coded_width) {
+            m_nMaxWidth = vidFormatEx->av1.max_width;
+        }
+        if (m_nMaxHeight < (int)pVideoFormat->coded_height) {
+            m_nMaxHeight = vidFormatEx->av1.max_height;
+        }
+    }
+    // The configured maximum may never be smaller than the coded size of the current sequence.
+    if (m_nMaxWidth < (int)pVideoFormat->coded_width)
+        m_nMaxWidth = pVideoFormat->coded_width;
+    if (m_nMaxHeight < (int)pVideoFormat->coded_height)
+        m_nMaxHeight = pVideoFormat->coded_height;
+    videoDecodeCreateInfo.ulMaxWidth = m_nMaxWidth;
+    videoDecodeCreateInfo.ulMaxHeight = m_nMaxHeight;
+
+    if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) {
+        // Neither crop nor resize requested: output the display area, target the coded size.
+        m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+        m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+        videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width;
+        videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height;
+    } else {
+        if (m_resizeDim.w && m_resizeDim.h) {
+            videoDecodeCreateInfo.display_area.left = pVideoFormat->display_area.left;
+            videoDecodeCreateInfo.display_area.top = pVideoFormat->display_area.top;
+            videoDecodeCreateInfo.display_area.right = pVideoFormat->display_area.right;
+            videoDecodeCreateInfo.display_area.bottom = pVideoFormat->display_area.bottom;
+            m_nWidth = m_resizeDim.w;
+            m_nLumaHeight = m_resizeDim.h;
+        }
+
+        // When both are set, the crop rectangle is applied last and therefore wins over the resize.
+        if (m_cropRect.r && m_cropRect.b) {
+            videoDecodeCreateInfo.display_area.left = m_cropRect.l;
+            videoDecodeCreateInfo.display_area.top = m_cropRect.t;
+            videoDecodeCreateInfo.display_area.right = m_cropRect.r;
+            videoDecodeCreateInfo.display_area.bottom = m_cropRect.b;
+            m_nWidth = m_cropRect.r - m_cropRect.l;
+            m_nLumaHeight = m_cropRect.b - m_cropRect.t;
+        }
+        videoDecodeCreateInfo.ulTargetWidth = m_nWidth;
+        videoDecodeCreateInfo.ulTargetHeight = m_nLumaHeight;
+    }
+
+    m_nChromaHeight = (int)(ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)));
+    m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+    m_nSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight;
+    m_nSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth;
+    m_displayRect.b = videoDecodeCreateInfo.display_area.bottom;
+    m_displayRect.t = videoDecodeCreateInfo.display_area.top;
+    m_displayRect.l = videoDecodeCreateInfo.display_area.left;
+    m_displayRect.r = videoDecodeCreateInfo.display_area.right;
+
+    m_videoInfo << "Video Decoding Params:" << std::endl
+                << "\tNum Surfaces : " << videoDecodeCreateInfo.ulNumDecodeSurfaces << std::endl
+                << "\tCrop         : [" << videoDecodeCreateInfo.display_area.left << ", "
+                << videoDecodeCreateInfo.display_area.top << ", " << videoDecodeCreateInfo.display_area.right << ", "
+                << videoDecodeCreateInfo.display_area.bottom << "]" << std::endl
+                << "\tResize       : " << videoDecodeCreateInfo.ulTargetWidth << "x"
+                << videoDecodeCreateInfo.ulTargetHeight << std::endl
+                << "\tDeinterlace  : "
+                << std::vector<const char *>{"Weave", "Bob", "Adaptive"}[videoDecodeCreateInfo.DeinterlaceMode];
+    m_videoInfo << std::endl;
+
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidCreateDecoder(&m_hDecoder, &videoDecodeCreateInfo));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    STOP_TIMER("Session Initialization Time: ");
+    NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime);
+    return nDecodeSurface;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h
new file mode 100644
index 00000000..f9881c80
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h
@@ -0,0 +1,52 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "NvDecoder/NvDecoder.h"
+
+// OptimizedNvDecoder derives from NvDecoder and avoids repeated cuvidGetDecoderCaps calls by
+// caching the capabilities in m_decodecaps; it also records a per-call latency for Decode().
+class OptimizedNvDecoder : public NvDecoder {
+
+  public:
+    /**
+     * @brief Default constructor; m_decodecaps is zero-initialized and frameLatency is empty.
+     */
+    OptimizedNvDecoder() {}
+    /**
+     *  @brief This function is used to initialize the decoder session.
+     *  Application must call this function to initialize the decoder, before
+     *  starting to decode any frames.
+     *  The only difference from the original function is to add a new member m_decodecaps.
+     *  Other part is the same as the original function, refer to NvDecoder.cpp in NVIDIA Video Codec SDK.
+     */
+    OptimizedNvDecoder(CUcontext &cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, CUVIDDECODECAPS decodecaps,
+                       bool bLowLatency = false, bool bDeviceFramePitched = false, const Rect *pCropRect = NULL,
+                       const Dim *pResizeDim = NULL, bool extract_user_SEI_Message = false, int maxWidth = 0,
+                       int maxHeight = 0, unsigned int clkRate = 1000, bool force_zero_latency = false);
+
+    /**
+     * @brief This function overrides the original Decode function to record the latency on frame level.
+     * @return The number of frames decoded by this call.
+     */
+    int Decode(const uint8_t *pData, int nSize, int nFlags = 0, int64_t nTimestamp = 0);
+    /**
+     * @brief Get the frameLatency vector; one (decoded frame count, latency) tuple per Decode() call.
+     * @return Reference to the internal vector (not a copy).
+     */
+    std::vector<std::tuple<int, double>> &GetFrameLatency() { return frameLatency; }
+
+  protected:
+    /**
+     *   @brief  Callback function to be registered for getting a callback when decoding of sequence starts
+     *   @param  pUserData    The OptimizedNvDecoder instance registered with the parser; must not be null.
+     *   @param  pVideoFormat Format of the sequence being started.
+     *   @throws std::runtime_error when pUserData is null.
+     */
+    static int CUDAAPI HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat) {
+        if (pUserData == nullptr) {
+            throw std::runtime_error("pUserData is nullptr");
+        }
+        return static_cast<OptimizedNvDecoder *>(pUserData)->HandleVideoSequence(pVideoFormat);
+    }
+    /**
+     *   @brief  Define the new handler when decoding of sequence starts.
+     *           The only change is to re-query decoder caps when the video codec or format change
+     *           Other part is the same as the original function, refer to NvDecoder.cpp in NVIDIA Video Codec SDK.
+     */
+    int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat);
+
+    // Cached decoder capabilities. Value-initialized so that a default-constructed decoder never
+    // reads indeterminate fields when HandleVideoSequence compares against it.
+    CUVIDDECODECAPS m_decodecaps = {};
+
+    // One entry per Decode() call: (frames decoded in that call, elapsed time of the parse call).
+    std::vector<std::tuple<int, double>> frameLatency;
+};
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h
new file mode 100644
index 00000000..5592b76e
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h
@@ -0,0 +1,99 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+// ThreadPool is a simple thread pool implementation that supports enqueueing the task with the index of thread to use
+// and custom arguments like task(thread_index, *args).
+// Tasks are taken from a FIFO queue by whichever worker wakes up first; the worker passes its own
+// index as the first argument of the task.
+class ThreadPool {
+  public:
+    /**
+     * @brief Construct a new ThreadPool object with the given number of threads.
+     * @param numThreads Number of worker threads to start; worker i receives index i.
+     */
+    ThreadPool(size_t numThreads) {
+        for (size_t i = 0; i < numThreads; ++i) {
+            threads.emplace_back(&ThreadPool::worker, this, i);
+        }
+    }
+    // The pool owns threads and a mutex, so it is neither copyable nor assignable.
+    ThreadPool(const ThreadPool &) = delete;
+    ThreadPool &operator=(const ThreadPool &) = delete;
+    /**
+     * @brief Destroy the ThreadPool object and join all threads.
+     * Workers finish every task still in the queue before they exit.
+     */
+    ~ThreadPool() {
+        {
+            std::unique_lock<std::mutex> lock(mutex);
+            stop = true;
+        }
+        cv.notify_all();
+
+        for (auto &thread : threads) {
+            thread.join();
+        }
+    }
+    /**
+     * @brief TaskWrapper is a wrapper of the task with the index of thread to use and custom arguments like
+     * task(thread_index, *args).
+     * The packaged_task is held through a shared_ptr so the wrapper stays copyable, which
+     * std::function requires of the callables it stores.
+     */
+    template <typename R, typename F, typename... Args> struct TaskWrapper {
+        std::shared_ptr<std::packaged_task<R(size_t)>> task;
+
+        // The callable and its arguments are captured by copy and invoked later on a worker thread.
+        template <typename Callable, typename... CallableArgs> TaskWrapper(Callable &&f, CallableArgs &&...args) {
+            task = std::make_shared<std::packaged_task<R(size_t)>>(
+                [f, args...](size_t threadIdx) mutable { return f(threadIdx, args...); });
+        }
+
+        void operator()(size_t threadIdx) { (*task)(threadIdx); }
+    };
+    /**
+     * @brief Enqueue enqueues the task with custom arguments and return the results of task when finished.
+     * @param f    Callable invoked as f(thread_index, args...).
+     * @param args Arguments stored by copy and passed after the thread index.
+     * @return A future holding the callable's result (or the exception it threw).
+     */
+    template <typename F, typename... Args>
+    auto enqueue(F &&f, Args &&...args) -> std::future<typename std::result_of<F(size_t, Args...)>::type> {
+        using ReturnType = typename std::result_of<F(size_t, Args...)>::type;
+
+        TaskWrapper<ReturnType, F, Args...> wrapper(std::forward<F>(f), std::forward<Args>(args)...);
+        // Fetch the future before the wrapper is moved into the queue.
+        std::future<ReturnType> res = wrapper.task->get_future();
+
+        {
+            std::unique_lock<std::mutex> lock(mutex);
+            tasks.emplace(std::move(wrapper));
+        }
+        cv.notify_one();
+
+        return res;
+    }
+
+  private:
+    /**
+     * @brief The worker function that dequeues the task and executes it for each thread index.
+     * @param threadIdx Index of this worker, forwarded to every task it runs.
+     */
+    void worker(size_t threadIdx) {
+        while (true) {
+            std::function<void(size_t)> task;
+            {
+                std::unique_lock<std::mutex> lock(mutex);
+                cv.wait(lock, [this] { return stop || !tasks.empty(); });
+
+                // Exit only once the queue is drained, so pending tasks still run during shutdown.
+                if (stop && tasks.empty()) {
+                    return;
+                }
+
+                // Move instead of copy: the queue entry is discarded right after, and moving avoids
+                // cloning the stored callable while the mutex is held.
+                task = std::move(tasks.front());
+                tasks.pop();
+            }
+
+            // Run outside the lock so other workers can dequeue concurrently.
+            task(threadIdx);
+        }
+    }
+
+    std::vector<std::thread> threads;
+    std::queue<std::function<void(size_t)>> tasks;
+    std::mutex mutex;           // guards tasks and stop
+    std::condition_variable cv; // signalled on new task and on shutdown
+    bool stop = false;
+};
diff --git a/third_party/Video_Codec_SDK/Interface/cuviddec.h b/third_party/Video_Codec_SDK/Interface/cuviddec.h
new file mode 100644
index 00000000..1d13eec8
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Interface/cuviddec.h
@@ -0,0 +1,1173 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*****************************************************************************************************/
+//! \file cuviddec.h
+//! NVDECODE API provides video decoding interface to NVIDIA GPU devices.
+//! This file contains constants, structure definitions and function prototypes used for decoding.
+/*****************************************************************************************************/
+
+#if !defined(__CUDA_VIDEO_H__)
+#define __CUDA_VIDEO_H__
+
+#ifndef __cuda_cuda_h__
+#include <cuda.h>
+#endif // __cuda_cuda_h__
+
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+#if (CUDA_VERSION >= 3020) && (!defined(CUDA_FORCE_API_VERSION) || (CUDA_FORCE_API_VERSION >= 3020))
+#define __CUVID_DEVPTR64
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/* Handle types used by the declarations in this header. */
+typedef void *CUvideodecoder;                     /* untyped pointer used as the decoder handle */
+typedef struct _CUcontextlock_st *CUvideoctxlock; /* pointer to a struct whose definition is not visible here */
+
+/*********************************************************************************/
+//! \enum cudaVideoCodec
+//! Video codec enums
+//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
+//! Compressed codecs are numbered consecutively from 0 up to cudaVideoCodec_NumCodecs.
+//! The uncompressed YUV entries are built by packing four ASCII characters into one integer,
+//! so their values lie far outside that consecutive range.
+/*********************************************************************************/
+typedef enum cudaVideoCodec_enum {
+    cudaVideoCodec_MPEG1 = 0, /**<  MPEG1             */
+    cudaVideoCodec_MPEG2,     /**<  MPEG2             */
+    cudaVideoCodec_MPEG4,     /**<  MPEG4             */
+    cudaVideoCodec_VC1,       /**<  VC1               */
+    cudaVideoCodec_H264,      /**<  H264              */
+    cudaVideoCodec_JPEG,      /**<  JPEG              */
+    cudaVideoCodec_H264_SVC,  /**<  H264-SVC          */
+    cudaVideoCodec_H264_MVC,  /**<  H264-MVC          */
+    cudaVideoCodec_HEVC,      /**<  HEVC              */
+    cudaVideoCodec_VP8,       /**<  VP8               */
+    cudaVideoCodec_VP9,       /**<  VP9               */
+    cudaVideoCodec_AV1,       /**<  AV1               */
+    cudaVideoCodec_NumCodecs, /**<  Max codecs        */
+    // Uncompressed YUV
+    cudaVideoCodec_YUV420 = (('I' << 24) | ('Y' << 16) | ('U' << 8) | ('V')), /**< Y,U,V (4:2:0)      */
+    cudaVideoCodec_YV12 = (('Y' << 24) | ('V' << 16) | ('1' << 8) | ('2')),   /**< Y,V,U (4:2:0)      */
+    cudaVideoCodec_NV12 = (('N' << 24) | ('V' << 16) | ('1' << 8) | ('2')),   /**< Y,UV  (4:2:0)      */
+    cudaVideoCodec_YUYV = (('Y' << 24) | ('U' << 16) | ('Y' << 8) | ('V')),   /**< YUYV/YUY2 (4:2:2)  */
+    cudaVideoCodec_UYVY = (('U' << 24) | ('Y' << 16) | ('V' << 8) | ('Y'))    /**< UYVY (4:2:2)       */
+} cudaVideoCodec;
+
+/*********************************************************************************/
+//! \enum cudaVideoSurfaceFormat
+//! Video surface format enums used for output format of decoded output
+//! These enums are used in CUVIDDECODECREATEINFO structure
+//! The numeric value of each entry is also its bit position in
+//! CUVIDDECODECAPS::nOutputFormatMask (test with `mask & (1 << format)`).
+/*********************************************************************************/
+typedef enum cudaVideoSurfaceFormat_enum {
+    cudaVideoSurfaceFormat_NV12 = 0,         /**< Semi-Planar YUV [Y plane followed by interleaved UV plane]     */
+    cudaVideoSurfaceFormat_P016 = 1,         /**< 16 bit Semi-Planar YUV [Y plane followed by interleaved UV plane].
+                                                  Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0)      */
+    cudaVideoSurfaceFormat_YUV444 = 2,       /**< Planar YUV [Y plane followed by U and V planes]                */
+    cudaVideoSurfaceFormat_YUV444_16Bit = 3, /**< 16 bit Planar YUV [Y plane followed by U and V planes].
+                                                  Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0)      */
+} cudaVideoSurfaceFormat;
+
+/******************************************************************************************************************/
+//! \enum cudaVideoDeinterlaceMode
+//! Deinterlacing mode enums
+//! These enums are used in CUVIDDECODECREATEINFO structure
+//! Use cudaVideoDeinterlaceMode_Weave for progressive content and for content that doesn't need deinterlacing
+//! cudaVideoDeinterlaceMode_Adaptive needs more video memory than other DImodes
+//! Values are consecutive starting at 0 (Weave = 0, Bob = 1, Adaptive = 2).
+/******************************************************************************************************************/
+typedef enum cudaVideoDeinterlaceMode_enum {
+    cudaVideoDeinterlaceMode_Weave = 0, /**< Weave both fields (no deinterlacing) */
+    cudaVideoDeinterlaceMode_Bob,       /**< Drop one field                       */
+    cudaVideoDeinterlaceMode_Adaptive   /**< Adaptive deinterlacing               */
+} cudaVideoDeinterlaceMode;
+
+/**************************************************************************************************************/
+//! \enum cudaVideoChromaFormat
+//! Chroma format enums
+//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
+//! Note: cudaVideoChromaFormat_Monochrome has the value 0, so it must always be compared
+//! explicitly (`fmt == cudaVideoChromaFormat_Monochrome`); used alone as a condition it is false.
+/**************************************************************************************************************/
+typedef enum cudaVideoChromaFormat_enum {
+    cudaVideoChromaFormat_Monochrome = 0, /**< MonoChrome */
+    cudaVideoChromaFormat_420,            /**< YUV 4:2:0  */
+    cudaVideoChromaFormat_422,            /**< YUV 4:2:2  */
+    cudaVideoChromaFormat_444             /**< YUV 4:4:4  */
+} cudaVideoChromaFormat;
+
+/*************************************************************************************************************/
+//! \enum cudaVideoCreateFlags
+//! Decoder flag enums to select preferred decode path
+//! cudaVideoCreate_Default and cudaVideoCreate_PreferCUVID are most optimized, use these whenever possible
+//! The non-default values are distinct single-bit constants (0x01, 0x02, 0x04).
+/*************************************************************************************************************/
+typedef enum cudaVideoCreateFlags_enum {
+    cudaVideoCreate_Default = 0x00, /**< Default operation mode: use dedicated video engines                        */
+    cudaVideoCreate_PreferCUDA =
+        0x01, /**< Use CUDA-based decoder (requires valid vidLock object for multi-threading) */
+    cudaVideoCreate_PreferDXVA = 0x02, /**< Go through DXVA internally if possible (requires D3D9 interop) */
+    cudaVideoCreate_PreferCUVID = 0x04 /**< Use dedicated video engines directly */
+} cudaVideoCreateFlags;
+
+/*************************************************************************/
+//! \enum cuvidDecodeStatus
+//! Decode status enums
+//! These enums are used in CUVIDGETDECODESTATUS structure
+//! The values are not contiguous: 3 to 7 are skipped and the error codes start at 8.
+/*************************************************************************/
+typedef enum cuvidDecodeStatus_enum {
+    cuvidDecodeStatus_Invalid = 0,    // Decode status is not valid
+    cuvidDecodeStatus_InProgress = 1, // Decode is in progress
+    cuvidDecodeStatus_Success = 2,    // Decode is completed without any errors
+    // 3 to 7 enums are reserved for future use
+    cuvidDecodeStatus_Error = 8,           // Decode is completed with an error (error is not concealed)
+    cuvidDecodeStatus_Error_Concealed = 9, // Decode is completed with an error and error is concealed
+} cuvidDecodeStatus;
+
+/**************************************************************************************************************/
+//! \struct CUVIDDECODECAPS
+//! This structure is used in cuvidGetDecoderCaps API
+//! Fields marked IN are filled in by the caller before the query; fields marked OUT carry the
+//! result of the query. Reserved fields are documented as "set to zero".
+/**************************************************************************************************************/
+typedef struct _CUVIDDECODECAPS {
+    cudaVideoCodec eCodecType;           /**< IN: cudaVideoCodec_XXX                                             */
+    cudaVideoChromaFormat eChromaFormat; /**< IN: cudaVideoChromaFormat_XXX                                      */
+    unsigned int nBitDepthMinus8;        /**< IN: The Value "BitDepth minus 8"                                   */
+    unsigned int reserved1[3];           /**< Reserved for future use - set to zero                              */
+
+    unsigned char bIsSupported;          /**< OUT: 1 if codec supported, 0 if not supported                      */
+    unsigned char nNumNVDECs;            /**< OUT: Number of NVDECs that can support IN params                   */
+    unsigned short nOutputFormatMask;    /**< OUT: each bit represents corresponding cudaVideoSurfaceFormat enum */
+    unsigned int nMaxWidth;              /**< OUT: Max supported coded width in pixels                           */
+    unsigned int nMaxHeight;             /**< OUT: Max supported coded height in pixels                          */
+    unsigned int nMaxMBCount;            /**< OUT: Max supported macroblock count
+                                                   CodedWidth*CodedHeight/256 must be <= nMaxMBCount             */
+    unsigned short nMinWidth;            /**< OUT: Min supported coded width in pixels                           */
+    unsigned short nMinHeight;           /**< OUT: Min supported coded height in pixels                          */
+    unsigned char bIsHistogramSupported; /**< OUT: 1 if Y component histogram output is supported, 0 if not
+                                                   Note: histogram is computed on original picture data before
+                                                   any post-processing like scaling, cropping, etc. is applied   */
+    unsigned char nCounterBitDepth;      /**< OUT: histogram counter bit depth                                   */
+    unsigned short nMaxHistogramBins;    /**< OUT: Max number of histogram bins                                  */
+    unsigned int reserved3[10];          /**< Reserved for future use - set to zero                              */
+} CUVIDDECODECAPS;
+
+/**************************************************************************************************************/
+//! \struct CUVIDDECODECREATEINFO
+//! Creation parameters for the cuvidCreateDecoder API (the width of the unsigned long fields is ABI-dependent)
+/**************************************************************************************************************/
+typedef struct _CUVIDDECODECREATEINFO {
+    unsigned long ulWidth;  /**< IN: Coded sequence width in pixels                                             */
+    unsigned long ulHeight; /**< IN: Coded sequence height in pixels                                            */
+    unsigned long ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode surfaces */
+    cudaVideoCodec CodecType; /**< IN: Codec type (cudaVideoCodec_XXX)                                            */
+    cudaVideoChromaFormat ChromaFormat; /**< IN: Chroma format (cudaVideoChromaFormat_XXX) */
+    unsigned long ulCreationFlags;      /**< IN: Decoder creation flags (cudaVideoCreateFlags_XXX)      */
+    unsigned long bitDepthMinus8; /**< IN: Bit depth minus 8 (i.e. 0 for 8-bit content)                              */
+    unsigned long ulIntraDecodeOnly; /**< IN: Set to 1 only if the video consists entirely of intra frames (default
+                                          is 0). This optimizes video memory for intra-only decoding. Support is
+                                          limited to specific codecs (H264, HEVC, VP9); the flag is ignored for
+                                          codecs that do not support it. On supported codecs, decoding might fail
+                                          if the flag is set for regular bitstreams containing P and/or B frames. */
+    unsigned long ulMaxWidth;  /**< IN: Maximum coded sequence width in pixels, used when reconfiguring the decoder */
+    unsigned long ulMaxHeight; /**< IN: Maximum coded sequence height in pixels, used when reconfiguring the decoder */
+    unsigned long Reserved1;   /**< Reserved for future use - set to zero                                          */
+    /**
+     * IN: area of the frame that should be displayed, given by its left/top/right/bottom edges
+     */
+    struct {
+        short left;
+        short top;
+        short right;
+        short bottom;
+    } display_area;
+
+    cudaVideoSurfaceFormat OutputFormat;      /**< IN: Output surface format (cudaVideoSurfaceFormat_XXX)             */
+    cudaVideoDeinterlaceMode DeinterlaceMode; /**< IN: Deinterlace mode (cudaVideoDeinterlaceMode_XXX)                */
+    unsigned long ulTargetWidth;              /**< IN: Post-processed output width (should be a multiple of 2)        */
+    unsigned long ulTargetHeight;             /**< IN: Post-processed output height (should be a multiple of 2)       */
+    unsigned long ulNumOutputSurfaces;        /**< IN: Maximum number of output surfaces simultaneously mapped        */
+    CUvideoctxlock vidLock;                   /**< IN: If non-NULL, context lock used for synchronizing ownership of
+                                                   the cuda context. Needed for cudaVideoCreate_PreferCUDA decode     */
+    /**
+     * IN: target rectangle in the output frame (for aspect ratio conversion);
+     * if a null rectangle is specified, {0,0,ulTargetWidth,ulTargetHeight} will be used
+     */
+    struct {
+        short left;
+        short top;
+        short right;
+        short bottom;
+    } target_rect;
+
+    unsigned long enableHistogram; /**< IN: Enable histogram output, if supported (see bIsHistogramSupported) */
+    unsigned long Reserved2[4];    /**< Reserved for future use - set to zero */
+} CUVIDDECODECREATEINFO;
+
+/*********************************************************/
+//! \struct CUVIDH264DPBENTRY
+//! H.264 DPB entry; each entry describes one reference frame
+//! This structure is used in CUVIDH264PICPARAMS structure (as the elements of its dpb[] array)
+/*********************************************************/
+typedef struct _CUVIDH264DPBENTRY {
+    int PicIdx;             /**< Picture index of the reference frame                                    */
+    int FrameIdx;           /**< frame_num (short-term) or LongTermFrameIdx (long-term)                  */
+    int is_long_term;       /**< 0=short-term reference, 1=long-term reference                           */
+    int not_existing;       /**< Non-existing reference frame (corresponding PicIdx should be set to -1) */
+    int used_for_reference; /**< 0=unused, 1=top_field, 2=bottom_field, 3=both_fields                    */
+    int FieldOrderCnt[2];   /**< Field order counts of the top and bottom fields                         */
+} CUVIDH264DPBENTRY;
+
+/************************************************************/
+//! \struct CUVIDH264MVCEXT
+//! H.264 MVC (multiview video coding) picture parameters extension
+//! This structure is used in CUVIDH264PICPARAMS structure (mvcext member)
+/************************************************************/
+typedef struct _CUVIDH264MVCEXT {
+    int num_views_minus1;          /**< Max number of coded views minus 1 in video : Range - 0 to 1023              */
+    int view_id;                   /**< View identifier                                                             */
+    unsigned char inter_view_flag; /**< 1 if used for inter-view prediction, 0 if not                               */
+    unsigned char num_inter_view_refs_l0; /**< Number of inter-view ref pics in RefPicList0 */
+    unsigned char num_inter_view_refs_l1; /**< Number of inter-view ref pics in RefPicList1 */
+    unsigned char MVCReserved8Bits; /**< Reserved bits                                                               */
+    int InterViewRefsL0[16];        /**< View id of the i-th view component for inter-view prediction in RefPicList0 */
+    int InterViewRefsL1[16];        /**< View id of the i-th view component for inter-view prediction in RefPicList1 */
+} CUVIDH264MVCEXT;
+
+/*********************************************************/
+//! \struct CUVIDH264SVCEXT
+//! H.264 SVC (scalable video coding) picture parameters extension
+//! Used in CUVIDH264PICPARAMS (svcext); uncommented fields presumably mirror SVC syntax elements - TODO confirm
+/*********************************************************/
+typedef struct _CUVIDH264SVCEXT {
+    unsigned char profile_idc;
+    unsigned char level_idc;
+    unsigned char DQId;
+    unsigned char DQIdMax;
+    unsigned char disable_inter_layer_deblocking_filter_idc;
+    unsigned char ref_layer_chroma_phase_y_plus1;
+    signed char inter_layer_slice_alpha_c0_offset_div2;
+    signed char inter_layer_slice_beta_offset_div2;
+
+    unsigned short DPBEntryValidFlag;
+    unsigned char inter_layer_deblocking_filter_control_present_flag;
+    unsigned char extended_spatial_scalability_idc;
+    unsigned char adaptive_tcoeff_level_prediction_flag;
+    unsigned char slice_header_restriction_flag;
+    unsigned char chroma_phase_x_plus1_flag;
+    unsigned char chroma_phase_y_plus1;
+
+    unsigned char tcoeff_level_prediction_flag;
+    unsigned char constrained_intra_resampling_flag;
+    unsigned char ref_layer_chroma_phase_x_plus1_flag;
+    unsigned char store_ref_base_pic_flag;
+    unsigned char Reserved8BitsA; /**< Reserved; presumably must be zero - TODO confirm */
+    unsigned char Reserved8BitsB; /**< Reserved; presumably must be zero - TODO confirm */
+
+    short scaled_ref_layer_left_offset;
+    short scaled_ref_layer_top_offset;
+    short scaled_ref_layer_right_offset;
+    short scaled_ref_layer_bottom_offset;
+    unsigned short Reserved16Bits; /**< Reserved; presumably must be zero - TODO confirm */
+    struct _CUVIDPICPARAMS *pNextLayer; /**< Points to the picparams for the next layer to be decoded.
+                                             The linked list ends at the target layer. */
+    int bRefBaseLayer;                  /**< Whether to store the ref base pic */
+} CUVIDH264SVCEXT;
+
+/******************************************************/
+//! \struct CUVIDH264PICPARAMS
+//! H.264 picture parameters: SPS/PPS fields, DPB contents, quantization matrices, FMO/ASO and SVC/MVC extension
+//! This structure is used in CUVIDPICPARAMS structure
+/******************************************************/
+typedef struct _CUVIDH264PICPARAMS {
+    // SPS (sequence parameter set) fields
+    int log2_max_frame_num_minus4;
+    int pic_order_cnt_type;
+    int log2_max_pic_order_cnt_lsb_minus4;
+    int delta_pic_order_always_zero_flag;
+    int frame_mbs_only_flag;
+    int direct_8x8_inference_flag;
+    int num_ref_frames; // NOTE: shall meet level 4.1 restrictions
+    unsigned char residual_colour_transform_flag;
+    unsigned char bit_depth_luma_minus8;   // Must be 0 (only 8-bit supported)
+    unsigned char bit_depth_chroma_minus8; // Must be 0 (only 8-bit supported)
+    unsigned char qpprime_y_zero_transform_bypass_flag;
+    // PPS (picture parameter set) fields
+    int entropy_coding_mode_flag;
+    int pic_order_present_flag;
+    int num_ref_idx_l0_active_minus1;
+    int num_ref_idx_l1_active_minus1;
+    int weighted_pred_flag;
+    int weighted_bipred_idc;
+    int pic_init_qp_minus26;
+    int deblocking_filter_control_present_flag;
+    int redundant_pic_cnt_present_flag;
+    int transform_8x8_mode_flag;
+    int MbaffFrameFlag;
+    int constrained_intra_pred_flag;
+    int chroma_qp_index_offset;
+    int second_chroma_qp_index_offset;
+    int ref_pic_flag;
+    int frame_num;
+    int CurrFieldOrderCnt[2]; // presumably field order counts of the current picture's two fields - TODO confirm
+    // DPB (decoded picture buffer)
+    CUVIDH264DPBENTRY dpb[16]; // List of reference frames within the DPB
+    // Quantization Matrices (raster-order)
+    unsigned char WeightScale4x4[6][16]; // 6 matrices of 16 entries each
+    unsigned char WeightScale8x8[2][64]; // 2 matrices of 64 entries each
+    // FMO/ASO (flexible macroblock ordering / arbitrary slice ordering)
+    unsigned char fmo_aso_enable;
+    unsigned char num_slice_groups_minus1;
+    unsigned char slice_group_map_type;
+    signed char pic_init_qs_minus26;
+    unsigned int slice_group_change_rate_minus1;
+    union {
+        unsigned long long slice_group_map_addr; // integer view of the union's storage
+        const unsigned char *pMb2SliceGroupMap;  // pointer view of the same storage
+    } fmo;
+    unsigned int Reserved[12]; // reserved; presumably must be zero - TODO confirm
+    // SVC/MVC extension: the two members below share the same storage (anonymous union)
+    union {
+        CUVIDH264MVCEXT mvcext;
+        CUVIDH264SVCEXT svcext;
+    };
+} CUVIDH264PICPARAMS;
+
+/********************************************************/
+//! \struct CUVIDMPEG2PICPARAMS
+//! MPEG-2 picture parameters: reference picture indices, picture coding fields and quantization matrices
+//! This structure is used in CUVIDPICPARAMS structure
+/********************************************************/
+typedef struct _CUVIDMPEG2PICPARAMS {
+    int ForwardRefIdx;  // Picture index of forward reference (P/B-frames)
+    int BackwardRefIdx; // Picture index of backward reference (B-frames)
+    int picture_coding_type;
+    int full_pel_forward_vector;
+    int full_pel_backward_vector;
+    int f_code[2][2]; // presumably indexed [forward/backward][horizontal/vertical] - TODO confirm
+    int intra_dc_precision;
+    int frame_pred_frame_dct;
+    int concealment_motion_vectors;
+    int q_scale_type;
+    int intra_vlc_format;
+    int alternate_scan;
+    int top_field_first;
+    // Quantization matrices (raster order), 64 entries each
+    unsigned char QuantMatrixIntra[64];
+    unsigned char QuantMatrixInter[64];
+} CUVIDMPEG2PICPARAMS;
+
+// MPEG-4 has VOP (video object plane) types instead of picture types; presumably used for vop_coding_type
+#define I_VOP 0 // intra-coded VOP
+#define P_VOP 1 // predictive-coded VOP
+#define B_VOP 2 // bidirectionally predictive-coded VOP
+#define S_VOP 3 // sprite VOP
+
+/*******************************************************/
+//! \struct CUVIDMPEG4PICPARAMS
+//! MPEG-4 picture parameters: reference picture indices, VOL and VOP fields and quantization matrices
+//! This structure is used in CUVIDPICPARAMS structure
+/*******************************************************/
+typedef struct _CUVIDMPEG4PICPARAMS {
+    int ForwardRefIdx;  // Picture index of forward reference (P/B-frames)
+    int BackwardRefIdx; // Picture index of backward reference (B-frames)
+    // VOL (video object layer) fields
+    int video_object_layer_width;
+    int video_object_layer_height;
+    int vop_time_increment_bitcount;
+    int top_field_first;
+    int resync_marker_disable;
+    int quant_type;
+    int quarter_sample;
+    int short_video_header;
+    int divx_flags;
+    // VOP (video object plane) fields
+    int vop_coding_type; // presumably one of I_VOP / P_VOP / B_VOP / S_VOP - TODO confirm
+    int vop_coded;
+    int vop_rounding_type;
+    int alternate_vertical_scan_flag;
+    int interlaced;
+    int vop_fcode_forward;
+    int vop_fcode_backward;
+    int trd[2];
+    int trb[2];
+    // Quantization matrices (raster order), 64 entries each
+    unsigned char QuantMatrixIntra[64];
+    unsigned char QuantMatrixInter[64];
+    int gmc_enabled;
+} CUVIDMPEG4PICPARAMS;
+
+/********************************************************/
+//! \struct CUVIDVC1PICPARAMS
+//! VC1 picture parameters: reference picture indices, frame size, and picture/sequence/entry-point fields
+//! This structure is used in CUVIDPICPARAMS structure
+/********************************************************/
+typedef struct _CUVIDVC1PICPARAMS {
+    int ForwardRefIdx;  /**< Picture index of forward reference (P/B-frames) */
+    int BackwardRefIdx; /**< Picture index of backward reference (B-frames)  */
+    int FrameWidth;     /**< Actual frame width                              */
+    int FrameHeight;    /**< Actual frame height                             */
+    // PICTURE-level fields
+    int intra_pic_flag;  /**< Set to 1 for I,BI frames */
+    int ref_pic_flag;    /**< Set to 1 for I,P frames  */
+    int progressive_fcm; /**< Progressive frame        */
+    // SEQUENCE-level fields
+    int profile;
+    int postprocflag;
+    int pulldown;
+    int interlace;
+    int tfcntrflag;
+    int finterpflag;
+    int psf;
+    int multires;
+    int syncmarker;
+    int rangered;
+    int maxbframes;
+    // ENTRYPOINT-level fields
+    int panscan_flag;
+    int refdist_flag;
+    int extended_mv;
+    int dquant;
+    int vstransform;
+    int loopfilter;
+    int fastuvmc;
+    int overlap;
+    int quantizer;
+    int extended_dmv;
+    int range_mapy_flag;
+    int range_mapy;
+    int range_mapuv_flag;
+    int range_mapuv;
+    int rangeredfrm; // range reduction state
+} CUVIDVC1PICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDJPEGPICPARAMS
+//! JPEG picture parameters (holds only a reserved placeholder field)
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDJPEGPICPARAMS {
+    int Reserved; // the only member; no JPEG-specific parameters are defined in this structure
+} CUVIDJPEGPICPARAMS;
+
+/*******************************************************/
+//! \struct CUVIDHEVCPICPARAMS
+//! HEVC picture parameters: SPS/PPS fields and their extensions, reference picture sets and scaling lists
+//! This structure is used in CUVIDPICPARAMS structure
+/*******************************************************/
+typedef struct _CUVIDHEVCPICPARAMS {
+    // sps (sequence parameter set) fields
+    int pic_width_in_luma_samples;
+    int pic_height_in_luma_samples;
+    unsigned char log2_min_luma_coding_block_size_minus3;
+    unsigned char log2_diff_max_min_luma_coding_block_size;
+    unsigned char log2_min_transform_block_size_minus2;
+    unsigned char log2_diff_max_min_transform_block_size;
+    unsigned char pcm_enabled_flag;
+    unsigned char log2_min_pcm_luma_coding_block_size_minus3;
+    unsigned char log2_diff_max_min_pcm_luma_coding_block_size;
+    unsigned char pcm_sample_bit_depth_luma_minus1;
+
+    unsigned char pcm_sample_bit_depth_chroma_minus1;
+    unsigned char pcm_loop_filter_disabled_flag;
+    unsigned char strong_intra_smoothing_enabled_flag;
+    unsigned char max_transform_hierarchy_depth_intra;
+    unsigned char max_transform_hierarchy_depth_inter;
+    unsigned char amp_enabled_flag;
+    unsigned char separate_colour_plane_flag;
+    unsigned char log2_max_pic_order_cnt_lsb_minus4;
+
+    unsigned char num_short_term_ref_pic_sets;
+    unsigned char long_term_ref_pics_present_flag;
+    unsigned char num_long_term_ref_pics_sps;
+    unsigned char sps_temporal_mvp_enabled_flag;
+    unsigned char sample_adaptive_offset_enabled_flag;
+    unsigned char scaling_list_enable_flag;
+    unsigned char IrapPicFlag;
+    unsigned char IdrPicFlag;
+
+    unsigned char bit_depth_luma_minus8;
+    unsigned char bit_depth_chroma_minus8;
+    // sps/pps extension fields
+    unsigned char log2_max_transform_skip_block_size_minus2;
+    unsigned char log2_sao_offset_scale_luma;
+    unsigned char log2_sao_offset_scale_chroma;
+    unsigned char high_precision_offsets_enabled_flag;
+    unsigned char reserved1[10]; // reserved; presumably must be zero - TODO confirm
+
+    // pps (picture parameter set) fields
+    unsigned char dependent_slice_segments_enabled_flag;
+    unsigned char slice_segment_header_extension_present_flag;
+    unsigned char sign_data_hiding_enabled_flag;
+    unsigned char cu_qp_delta_enabled_flag;
+    unsigned char diff_cu_qp_delta_depth;
+    signed char init_qp_minus26;
+    signed char pps_cb_qp_offset;
+    signed char pps_cr_qp_offset;
+
+    unsigned char constrained_intra_pred_flag;
+    unsigned char weighted_pred_flag;
+    unsigned char weighted_bipred_flag;
+    unsigned char transform_skip_enabled_flag;
+    unsigned char transquant_bypass_enabled_flag;
+    unsigned char entropy_coding_sync_enabled_flag;
+    unsigned char log2_parallel_merge_level_minus2;
+    unsigned char num_extra_slice_header_bits;
+
+    unsigned char loop_filter_across_tiles_enabled_flag;
+    unsigned char loop_filter_across_slices_enabled_flag;
+    unsigned char output_flag_present_flag;
+    unsigned char num_ref_idx_l0_default_active_minus1;
+    unsigned char num_ref_idx_l1_default_active_minus1;
+    unsigned char lists_modification_present_flag;
+    unsigned char cabac_init_present_flag;
+    unsigned char pps_slice_chroma_qp_offsets_present_flag;
+
+    unsigned char deblocking_filter_override_enabled_flag;
+    unsigned char pps_deblocking_filter_disabled_flag;
+    signed char pps_beta_offset_div2;
+    signed char pps_tc_offset_div2;
+    unsigned char tiles_enabled_flag;
+    unsigned char uniform_spacing_flag;
+    unsigned char num_tile_columns_minus1;
+    unsigned char num_tile_rows_minus1;
+
+    unsigned short column_width_minus1[21];
+    unsigned short row_height_minus1[21];
+
+    // sps and pps extension fields for HEVC main 444
+    unsigned char sps_range_extension_flag;
+    unsigned char transform_skip_rotation_enabled_flag;
+    unsigned char transform_skip_context_enabled_flag;
+    unsigned char implicit_rdpcm_enabled_flag;
+
+    unsigned char explicit_rdpcm_enabled_flag;
+    unsigned char extended_precision_processing_flag;
+    unsigned char intra_smoothing_disabled_flag;
+    unsigned char persistent_rice_adaptation_enabled_flag;
+
+    unsigned char cabac_bypass_alignment_enabled_flag;
+    unsigned char pps_range_extension_flag;
+    unsigned char cross_component_prediction_enabled_flag;
+    unsigned char chroma_qp_offset_list_enabled_flag;
+
+    unsigned char diff_cu_chroma_qp_offset_depth;
+    unsigned char chroma_qp_offset_list_len_minus1;
+    signed char cb_qp_offset_list[6];
+
+    signed char cr_qp_offset_list[6];
+    unsigned char reserved2[2]; // reserved; presumably must be zero - TODO confirm
+
+    unsigned int reserved3[8]; // reserved; presumably must be zero - TODO confirm
+
+    // RefPicSets (reference picture sets)
+    int NumBitsForShortTermRPSInSlice;
+    int NumDeltaPocsOfRefRpsIdx;
+    int NumPocTotalCurr;
+    int NumPocStCurrBefore;
+    int NumPocStCurrAfter;
+    int NumPocLtCurr;
+    int CurrPicOrderCntVal;
+    int RefPicIdx[16];                      // [refpic] Indices of valid reference pictures (-1 if unused for reference)
+    int PicOrderCntVal[16];                 // [refpic] presumably the POC value of each reference picture - TODO confirm
+    unsigned char IsLongTerm[16];           // [refpic] 0=not a long-term reference, 1=long-term reference
+    unsigned char RefPicSetStCurrBefore[8]; // [0..NumPocStCurrBefore-1] -> refpic (0..15)
+    unsigned char RefPicSetStCurrAfter[8];  // [0..NumPocStCurrAfter-1] -> refpic (0..15)
+    unsigned char RefPicSetLtCurr[8];       // [0..NumPocLtCurr-1] -> refpic (0..15)
+    unsigned char RefPicSetInterLayer0[8];
+    unsigned char RefPicSetInterLayer1[8];
+    unsigned int reserved4[12]; // reserved; presumably must be zero - TODO confirm
+
+    // scaling lists (diagonal order)
+    unsigned char ScalingList4x4[6][16];      // [matrixId][i]
+    unsigned char ScalingList8x8[6][64];      // [matrixId][i]
+    unsigned char ScalingList16x16[6][64];    // [matrixId][i], 64 entries per matrix (see ScalingListDCCoeff16x16)
+    unsigned char ScalingList32x32[2][64];    // [matrixId][i], 64 entries per matrix (see ScalingListDCCoeff32x32)
+    unsigned char ScalingListDCCoeff16x16[6]; // [matrixId]
+    unsigned char ScalingListDCCoeff32x32[2]; // [matrixId]
+} CUVIDHEVCPICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDVP8PICPARAMS
+//! VP8 picture parameters: frame size, first partition size, reference frame indexes and frame tag flags
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDVP8PICPARAMS {
+    int width;
+    int height;
+    unsigned int first_partition_size;
+    // Frame Indexes (last, golden and alt-ref)
+    unsigned char LastRefIdx;
+    unsigned char GoldenRefIdx;
+    unsigned char AltRefIdx;
+    union {
+        struct {
+            unsigned char frame_type : 1; /**< 0 = KEYFRAME, 1 = INTERFRAME  */
+            unsigned char version : 3;
+            unsigned char show_frame : 1;
+            unsigned char update_mb_segmentation_data : 1; /**< Must be 0 if segmentation is not enabled */
+            unsigned char Reserved2Bits : 2; /**< Pads the bit-fields above to 8 bits */
+        } vp8_frame_tag;
+        unsigned char wFrameTagFlags; /**< Single-byte member sharing storage with vp8_frame_tag */
+    };
+    unsigned char Reserved1[4]; /**< Reserved; presumably must be zero - TODO confirm */
+    unsigned int Reserved2[3];  /**< Reserved; presumably must be zero - TODO confirm */
+} CUVIDVP8PICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDVP9PICPARAMS
+//! VP9 picture parameters: frame size, reference indices, frame flags, loop filter, segmentation and quantizers
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDVP9PICPARAMS {
+    unsigned int width;
+    unsigned int height;
+
+    // Frame Indices (last, golden and alt-ref)
+    unsigned char LastRefIdx;
+    unsigned char GoldenRefIdx;
+    unsigned char AltRefIdx;
+    unsigned char colorSpace;
+
+    unsigned short profile : 3;
+    unsigned short frameContextIdx : 2;
+    unsigned short frameType : 1;
+    unsigned short showFrame : 1;
+    unsigned short errorResilient : 1;
+    unsigned short frameParallelDecoding : 1;
+    unsigned short subSamplingX : 1;
+    unsigned short subSamplingY : 1;
+    unsigned short intraOnly : 1;
+    unsigned short allow_high_precision_mv : 1;
+    unsigned short refreshEntropyProbs : 1;
+    unsigned short reserved2Bits : 2; // pads the bit-fields above to a total of 16 bits
+
+    unsigned short reserved16Bits; // reserved; presumably must be zero - TODO confirm
+
+    unsigned char refFrameSignBias[4];
+
+    unsigned char bitDepthMinus8Luma;
+    unsigned char bitDepthMinus8Chroma;
+    unsigned char loopFilterLevel;
+    unsigned char loopFilterSharpness;
+
+    unsigned char modeRefLfEnabled;
+    unsigned char log2_tile_columns;
+    unsigned char log2_tile_rows;
+
+    unsigned char segmentEnabled : 1;
+    unsigned char segmentMapUpdate : 1;
+    unsigned char segmentMapTemporalUpdate : 1;
+    unsigned char segmentFeatureMode : 1;
+    unsigned char reserved4Bits : 4; // pads the segment bit-fields above to a total of 8 bits
+
+    unsigned char segmentFeatureEnable[8][4]; // presumably indexed [segment][feature] - TODO confirm
+    short segmentFeatureData[8][4];           // same dimensions as segmentFeatureEnable
+    unsigned char mb_segment_tree_probs[7];
+    unsigned char segment_pred_probs[3];
+    unsigned char reservedSegment16Bits[2]; // reserved; presumably must be zero - TODO confirm
+
+    int qpYAc;
+    int qpYDc;
+    int qpChDc;
+    int qpChAc;
+
+    unsigned int activeRefIdx[3];
+    unsigned int resetFrameContext;
+    unsigned int mcomp_filter_type;
+    unsigned int mbRefLfDelta[4];
+    unsigned int mbModeLfDelta[2];
+    unsigned int frameTagSize;
+    unsigned int offsetToDctParts;
+    unsigned int reserved128Bits[4]; // reserved; presumably must be zero - TODO confirm
+
+} CUVIDVP9PICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDAV1PICPARAMS
+//! AV1 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDAV1PICPARAMS {
+    unsigned int width;        // coded width, if superres enabled then it is upscaled width
+    unsigned int height;       // coded height
+    unsigned int frame_offset; // defined as order_hint in AV1 specification
+    int decodePicIdx; // decoded output pic index, if film grain enabled, it will keep decoded (without film grain)
+                      // output It can be used as reference frame for future frames
+
+    // sequence header
+    unsigned int profile : 3;                // 0 = profile0, 1 = profile1, 2 = profile2
+    unsigned int use_128x128_superblock : 1; // superblock size 0:64x64, 1: 128x128
+    unsigned int subsampling_x : 1;          // (subsampling_x, _y) 1,1 = 420, 1,0 = 422, 0,0 = 444
+    unsigned int subsampling_y : 1;
+    unsigned int mono_chrome : 1;      // for monochrome content, mono_chrome = 1 and (subsampling_x, _y) should be 1,1
+    unsigned int bit_depth_minus8 : 4; // bit depth minus 8
+    unsigned int enable_filter_intra : 1;        // tool enable in seq level, 0 : disable 1: frame header control
+    unsigned int enable_intra_edge_filter : 1;   // intra edge filtering process, 0 : disable 1: enabled
+    unsigned int enable_interintra_compound : 1; // interintra, 0 : not present 1: present
+    unsigned int enable_masked_compound : 1;     // 1: mode info for inter blocks may contain the syntax element
+                                                 // compound_type. 0: syntax element compound_type will not be present
+    unsigned int enable_dual_filter : 1;         // vertical and horiz filter selection, 1: enable and 0: disable
+    unsigned int enable_order_hint : 1;          // order hint, and related tools, 1: enable and 0: disable
+    unsigned int order_hint_bits_minus1 : 3;     // is used to compute OrderHintBits
+    unsigned int enable_jnt_comp : 1;            // joint compound modes, 1: enable and 0: disable
+    unsigned int enable_superres : 1;            // superres in seq level, 0 : disable 1: frame level control
+    unsigned int enable_cdef : 1;                // cdef filtering in seq level, 0 : disable 1: frame level control
+    unsigned int enable_restoration : 1; // loop restoration filtering in seq level, 0 : disable 1: frame level control
+    unsigned int enable_fgs : 1;         // defined as film_grain_params_present in AV1 specification
+    unsigned int reserved0_7bits : 7;    // reserved bits; must be set to 0
+
+    // frame header
+    unsigned int frame_type : 2;         // 0:Key frame, 1:Inter frame, 2:intra only, 3:s-frame
+    unsigned int show_frame : 1;         // show_frame = 1 implies that frame should be immediately output once decoded
+    unsigned int disable_cdf_update : 1; // CDF update during symbol decoding, 1: disabled, 0: enabled
+    unsigned int
+        allow_screen_content_tools : 1; // 1: intra blocks may use palette encoding, 0: palette encoding is never used
+    unsigned int force_integer_mv : 1;  // 1: motion vectors will always be integers, 0: can contain fractional bits
+    unsigned int coded_denom : 3;       // coded_denom of the superres scale as specified in AV1 specification
+    unsigned int allow_intrabc : 1;     // 1: intra block copy may be used, 0: intra block copy is not allowed
+    unsigned int allow_high_precision_mv : 1; // 1/8 precision mv enable
+    unsigned int interp_filter : 3;           // interpolation filter. Refer to section 6.8.9 of the AV1 specification
+                                              // Version 1.0.0 with Errata 1
+    unsigned int switchable_motion_mode : 1;  // defined as is_motion_mode_switchable in AV1 specification
+    unsigned int use_ref_frame_mvs : 1; // 1: current frame can use the previous frame mv information, 0: will not use.
+    unsigned int disable_frame_end_update_cdf : 1; // 1: indicates that the end of frame CDF update is disabled
+    unsigned int delta_q_present : 1;              // quantizer index delta values are present in the block level
+    unsigned int delta_q_res : 2;    // left shift which should be applied to decoded quantizer index delta values
+    unsigned int using_qmatrix : 1;  // 1: quantizer matrix will be used to compute quantizers
+    unsigned int coded_lossless : 1; // 1: all segments use lossless coding
+    unsigned int use_superres : 1;   // 1: superres enabled for frame
+    unsigned int tx_mode : 2;        // 0: ONLY4x4,1:LARGEST,2:SELECT
+    unsigned int reference_mode : 1; // 0: SINGLE, 1: SELECT
+    unsigned int
+        allow_warped_motion : 1; // 1: allow_warped_motion may be present, 0: allow_warped_motion will not be present
+    unsigned int reduced_tx_set : 1;  // 1: frame is restricted to subset of the full set of transform types, 0: no such
+                                      // restriction
+    unsigned int skip_mode : 1;       // 1: most of the mode info is skipped, 0: mode info is not skipped
+    unsigned int reserved1_3bits : 3; // reserved bits; must be set to 0
+
+    // tiling info
+    unsigned int num_tile_cols : 8;           // number of tiles across the frame., max is 64
+    unsigned int num_tile_rows : 8;           // number of tiles down the frame., max is 64
+    unsigned int context_update_tile_id : 16; // specifies which tile to use for the CDF update
+    unsigned short tile_widths[64];           // Width of each column in superblocks
+    unsigned short tile_heights[64];          // height of each row in superblocks
+
+    // CDEF - refer to section 6.10.14 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char cdef_damping_minus_3 : 2; // controls the amount of damping in the deringing filter
+    unsigned char cdef_bits : 2;            // the number of bits needed to specify which CDEF filter to apply
+    unsigned char reserved2_4bits : 4;      // reserved bits; must be set to 0
+    unsigned char cdef_y_strength[8];       // 0-3 bits: y_pri_strength, 4-7 bits y_sec_strength
+    unsigned char cdef_uv_strength[8];      // 0-3 bits: uv_pri_strength, 4-7 bits uv_sec_strength
+
+    // SkipModeFrames
+    unsigned char
+        SkipModeFrame0 : 4; // specifies the frames to use for compound prediction when skip_mode is equal to 1.
+    unsigned char SkipModeFrame1 : 4;
+
+    // qp information - refer to section 6.8.11 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char base_qindex; // indicates the base frame qindex. Defined as base_q_idx in AV1 specification
+    char qp_y_dc_delta_q;      // indicates the Y DC quantizer relative to base_q_idx. Defined as DeltaQYDc in AV1
+                               // specification
+    char qp_u_dc_delta_q;      // indicates the U DC quantizer relative to base_q_idx. Defined as DeltaQUDc in AV1
+                               // specification
+    char qp_v_dc_delta_q;      // indicates the V DC quantizer relative to base_q_idx. Defined as DeltaQVDc in AV1
+                               // specification
+    char qp_u_ac_delta_q;      // indicates the U AC quantizer relative to base_q_idx. Defined as DeltaQUAc in AV1
+                               // specification
+    char qp_v_ac_delta_q;      // indicates the V AC quantizer relative to base_q_idx. Defined as DeltaQVAc in AV1
+                               // specification
+    unsigned char qm_y; // specifies the level in the quantizer matrix that should be used for luma plane decoding
+    unsigned char qm_u; // specifies the level in the quantizer matrix that should be used for chroma U plane decoding
+    unsigned char qm_v; // specifies the level in the quantizer matrix that should be used for chroma V plane decoding
+
+    // segmentation - refer to section 6.8.13 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char segmentation_enabled : 1;    // 1 indicates that this frame makes use of the segmentation tool
+    unsigned char segmentation_update_map : 1; // 1 indicates that the segmentation map are updated during the decoding
+                                               // of this frame
+    unsigned char
+        segmentation_update_data : 1; // 1 indicates that new parameters are about to be specified for each segment
+    unsigned char segmentation_temporal_update : 1; // 1 indicates that the updates to the segmentation map are coded
+                                                    // relative to the existing segmentation map
+    unsigned char reserved3_4bits : 4;              // reserved bits; must be set to 0
+    short segmentation_feature_data[8][8];          // specifies the feature data for a segment feature
+    unsigned char
+        segmentation_feature_mask[8]; // indicates that the corresponding feature is unused or feature value is coded
+
+    // loopfilter - refer to section 6.8.10 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char loop_filter_level[2];  // contains loop filter strength values
+    unsigned char loop_filter_level_u;   // loop filter strength value of U plane
+    unsigned char loop_filter_level_v;   // loop filter strength value of V plane
+    unsigned char loop_filter_sharpness; // indicates the sharpness level
+    char loop_filter_ref_deltas[8]; // contains the adjustment needed for the filter level based on the chosen reference
+                                    // frame
+    char loop_filter_mode_deltas[2]; // contains the adjustment needed for the filter level based on the chosen mode
+    unsigned char loop_filter_delta_enabled : 1; // indicates that the filter level depends on the mode and reference
+                                                 // frame used to predict a block
+    unsigned char loop_filter_delta_update : 1;  // indicates that additional syntax elements are present that specify
+                                                 // which mode and reference frame deltas are to be updated
+    unsigned char delta_lf_present : 1; // specifies whether loop filter delta values are present in the block level
+    unsigned char delta_lf_res : 2;     // specifies the left shift to apply to the decoded loop filter values
+    unsigned char delta_lf_multi : 1;   // separate loop filter deltas for Hy,Vy,U,V edges
+    unsigned char reserved4_2bits : 2;  // reserved bits; must be set to 0
+
+    // restoration - refer to section 6.10.15 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char lr_unit_size[3]; // specifies the size of loop restoration units: 0: 32, 1: 64, 2: 128, 3: 256
+    unsigned char lr_type[3];      // used to compute FrameRestorationType
+
+    // reference frames
+    unsigned char primary_ref_frame; // specifies which reference frame contains the CDF values and other state that
+                                     // should be loaded at the start of the frame
+    unsigned char ref_frame_map[8];  // frames in dpb that can be used as reference for current or future frames
+
+    unsigned char temporal_layer_id : 4; // temporal layer id
+    unsigned char spatial_layer_id : 4;  // spatial layer id
+
+    unsigned char reserved5_32bits[4]; // reserved bits; must be set to 0
+
+    // ref frame list
+    struct {
+        unsigned int width;
+        unsigned int height;
+        unsigned char index;
+        unsigned char reserved24Bits[3]; // reserved bits; must be set to 0
+    } ref_frame[7];                      // frames used as reference frame for current frame.
+
+    // global motion
+    struct {
+        unsigned char invalid : 1;
+        unsigned char wmtype : 2;        // defined as GmType in AV1 specification
+        unsigned char reserved5Bits : 5; // reserved bits; must be set to 0
+        char reserved24Bits[3];          // reserved bits; must be set to 0
+        int wmmat[6];                    // defined as gm_params[] in AV1 specification
+    } global_motion[7];                  // global motion params for reference frames
+
+    // film grain params - refer to section 6.8.20 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned short apply_grain : 1;
+    unsigned short overlap_flag : 1;
+    unsigned short scaling_shift_minus8 : 2;
+    unsigned short chroma_scaling_from_luma : 1;
+    unsigned short ar_coeff_lag : 2;
+    unsigned short ar_coeff_shift_minus6 : 2;
+    unsigned short grain_scale_shift : 2;
+    unsigned short clip_to_restricted_range : 1;
+    unsigned short reserved6_4bits : 4; // reserved bits; must be set to 0
+    unsigned char num_y_points;
+    unsigned char scaling_points_y[14][2];
+    unsigned char num_cb_points;
+    unsigned char scaling_points_cb[10][2];
+    unsigned char num_cr_points;
+    unsigned char scaling_points_cr[10][2];
+    unsigned char reserved7_8bits; // reserved bits; must be set to 0
+    unsigned short random_seed;
+    short ar_coeffs_y[24];
+    short ar_coeffs_cb[25];
+    short ar_coeffs_cr[25];
+    unsigned char cb_mult;
+    unsigned char cb_luma_mult;
+    short cb_offset;
+    unsigned char cr_mult;
+    unsigned char cr_luma_mult;
+    short cr_offset;
+
+    int reserved[7]; // reserved bits; must be set to 0
+} CUVIDAV1PICPARAMS;
+
+/******************************************************************************************/
+//! \struct CUVIDPICPARAMS
+//! Picture parameters for decoding
+//! This structure is used in cuvidDecodePicture API
+//! IN  for cuvidDecodePicture
+/******************************************************************************************/
+typedef struct _CUVIDPICPARAMS {
+    int PicWidthInMbs;     /**< IN: Coded frame width in macroblocks                          */
+    int FrameHeightInMbs;  /**< IN: Coded frame height in macroblocks                         */
+    int CurrPicIdx;        /**< IN: Output index of the current picture                       */
+    int field_pic_flag;    /**< IN: 0=frame picture, 1=field picture                          */
+    int bottom_field_flag; /**< IN: 0=top field, 1=bottom field (ignored if field_pic_flag=0) */
+    int second_field;      /**< IN: Second field of a complementary field pair                */
+    // Bitstream data
+    unsigned int nBitstreamDataLen;        /**< IN: Number of bytes in bitstream data buffer                  */
+    const unsigned char *pBitstreamData;   /**< IN: Ptr to bitstream data for this picture (slice-layer)      */
+    unsigned int nNumSlices;               /**< IN: Number of slices in this picture                          */
+    const unsigned int *pSliceDataOffsets; /**< IN: nNumSlices entries, contains offset of each slice within
+                                                        the bitstream data buffer                             */
+    int ref_pic_flag;                      /**< IN: This picture is a reference picture                       */
+    int intra_pic_flag;                    /**< IN: This picture is entirely intra coded                      */
+    unsigned int Reserved[30];             /**< Reserved for future use                                       */
+    // IN: Codec-specific data; the members below overlay the same storage, one per codec
+    union {
+        CUVIDMPEG2PICPARAMS mpeg2; /**< Also used for MPEG-1 */
+        CUVIDH264PICPARAMS h264;
+        CUVIDVC1PICPARAMS vc1;
+        CUVIDMPEG4PICPARAMS mpeg4;
+        CUVIDJPEGPICPARAMS jpeg;
+        CUVIDHEVCPICPARAMS hevc;
+        CUVIDVP8PICPARAMS vp8;
+        CUVIDVP9PICPARAMS vp9;
+        CUVIDAV1PICPARAMS av1;
+        unsigned int CodecReserved[1024]; /**< NOTE(review): presumably fixes the size of the union - confirm */
+    } CodecSpecific;
+} CUVIDPICPARAMS;
+
+/******************************************************/
+//! \struct CUVIDPROCPARAMS
+//! Picture parameters for postprocessing
+//! This structure is used in cuvidMapVideoFrame API
+/******************************************************/
+typedef struct _CUVIDPROCPARAMS {
+    int progressive_frame; /**< IN: Input is progressive (deinterlace_mode will be ignored)                */
+    int second_field;      /**< IN: Output the second field (ignored if deinterlace mode is Weave)         */
+    int top_field_first;   /**< IN: Input frame is top field first (1st field is top, 2nd field is bottom) */
+    int unpaired_field;    /**< IN: Input only contains one field (2nd field is invalid)                   */
+    // The fields below are used for raw YUV input
+    unsigned int reserved_flags; /**< Reserved for future use (set to zero)                                      */
+    unsigned int reserved_zero;  /**< Reserved (set to zero)                                                     */
+    unsigned long long raw_input_dptr; /**< IN: Input CUdeviceptr for raw YUV extensions (held as an integer) */
+    unsigned int raw_input_pitch;  /**< IN: pitch in bytes of raw YUV input (should be aligned appropriately)      */
+    unsigned int raw_input_format; /**< IN: Input YUV format (cudaVideoCodec_enum)                                 */
+    unsigned long long raw_output_dptr; /**< IN: Output CUdeviceptr for raw YUV extensions (held as an integer) */
+    unsigned int raw_output_pitch; /**< IN: pitch in bytes of raw YUV output (should be aligned appropriately)     */
+    unsigned int Reserved1;        /**< Reserved for future use (set to zero)                                      */
+    CUstream output_stream;        /**< IN: stream object used by cuvidMapVideoFrame                               */
+    unsigned int Reserved[46];     /**< Reserved for future use (set to zero)                                      */
+    unsigned long long *histogram_dptr; /**< OUT: Output CUdeviceptr for histogram extensions (a pointer field) */
+    void *Reserved2[1]; /**< Reserved for future use (set to zero)                                      */
+} CUVIDPROCPARAMS;
+
+/*********************************************************************************************************/
+//! \struct CUVIDGETDECODESTATUS
+//! Struct for reporting decode status.
+//! This structure is used in cuvidGetDecodeStatus API.
+/*********************************************************************************************************/
+typedef struct _CUVIDGETDECODESTATUS {
+    cuvidDecodeStatus decodeStatus; /**< Decode status being reported (a cuvidDecodeStatus value)   */
+    unsigned int reserved[31];      /**< NOTE(review): presumably reserved for future use - confirm */
+    void *pReserved[8];             /**< NOTE(review): presumably reserved for future use - confirm */
+} CUVIDGETDECODESTATUS;
+
+/****************************************************/
+//! \struct CUVIDRECONFIGUREDECODERINFO
+//! Struct for decoder reset
+//! This structure is used in cuvidReconfigureDecoder() API
+/****************************************************/
+typedef struct _CUVIDRECONFIGUREDECODERINFO {
+    unsigned int
+        ulWidth; /**< IN: Coded sequence width in pixels, MUST be <= ulMaxWidth defined at CUVIDDECODECREATEINFO   */
+    unsigned int
+        ulHeight; /**< IN: Coded sequence height in pixels, MUST be <= ulMaxHeight defined at CUVIDDECODECREATEINFO   */
+    unsigned int ulTargetWidth;       /**< IN: Post processed output width */
+    unsigned int ulTargetHeight;      /**< IN: Post Processed output height */
+    unsigned int ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode surfaces */
+    unsigned int reserved1[12];       /**< Reserved for future use. Set to Zero */
+    /**
+     * IN: Area of frame to be displayed. Use-case : Source Cropping
+     */
+    struct {
+        short left;   /**< IN: left edge of the display area   */
+        short top;    /**< IN: top edge of the display area    */
+        short right;  /**< IN: right edge of the display area  */
+        short bottom; /**< IN: bottom edge of the display area */
+    } display_area;
+    /**
+     * IN: Target Rectangle in the OutputFrame. Use-case : Aspect ratio Conversion
+     */
+    struct {
+        short left;   /**< IN: left edge of the target rectangle   */
+        short top;    /**< IN: top edge of the target rectangle    */
+        short right;  /**< IN: right edge of the target rectangle  */
+        short bottom; /**< IN: bottom edge of the target rectangle */
+    } target_rect;
+    unsigned int reserved2[11]; /**< Reserved for future use. Set to Zero */
+} CUVIDRECONFIGUREDECODERINFO;
+
+/***********************************************************************************************************/
+//! VIDEO_DECODER
+//!
+//! In order to minimize decode latencies, there should be always at least 2 pictures in the decode
+//! queue at any time, in order to make sure that all decode engines are always busy.
+//!
+//! Overall data flow:
+//!  - cuvidGetDecoderCaps(...)
+//!  - cuvidCreateDecoder(...)
+//!  - For each picture:
+//!    + cuvidDecodePicture(N)
+//!    + cuvidMapVideoFrame(N-4)
+//!    + do some processing in cuda
+//!    + cuvidUnmapVideoFrame(N-4)
+//!    + cuvidDecodePicture(N+1)
+//!    + cuvidMapVideoFrame(N-3)
+//!    + ...
+//!  - cuvidDestroyDecoder(...)
+//!
+//! NOTE:
+//! - When the cuda context is created from a D3D device, the D3D device must also be created
+//!   with the D3DCREATE_MULTITHREADED flag.
+//! - There is a limit to how many pictures can be mapped simultaneously (ulNumOutputSurfaces)
+//! - cuvidDecodePicture may block the calling thread if there are too many pictures pending
+//!   in the decode queue
+/***********************************************************************************************************/
+
+/**********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc)
+//! Queries decode capabilities of NVDEC-HW based on CodecType, ChromaFormat and BitDepthMinus8 parameters.
+//! 1. Application fills IN parameters CodecType, ChromaFormat and BitDepthMinus8 of CUVIDDECODECAPS structure
+//! 2. On calling cuvidGetDecoderCaps, driver fills OUT parameters if the IN parameters are supported
+//!    If IN parameters passed to the driver are not supported by NVDEC-HW, then all OUT params are set to 0.
+//! E.g. on Geforce GTX 960:
+//!   App fills - eCodecType = cudaVideoCodec_H264; eChromaFormat = cudaVideoChromaFormat_420; nBitDepthMinus8 = 0;
+//!   Given IN parameters are supported, hence driver fills: bIsSupported = 1; nMinWidth   = 48; nMinHeight  = 16;
+//!   nMaxWidth = 4096; nMaxHeight = 4096; nMaxMBCount = 65536;
+//! CodedWidth*CodedHeight/256 must be less than or equal to nMaxMBCount
+/**********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci)
+//! Create the decoder object based on pdci. A handle to the created decoder is returned
+/*****************************************************************************************************/
+extern CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder)
+//! Destroy the decoder object
+/*****************************************************************************************************/
+extern CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams)
+//! Decode a single picture (field or frame)
+//! Kicks off HW decoding
+/*****************************************************************************************************/
+extern CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams);
+
+/************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidGetDecodeStatus(CUvideodecoder hDecoder, int nPicIdx, CUVIDGETDECODESTATUS *pDecodeStatus)
+//! Get the decode status for frame corresponding to nPicIdx; the status is reported through pDecodeStatus
+//! API is supported for Maxwell and above generation GPUs.
+//! API is currently supported for HEVC, H264 and JPEG codecs.
+//! API returns CUDA_ERROR_NOT_SUPPORTED error code for unsupported GPU or codec.
+/************************************************************************************************************/
+extern CUresult CUDAAPI cuvidGetDecodeStatus(CUvideodecoder hDecoder, int nPicIdx, CUVIDGETDECODESTATUS *pDecodeStatus);
+
+/*********************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidReconfigureDecoder(CUvideodecoder hDecoder,
+//!                                              CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams)
+//! Used to reuse single decoder for multiple clips. Currently supports resolution change, resize params, display
+//! area params, target area params change for same codec. Must be called during CUVIDPARSERPARAMS::pfnSequenceCallback
+/*********************************************************************************************************/
+extern CUresult CUDAAPI cuvidReconfigureDecoder(CUvideodecoder hDecoder,
+                                                CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams);
+
+#if !defined(__CUVID_DEVPTR64) || defined(__CUVID_INTERNAL) // variants taking the device pointer as unsigned int
+/************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr,
+//!                                         unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
+//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and
+//! associated pitch of the video frame
+/************************************************************************************************************************/
+extern CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr,
+                                           unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr)
+//! Unmap a previously mapped video frame
+/*****************************************************************************************************/
+extern CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr);
+#endif // !__CUVID_DEVPTR64 || __CUVID_INTERNAL
+
+/****************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
+//!                                           unsigned int * pPitch, CUVIDPROCPARAMS *pVPP);
+//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and
+//! associated pitch of the video frame
+/****************************************************************************************************************************/
+extern CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
+                                             unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
+
+/**************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
+//! Unmap a previously mapped video frame
+/**************************************************************************************************/
+extern CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
+
+#if defined(__CUVID_DEVPTR64) && !defined(__CUVID_INTERNAL) // alias the unsuffixed names to the *64 entry points
+#define cuvidMapVideoFrame cuvidMapVideoFrame64
+#define cuvidUnmapVideoFrame cuvidUnmapVideoFrame64
+#endif // __CUVID_DEVPTR64 && !__CUVID_INTERNAL
+
+/********************************************************************************************************************/
+//!
+//! Context-locking: to facilitate multi-threaded implementations, the following 4 functions
+//! provide a simple mutex-style host synchronization. If a non-NULL context is specified
+//! in CUVIDDECODECREATEINFO, the codec library will acquire the mutex associated with the given
+//! context before making any cuda calls.
+//! A multi-threaded application could create a lock associated with a context handle so that
+//! multiple threads can safely share the same cuda context:
+//!  - use cuCtxPopCurrent immediately after context creation in order to create a 'floating' context
+//!    that can be passed to cuvidCtxLockCreate.
+//!  - When using a floating context, all cuda calls should only be made within a cuvidCtxLock/cuvidCtxUnlock section.
+//!
+//! NOTE: This is a safer alternative to cuCtxPushCurrent and cuCtxPopCurrent, and is not related to video
+//! decoder in any way (implemented as a critical section associated with cuCtx{Push|Pop}Current calls).
+/********************************************************************************************************************/
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx)
+//! This API is used to create CtxLock object
+/********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx);
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck)
+//! This API is used to free CtxLock object
+/********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck);
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags)
+//! This API is used to acquire ctxlock
+/********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags);
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags)
+//! This API is used to release ctxlock
+/********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags);
+
+/**********************************************************************************************/
+
+#if defined(__cplusplus)
+}
+class CCtxAutoLock { // Auto-lock helper for C++ applications; non-copyable so each lock is released exactly once
+    CUvideoctxlock m_ctx;
+    CCtxAutoLock(const CCtxAutoLock &);            // private, never defined: a copy would unlock twice
+    CCtxAutoLock &operator=(const CCtxAutoLock &); // private, never defined, for the same reason
+
+  public:
+    CCtxAutoLock(CUvideoctxlock ctx) : m_ctx(ctx) { cuvidCtxLock(m_ctx, 0); }
+    ~CCtxAutoLock() { cuvidCtxUnlock(m_ctx, 0); }
+};
+#endif /* __cplusplus */
+
+#endif // __CUDA_VIDEO_H__
diff --git a/third_party/Video_Codec_SDK/Interface/nvcuvid.h b/third_party/Video_Codec_SDK/Interface/nvcuvid.h
new file mode 100644
index 00000000..d4691672
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Interface/nvcuvid.h
@@ -0,0 +1,486 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/********************************************************************************************************************/
+//! \file nvcuvid.h
+//!   NVDECODE API provides video decoding interface to NVIDIA GPU devices.
+//! \date 2015-2022
+//!  This file contains the interface constants, structure definitions and function prototypes.
+/********************************************************************************************************************/
+
+#if !defined(__NVCUVID_H__)
+#define __NVCUVID_H__
+
+#include "cuviddec.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#define MAX_CLOCK_TS 3 // capacity of HEVCSEITIMECODE::time_code_set
+
+/***********************************************/
+//!
+//! High-level helper APIs for video sources
+//!
+/***********************************************/
+
+typedef void *CUvideosource;        // untyped (void *) handle for a video source
+typedef void *CUvideoparser;        // untyped (void *) handle for a video parser
+typedef long long CUvideotimestamp; // timestamp value; NOTE(review): units are not stated here - confirm
+
+/************************************************************************/
+//! \enum cudaVideoState
+//! Video source state enums
+//! Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs
+/************************************************************************/
+typedef enum {
+    cudaVideoState_Error = -1,  /**< Error state (invalid source)                  */
+    cudaVideoState_Stopped = 0, /**< Source is stopped (or reached end-of-stream)  */
+    cudaVideoState_Started = 1  /**< Source is running and delivering data         */
+} cudaVideoState;
+
+/************************************************************************/
+//! \enum cudaAudioCodec
+//! Audio compression enums
+//! Used in CUAUDIOFORMAT structure
+/************************************************************************/
+typedef enum {
+    cudaAudioCodec_MPEG1 = 0, /**< MPEG-1 Audio                    */
+    cudaAudioCodec_MPEG2,     /**< MPEG-2 Audio (= 1)              */
+    cudaAudioCodec_MP3,       /**< MPEG-1 Layer III Audio (= 2)    */
+    cudaAudioCodec_AC3,       /**< Dolby Digital (AC3) Audio (= 3) */
+    cudaAudioCodec_LPCM,      /**< PCM Audio (= 4)                 */
+    cudaAudioCodec_AAC,       /**< AAC Audio (= 5)                 */
+} cudaAudioCodec;
+
+/************************************************************************/
+//! \ingroup STRUCTS
+//! \struct HEVCTIMECODESET
+//! Used to store Time code extracted from Time code SEI in HEVC codec
+/************************************************************************/
+typedef struct _HEVCTIMECODESET { // one time code entry; HEVCSEITIMECODE holds MAX_CLOCK_TS of these
+    unsigned int time_offset_value;
+    unsigned short n_frames;
+    unsigned char clock_timestamp_flag;
+    unsigned char units_field_based_flag;
+    unsigned char counting_type;
+    unsigned char full_timestamp_flag;
+    unsigned char discontinuity_flag;
+    unsigned char cnt_dropped_flag;
+    unsigned char seconds_value;
+    unsigned char minutes_value;
+    unsigned char hours_value;
+    unsigned char seconds_flag;
+    unsigned char minutes_flag;
+    unsigned char hours_flag;
+    unsigned char time_offset_length;
+    unsigned char reserved; // NOTE(review): presumably unused padding - confirm
+} HEVCTIMECODESET;
+
+/************************************************************************/
+//! \ingroup STRUCTS
+//! \struct HEVCSEITIMECODE
+//! Used to extract Time code SEI in HEVC codec
+/************************************************************************/
+typedef struct _HEVCSEITIMECODE {
+    HEVCTIMECODESET time_code_set[MAX_CLOCK_TS]; // fixed-capacity array of time code entries
+    unsigned char num_clock_ts;                  // NOTE(review): presumably the number of valid entries - confirm
+} HEVCSEITIMECODE;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUSEIMESSAGE;
+//! Used in CUVIDSEIMESSAGEINFO structure
+/**********************************************************************************/
+typedef struct _CUSEIMESSAGE {
+    unsigned char sei_message_type; /**< OUT: SEI Message Type      */
+    unsigned char reserved[3];      /**< NOTE(review): presumably reserved padding - confirm */
+    unsigned int sei_message_size;  /**< OUT: SEI Message Size      */
+} CUSEIMESSAGE;
+
+/************************************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDEOFORMAT
+//! Video format
+//! Used in cuvidGetSourceVideoFormat API
+/************************************************************************************************/
+typedef struct {
+    cudaVideoCodec codec; /**< OUT: Compression format          */
+                          /**
+                           * OUT: frame rate = numerator / denominator (for example: 30000/1001)
+                           */
+    struct {
+        /** OUT: frame rate numerator   (0 = unspecified or variable frame rate) */
+        unsigned int numerator;
+        /** OUT: frame rate denominator (0 = unspecified or variable frame rate) */
+        unsigned int denominator;
+    } frame_rate;
+    unsigned char progressive_sequence;    /**< OUT: 0=interlaced, 1=progressive                                      */
+    unsigned char bit_depth_luma_minus8;   /**< OUT: high bit depth luma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth   */
+    unsigned char bit_depth_chroma_minus8; /**< OUT: high bit depth chroma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */
+    unsigned char min_num_decode_surfaces; /**< OUT: Minimum number of decode surfaces to be allocated for correct
+                                                     decoding. The client can send this value in ulNumDecodeSurfaces
+                                                     (in CUVIDDECODECREATEINFO structure).
+                                                     This guarantees correct functionality and optimal video memory
+                                                     usage but not necessarily the best performance, which depends on
+                                                     the design of the overall application. The optimal number of
+                                                     decode surfaces (in terms of performance and memory utilization)
+                                                     should be decided by experimentation for each application, but it
+                                                     cannot go below min_num_decode_surfaces.
+                                                     If this value is used for ulNumDecodeSurfaces then it must be
+                                                     returned to parser during sequence callback.                     */
+    unsigned int coded_width;              /**< OUT: coded frame width in pixels                                      */
+    unsigned int coded_height;             /**< OUT: coded frame height in pixels                                     */
+                                           /**
+                                            * area of the frame that should be displayed
+                                            * typical example:
+                                            * coded_width = 1920, coded_height = 1088
+                                            * display_area = { 0,0,1920,1080 }
+                                            */
+    struct {
+        int left;   /**< OUT: left position of display rect    */
+        int top;    /**< OUT: top position of display rect     */
+        int right;  /**< OUT: right position of display rect   */
+        int bottom; /**< OUT: bottom position of display rect  */
+    } display_area;
+    cudaVideoChromaFormat chroma_format; /**< OUT:  Chroma format                   */
+    unsigned int bitrate;                /**< OUT: video bitrate (bps, 0=unknown)   */
+                                         /**
+                                          * OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc)
+                                          */
+    struct {
+        int x; /**< OUT: x term of the x:y display aspect ratio */
+        int y; /**< OUT: y term of the x:y display aspect ratio */
+    } display_aspect_ratio;
+    /**
+     * Video Signal Description
+     * Refer section E.2.1 (VUI parameters semantics) of H264 spec file
+     */
+    struct {
+        unsigned char video_format : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC, 3-SECAM, 4-MAC, 5-Unspecified     */
+        unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level and luma and chroma range */
+        unsigned char reserved_zero_bits : 4; /**< Reserved bits                                                      */
+        unsigned char color_primaries;        /**< OUT: chromaticity coordinates of source primaries                  */
+        unsigned char
+            transfer_characteristics;      /**< OUT: opto-electronic transfer characteristic of the source picture */
+        unsigned char matrix_coefficients; /**< OUT: used in deriving luma and chroma signals from RGB primaries   */
+    } video_signal_description;
+    unsigned int seqhdr_data_length; /**< OUT: Additional bytes following (CUVIDEOFORMATEX)                  */
+} CUVIDEOFORMAT;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDOPERATINGPOINTINFO
+//! Operating point information of scalable bitstream
+/****************************************************************/
+typedef struct {
+    cudaVideoCodec codec; // NOTE(review): presumably selects the valid union member (only av1 is defined) - confirm
+    union {
+        struct {
+            unsigned char operating_points_cnt;
+            unsigned char reserved24_bits[3];
+            unsigned short operating_points_idc[32];
+        } av1;
+        unsigned char CodecReserved[1024]; // keeps the union at least 1024 bytes in size
+    };
+} CUVIDOPERATINGPOINTINFO;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSEIMESSAGEINFO
+//! Used in cuvidParseVideoData API with PFNVIDSEIMSGCALLBACK pfnGetSEIMsg
+/**********************************************************************************/
+typedef struct _CUVIDSEIMESSAGEINFO {
+    void *pSEIData;                 /**< OUT: SEI Message Data      */
+    CUSEIMESSAGE *pSEIMessage;      /**< OUT: SEI Message Info      */
+    unsigned int sei_message_count; /**< OUT: SEI Message Count     */
+    unsigned int picIdx;            /**< OUT: SEI Message Pic Index */
+} CUVIDSEIMESSAGEINFO;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDAV1SEQHDR
+//! AV1 specific sequence header information
+/****************************************************************/
+typedef struct {
+    unsigned int max_width;       /**< Max width of the sequence (NvDecoder uses it as decoder ulMaxWidth)   */
+    unsigned int max_height;      /**< Max height of the sequence (NvDecoder uses it as decoder ulMaxHeight) */
+    unsigned char reserved[1016]; /**< Reserved; presumably pads the struct to match raw_seqhdr_data[1024]   */
+} CUVIDAV1SEQHDR;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDEOFORMATEX
+//! Video format including raw sequence header information
+//! Used in cuvidGetSourceVideoFormat API
+/****************************************************************/
+typedef struct {
+    CUVIDEOFORMAT format; /**< OUT: CUVIDEOFORMAT structure */
+    union {
+        CUVIDAV1SEQHDR av1; /**< OUT: AV1 sequence header info (shares storage with raw_seqhdr_data) */
+        unsigned char raw_seqhdr_data[1024]; /**< OUT: Sequence header data    */
+    };
+} CUVIDEOFORMATEX;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUAUDIOFORMAT
+//! Audio formats
+//! Used in cuvidGetSourceAudioFormat API
+/****************************************************************/
+typedef struct {
+    cudaAudioCodec codec;       /**< OUT: Compression format                                              */
+    unsigned int channels;      /**< OUT: number of audio channels                                        */
+    unsigned int samplespersec; /**< OUT: sampling frequency                                              */
+    unsigned int bitrate;       /**< OUT: For uncompressed, can also be used to determine bits per sample */
+    unsigned int reserved1;     /**< Reserved for future use                                              */
+    unsigned int reserved2;     /**< Reserved for future use                                              */
+} CUAUDIOFORMAT;
+
+/***************************************************************/
+//! \enum CUvideopacketflags
+//! Data packet flags
+//! Used in CUVIDSOURCEDATAPACKET structure
+/***************************************************************/
+typedef enum {
+    CUVID_PKT_ENDOFSTREAM = 0x01, /**< Set when this is the last packet for this stream                              */
+    CUVID_PKT_TIMESTAMP = 0x02,   /**< Timestamp is valid                                                            */
+    CUVID_PKT_DISCONTINUITY = 0x04, /**< Set when a discontinuity has to be signalled */
+    CUVID_PKT_ENDOFPICTURE = 0x08, /**< Set when the packet contains exactly one frame or one field                   */
+    CUVID_PKT_NOTIFY_EOS = 0x10,   /**< If this flag is set along with CUVID_PKT_ENDOFSTREAM, an additional (dummy)
+                                        display callback will be invoked with null value of CUVIDPARSERDISPINFO which
+                                        should be interpreted as end of the stream.                                   */
+} CUvideopacketflags;
+
+/*****************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSOURCEDATAPACKET
+//! Data Packet
+//! Used in cuvidParseVideoData API
+//! IN for cuvidParseVideoData
+/*****************************************************************************/
+typedef struct _CUVIDSOURCEDATAPACKET {
+    unsigned long flags;          /**< IN: Combination of CUVID_PKT_XXX flags                              */
+    unsigned long payload_size;   /**< IN: number of bytes in the payload (may be zero if EOS flag is set) */
+    const unsigned char *payload; /**< IN: Pointer to packet payload data (may be NULL if EOS flag is set) */
+    CUvideotimestamp timestamp;   /**< IN: Presentation time stamp (10MHz clock), only valid if
+                                           CUVID_PKT_TIMESTAMP flag is set                                 */
+} CUVIDSOURCEDATAPACKET;
+
+// Callback for packet delivery
+typedef int(CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *);
+
+/**************************************************************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSOURCEPARAMS
+//! Describes parameters needed in cuvidCreateVideoSource API
+//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all
+//! supported containers. It's recommended to clients to use their own or third party demuxer if audio support is
+//! needed.
+/**************************************************************************************************************************/
+typedef struct _CUVIDSOURCEPARAMS {
+    unsigned int ulClockRate;                 /**< IN: Time stamp units in Hz (0=default=10000000Hz)      */
+    unsigned int bAnnexb : 1;                 /**< IN: AV1 annexB stream                                  */
+    unsigned int uReserved : 31;              /**< Reserved for future use - set to zero                  */
+    unsigned int uReserved1[6];               /**< Reserved for future use - set to zero                  */
+    void *pUserData;                          /**< IN: User private data passed in to the data handlers   */
+    PFNVIDSOURCECALLBACK pfnVideoDataHandler; /**< IN: Called to deliver video packets                    */
+    PFNVIDSOURCECALLBACK pfnAudioDataHandler; /**< IN: Called to deliver audio packets.                   */
+    void *pvReserved2[8];                     /**< Reserved for future use - set to NULL                  */
+} CUVIDSOURCEPARAMS;
+
+/**********************************************/
+//! \ingroup ENUMS
+//! \enum CUvideosourceformat_flags
+//! CUvideosourceformat_flags
+//! Used in cuvidGetSourceVideoFormat API
+/**********************************************/
+typedef enum {
+    CUVID_FMT_EXTFORMATINFO = 0x100 /**< Return extended format structure (CUVIDEOFORMATEX) */
+} CUvideosourceformat_flags;
+
+#if !defined(__APPLE__)
+/***************************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams)
+//! Create CUvideosource object. CUvideosource spawns demultiplexer thread that provides two callbacks:
+//! pfnVideoDataHandler() and pfnAudioDataHandler()
+//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all
+//! supported containers. It's recommended to clients to use their own or third party demuxer if audio support is
+//! needed.
+/***************************************************************************************************************************/
+CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams);
+
+/***************************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams)
+//! Create video source
+/***************************************************************************************************************************/
+CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams);
+
+/********************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj)
+//! Destroy video source
+/********************************************************************/
+CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj);
+
+/******************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state)
+//! Set video source state to:
+//! cudaVideoState_Started - to signal the source to run and deliver data
+//! cudaVideoState_Stopped - to stop the source from delivering the data
+//! cudaVideoState_Error   - invalid source
+/******************************************************************************************/
+CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state);
+
+/******************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj)
+//! Get video source state
+//! Returns:
+//! cudaVideoState_Started - if Source is running and delivering data
+//! cudaVideoState_Stopped - if Source is stopped or reached end-of-stream
+//! cudaVideoState_Error   - if Source is in error state
+/******************************************************************************************/
+cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj);
+
+/******************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags)
+//! Gets video source format in pvidfmt, flags is set to combination of CUvideosourceformat_flags as per requirement
+/******************************************************************************************************************/
+CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags);
+
+/**************************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags)
+//! Get audio source format
+//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all
+//! supported containers. It's recommended to clients to use their own or third party demuxer if audio support is
+//! needed.
+/**************************************************************************************************************************/
+CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags);
+
+#endif
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDPARSERDISPINFO
+//! Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture
+/**********************************************************************************/
+typedef struct _CUVIDPARSERDISPINFO {
+    int picture_index; /**< OUT: Index of the current picture                                                         */
+    int progressive_frame;      /**< OUT: 1 if progressive frame; 0 otherwise      */
+    int top_field_first;        /**< OUT: 1 if top field is displayed first; 0 otherwise        */
+    int repeat_first_field;     /**< OUT: Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling,
+                                     -1=unpaired field)     */
+    CUvideotimestamp timestamp; /**< OUT: Presentation time stamp */
+} CUVIDPARSERDISPINFO;
+
+/***********************************************************************************************************************/
+//! Parser callbacks
+//! The parser will call these synchronously from within cuvidParseVideoData(), whenever there is sequence change or a
+//! picture is ready to be decoded and/or displayed. First argument in functions is "void *pUserData" member of
+//! structure CUVIDSOURCEPARAMS. Return values from these callbacks are interpreted as below. If the callbacks return
+//! failure, it will be propagated by cuvidParseVideoData() to the application. Parser picks default operating point as
+//! 0 and outputAllLayers flag as 0 if PFNVIDOPPOINTCALLBACK is not set or return value is -1 or invalid operating point.
+//! PFNVIDSEQUENCECALLBACK : 0: fail, 1: succeeded, > 1: override dpb size of parser (set by CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces while creating parser)
+//! PFNVIDDECODECALLBACK   : 0: fail, >=1: succeeded
+//! PFNVIDDISPLAYCALLBACK  : 0: fail, >=1: succeeded
+//! PFNVIDOPPOINTCALLBACK  : <0: fail, >=0: succeeded (bit 0-9: OperatingPoint, bit 10-10: outputAllLayers, bit 11-30: reserved)
+//! PFNVIDSEIMSGCALLBACK   : 0: fail, >=1: succeeded
+/***********************************************************************************************************************/
+typedef int(CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *);
+typedef int(CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *);
+typedef int(CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *);
+typedef int(CUDAAPI *PFNVIDOPPOINTCALLBACK)(void *, CUVIDOPERATINGPOINTINFO *);
+typedef int(CUDAAPI *PFNVIDSEIMSGCALLBACK)(void *, CUVIDSEIMESSAGEINFO *);
+
+/**************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDPARSERPARAMS
+//! Used in cuvidCreateVideoParser API
+/**************************************/
+typedef struct _CUVIDPARSERPARAMS {
+    cudaVideoCodec CodecType;            /**< IN: cudaVideoCodec_XXX                                                  */
+    unsigned int ulMaxNumDecodeSurfaces; /**< IN: Max # of decode surfaces (parser will cycle through these)          */
+    unsigned int ulClockRate;            /**< IN: Timestamp units in Hz (0=default=10000000Hz)                        */
+    unsigned int ulErrorThreshold;       /**< IN: % Error threshold (0-100) for calling pfnDecodePicture (100=always
+                                              call pfnDecodePicture even if picture bitstream is fully corrupted)     */
+    unsigned int ulMaxDisplayDelay;      /**< IN: Max display queue delay (improves pipelining of decode with display)
+                                                  0=no delay (recommended values: 2..4)                               */
+    unsigned int bAnnexb : 1;            /**< IN: AV1 annexB stream                                                   */
+    unsigned int uReserved : 31;         /**< Reserved for future use - set to zero                                   */
+    unsigned int uReserved1[4];          /**< IN: Reserved for future use - set to 0                                  */
+    void *pUserData;                     /**< IN: User data for callbacks                                             */
+    PFNVIDSEQUENCECALLBACK
+    pfnSequenceCallback; /**< IN: Called before decoding frames and/or whenever there is a fmt change */
+    PFNVIDDECODECALLBACK pfnDecodePicture; /**< IN: Called when a picture is ready to be decoded (decode order) */
+    PFNVIDDISPLAYCALLBACK
+    pfnDisplayPicture; /**< IN: Called whenever a picture is ready to be displayed (display order)  */
+    PFNVIDOPPOINTCALLBACK pfnGetOperatingPoint; /**< IN: Called from AV1 sequence header to get operating point of an AV1
+                                                         scalable bitstream */
+    PFNVIDSEIMSGCALLBACK pfnGetSEIMsg; /**< IN: Called when all SEI messages are parsed for particular frame        */
+    void *pvReserved2[5];              /**< Reserved for future use - set to NULL                                   */
+    CUVIDEOFORMATEX *pExtVideoInfo;    /**< IN: [Optional] sequence header data from system layer                   */
+} CUVIDPARSERPARAMS;
+
+/************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams)
+//! Create video parser object and initialize
+/************************************************************************************************/
+CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
+
+/************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket)
+//! Parse the video data from source data packet in pPacket
+//! Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket and
+//! calls back pfnDecodePicture with CUVIDPICPARAMS data for kicking of HW decoding
+//! calls back pfnSequenceCallback with CUVIDEOFORMAT data for initial sequence header or when
+//! the decoder encounters a video format change
+//! calls back pfnDisplayPicture with CUVIDPARSERDISPINFO data to display a video frame
+/************************************************************************************************/
+CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
+
+/************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj)
+//! Destroy the video parser
+/************************************************************************************************/
+CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj);
+
+/**********************************************************************************************/
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif // __NVCUVID_H__
diff --git a/third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so b/third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so
new file mode 100644
index 00000000..f08a2095
Binary files /dev/null and b/third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so differ
diff --git a/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp
new file mode 100644
index 00000000..0fd61f44
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp
@@ -0,0 +1,709 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+
+#include "../../../Interface/nvcuvid.h"
+#include "NvDecoder/NvDecoder.h"
+
+std::map<int, int64_t> NvDecoder::sessionOverHead = {{0, 0}, {1, 0}};
+
+/**
+ *   @brief  Returns the codec name string for codec id eCodec; thin wrapper that forwards to GetVideoCodecString()
+ */
+const char *NvDecoder::GetCodecString(cudaVideoCodec eCodec) { return GetVideoCodecString(eCodec); }
+
+/* Invoked when the parser encounters the sequence header of AV1 SVC content.
+ *  Return value interpretation:
+ *      < 0 : fail, >=0: succeeded (bit 0-9: currOperatingPoint, bit 10-10: bDispAllLayer, bit 11-30: reserved, must be
+ * set 0)
+ */
+int NvDecoder::GetOperatingPoint(CUVIDOPERATINGPOINTINFO *pOPInfo) {
+    // Anything other than an AV1 clip with more than one operating point (i.e. SVC enabled) yields -1.
+    const bool bAv1Svc = (pOPInfo->codec == cudaVideoCodec_AV1) && (pOPInfo->av1.operating_points_cnt > 1);
+    if (!bAv1Svc) {
+        return -1;
+    }
+    // Reset the stored operating point to 0 when it is out of range for this clip.
+    if (m_nOperatingPoint >= pOPInfo->av1.operating_points_cnt) {
+        m_nOperatingPoint = 0;
+    }
+    printf("AV1 SVC clip: operating point count %d  ", pOPInfo->av1.operating_points_cnt);
+    printf("Selected operating point: %d, IDC 0x%x bOutputAllLayers %d\n", m_nOperatingPoint,
+           pOPInfo->av1.operating_points_idc[m_nOperatingPoint], m_bDispAllLayers);
+    return (m_nOperatingPoint | (m_bDispAllLayers << 10));
+}
+
+/* Sequence handler: checks the stream against the GPU decode caps, then creates (or reconfigures) the decoder.
+ * The return value is interpreted as: 0: fail, 1: succeeded, > 1: override dpb size of parser (set by
+ * CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces while creating parser)
+ */
+int NvDecoder::HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat) {
+    START_TIMER
+    m_videoInfo.str("");
+    m_videoInfo.clear();
+    m_videoInfo << "Video Input Information" << std::endl
+                << "\tCodec        : " << GetVideoCodecString(pVideoFormat->codec) << std::endl
+                << "\tFrame rate   : " << pVideoFormat->frame_rate.numerator << "/"
+                << pVideoFormat->frame_rate.denominator << " = "
+                << 1.0 * pVideoFormat->frame_rate.numerator / pVideoFormat->frame_rate.denominator << " fps"
+                << std::endl
+                << "\tSequence     : " << (pVideoFormat->progressive_sequence ? "Progressive" : "Interlaced")
+                << std::endl
+                << "\tCoded size   : [" << pVideoFormat->coded_width << ", " << pVideoFormat->coded_height << "]"
+                << std::endl
+                << "\tDisplay area : [" << pVideoFormat->display_area.left << ", " << pVideoFormat->display_area.top
+                << ", " << pVideoFormat->display_area.right << ", " << pVideoFormat->display_area.bottom << "]"
+                << std::endl
+                << "\tChroma       : " << GetVideoChromaFormatString(pVideoFormat->chroma_format) << std::endl
+                << "\tBit depth    : " << pVideoFormat->bit_depth_luma_minus8 + 8;
+    m_videoInfo << std::endl;
+
+    int nDecodeSurface = pVideoFormat->min_num_decode_surfaces;
+    // Query the decode caps for this codec / chroma format / bit depth combination
+    CUVIDDECODECAPS decodecaps;
+    memset(&decodecaps, 0, sizeof(decodecaps));
+
+    decodecaps.eCodecType = pVideoFormat->codec;
+    decodecaps.eChromaFormat = pVideoFormat->chroma_format;
+    decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+
+    if (!decodecaps.bIsSupported) {
+        NVDEC_THROW_ERROR("Codec not supported on this GPU", CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if ((pVideoFormat->coded_width > decodecaps.nMaxWidth) || (pVideoFormat->coded_height > decodecaps.nMaxHeight)) {
+        // Coded size exceeds the maximum width/height reported in the decode caps
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "Resolution          : " << pVideoFormat->coded_width << "x" << pVideoFormat->coded_height
+                    << std::endl
+                    << "Max Supported (wxh) : " << decodecaps.nMaxWidth << "x" << decodecaps.nMaxHeight << std::endl
+                    << "Resolution not supported on this GPU";
+
+        const std::string cErr = errorString.str();
+        NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) > decodecaps.nMaxMBCount) {
+        // Macroblock count (coded size in 16x16 units, hence the >> 4) exceeds the reported maximum
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "MBCount             : " << (pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4)
+                    << std::endl
+                    << "Max Supported mbcnt : " << decodecaps.nMaxMBCount << std::endl
+                    << "MBCount not supported on this GPU";
+
+        const std::string cErr = errorString.str();
+        NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if (m_nWidth && m_nLumaHeight && m_nChromaHeight) {
+
+        // cuvidCreateDecoder() has been called before, and now there's possible config change
+        return ReconfigureDecoder(pVideoFormat);
+    }
+
+    // eCodec has been set in the constructor (for parser). Here it's set again for potential correction
+    m_eCodec = pVideoFormat->codec;
+    m_eChromaFormat = pVideoFormat->chroma_format;
+    m_nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    m_nBPP = m_nBitDepthMinus8 > 0 ? 2 : 1;
+
+    // Set the output surface format same as chroma format (monochrome is output through the 4:2:0 surface formats)
+    if (m_eChromaFormat == cudaVideoChromaFormat_420 || m_eChromaFormat == cudaVideoChromaFormat_Monochrome)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_444)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_422)
+        m_eOutputFormat = cudaVideoSurfaceFormat_NV12; // no 4:2:2 output format supported yet so make 420 default
+
+    // Check if output format supported. If not, check fallback options
+    if (!(decodecaps.nOutputFormatMask & (1 << m_eOutputFormat))) {
+        if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12))
+            m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+        else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016))
+            m_eOutputFormat = cudaVideoSurfaceFormat_P016;
+        else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444;
+        else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444_16Bit;
+        else
+            NVDEC_THROW_ERROR("No supported output format found", CUDA_ERROR_NOT_SUPPORTED);
+    }
+    m_videoFormat = *pVideoFormat;
+
+    CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0};
+    videoDecodeCreateInfo.CodecType = pVideoFormat->codec;
+    videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format;
+    videoDecodeCreateInfo.OutputFormat = m_eOutputFormat;
+    videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    if (pVideoFormat->progressive_sequence)
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
+    else
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive;
+    videoDecodeCreateInfo.ulNumOutputSurfaces = 2;
+    // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware
+    videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
+    videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface;
+    videoDecodeCreateInfo.vidLock = m_ctxLock;
+    videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width;
+    videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height;
+    // AV1 has max width/height of sequence in sequence header
+    if (pVideoFormat->codec == cudaVideoCodec_AV1 && pVideoFormat->seqhdr_data_length > 0) {
+        // dont overwrite if it is already set from cmdline or reconfig.txt
+        if (!(m_nMaxWidth > pVideoFormat->coded_width || m_nMaxHeight > pVideoFormat->coded_height)) {
+            CUVIDEOFORMATEX *vidFormatEx = (CUVIDEOFORMATEX *)pVideoFormat;
+            m_nMaxWidth = vidFormatEx->av1.max_width;
+            m_nMaxHeight = vidFormatEx->av1.max_height;
+        }
+    }
+    if (m_nMaxWidth < (int)pVideoFormat->coded_width)
+        m_nMaxWidth = pVideoFormat->coded_width;
+    if (m_nMaxHeight < (int)pVideoFormat->coded_height)
+        m_nMaxHeight = pVideoFormat->coded_height;
+    videoDecodeCreateInfo.ulMaxWidth = m_nMaxWidth;
+    videoDecodeCreateInfo.ulMaxHeight = m_nMaxHeight;
+    // No crop rect and no resize dimension set: output size comes from the stream's display area
+    if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) {
+        m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+        m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+        videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width;
+        videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height;
+    } else {
+        if (m_resizeDim.w && m_resizeDim.h) {
+            videoDecodeCreateInfo.display_area.left = pVideoFormat->display_area.left;
+            videoDecodeCreateInfo.display_area.top = pVideoFormat->display_area.top;
+            videoDecodeCreateInfo.display_area.right = pVideoFormat->display_area.right;
+            videoDecodeCreateInfo.display_area.bottom = pVideoFormat->display_area.bottom;
+            m_nWidth = m_resizeDim.w;
+            m_nLumaHeight = m_resizeDim.h;
+        }
+
+        if (m_cropRect.r && m_cropRect.b) {
+            videoDecodeCreateInfo.display_area.left = m_cropRect.l;
+            videoDecodeCreateInfo.display_area.top = m_cropRect.t;
+            videoDecodeCreateInfo.display_area.right = m_cropRect.r;
+            videoDecodeCreateInfo.display_area.bottom = m_cropRect.b;
+            m_nWidth = m_cropRect.r - m_cropRect.l;
+            m_nLumaHeight = m_cropRect.b - m_cropRect.t;
+        }
+        videoDecodeCreateInfo.ulTargetWidth = m_nWidth;
+        videoDecodeCreateInfo.ulTargetHeight = m_nLumaHeight;
+    }
+
+    m_nChromaHeight = (int)(ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)));
+    m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+    m_nSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight;
+    m_nSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth;
+    m_displayRect.b = videoDecodeCreateInfo.display_area.bottom;
+    m_displayRect.t = videoDecodeCreateInfo.display_area.top;
+    m_displayRect.l = videoDecodeCreateInfo.display_area.left;
+    m_displayRect.r = videoDecodeCreateInfo.display_area.right;
+
+    m_videoInfo << "Video Decoding Params:" << std::endl
+                << "\tNum Surfaces : " << videoDecodeCreateInfo.ulNumDecodeSurfaces << std::endl
+                << "\tCrop         : [" << videoDecodeCreateInfo.display_area.left << ", "
+                << videoDecodeCreateInfo.display_area.top << ", " << videoDecodeCreateInfo.display_area.right << ", "
+                << videoDecodeCreateInfo.display_area.bottom << "]" << std::endl
+                << "\tResize       : " << videoDecodeCreateInfo.ulTargetWidth << "x"
+                << videoDecodeCreateInfo.ulTargetHeight << std::endl
+                << "\tDeinterlace  : "
+                << std::vector<const char *>{"Weave", "Bob", "Adaptive"}[videoDecodeCreateInfo.DeinterlaceMode];
+    m_videoInfo << std::endl;
+
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidCreateDecoder(&m_hDecoder, &videoDecodeCreateInfo));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    STOP_TIMER("Session Initialization Time: ");
+    NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime);
+    return nDecodeSurface;
+}
+
+int NvDecoder::ReconfigureDecoder(CUVIDEOFORMAT *pVideoFormat) {
+    if (pVideoFormat->bit_depth_luma_minus8 != m_videoFormat.bit_depth_luma_minus8 ||
+        pVideoFormat->bit_depth_chroma_minus8 != m_videoFormat.bit_depth_chroma_minus8) {
+        // Bit depth differs from the stored m_videoFormat: rejected, the decoder is not touched.
+        NVDEC_THROW_ERROR("Reconfigure Not supported for bit depth change", CUDA_ERROR_NOT_SUPPORTED);
+    }
+
+    if (pVideoFormat->chroma_format != m_videoFormat.chroma_format) {
+        // Chroma format differs from the stored m_videoFormat: rejected, the decoder is not touched.
+        NVDEC_THROW_ERROR("Reconfigure Not supported for chroma format change", CUDA_ERROR_NOT_SUPPORTED);
+    }
+
+    bool bDecodeResChange = !(pVideoFormat->coded_width == m_videoFormat.coded_width &&
+                              pVideoFormat->coded_height == m_videoFormat.coded_height);
+    bool bDisplayRectChange = !(pVideoFormat->display_area.bottom == m_videoFormat.display_area.bottom &&
+                                pVideoFormat->display_area.top == m_videoFormat.display_area.top &&
+                                pVideoFormat->display_area.left == m_videoFormat.display_area.left &&
+                                pVideoFormat->display_area.right == m_videoFormat.display_area.right);
+
+    int nDecodeSurface = pVideoFormat->min_num_decode_surfaces; // returned after a successful reconfigure
+
+    if ((pVideoFormat->coded_width > m_nMaxWidth) || (pVideoFormat->coded_height > m_nMaxHeight)) {
+        // For VP9, let the driver handle the change if new width/height > maxwidth/maxheight
+        if ((m_eCodec != cudaVideoCodec_VP9) || m_bReconfigExternal) {
+            NVDEC_THROW_ERROR("Reconfigure Not supported when width/height > maxwidth/maxheight",
+                              CUDA_ERROR_NOT_SUPPORTED);
+        }
+        return 1; // VP9 without an external reconfigure request: nothing is reconfigured here
+    }
+
+    if (!bDecodeResChange && !m_bReconfigExtPPChange) {
+        // if the coded_width/coded_height hasn't changed but display resolution has changed, then need to update
+        // width/height for correct output without cropping. Example : 1920x1080 vs 1920x1088
+        if (bDisplayRectChange) {
+            m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+            m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+            m_nChromaHeight = (int)ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat));
+            m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+        }
+
+        // no need for reconfigureDecoder(). Just return
+        return 1;
+    }
+
+    CUVIDRECONFIGUREDECODERINFO reconfigParams = {0};
+
+    reconfigParams.ulWidth = m_videoFormat.coded_width = pVideoFormat->coded_width;    // also updates m_videoFormat
+    reconfigParams.ulHeight = m_videoFormat.coded_height = pVideoFormat->coded_height; // also updates m_videoFormat
+
+    // Don't change display rect and get scaled output from decoder. This helps a display app present frames smoothly
+    reconfigParams.display_area.bottom = m_displayRect.b;
+    reconfigParams.display_area.top = m_displayRect.t;
+    reconfigParams.display_area.left = m_displayRect.l;
+    reconfigParams.display_area.right = m_displayRect.r;
+    reconfigParams.ulTargetWidth = m_nSurfaceWidth;
+    reconfigParams.ulTargetHeight = m_nSurfaceHeight;
+
+    // If external reconfigure is called along with a resolution change, do a full update of the reconfigure
+    // params even if the post-processing params (crop/resize) have not changed
+    if ((m_bReconfigExternal && bDecodeResChange) || m_bReconfigExtPPChange) {
+        // update display rect and target resolution if requested explicitly
+        m_bReconfigExternal = false;
+        m_bReconfigExtPPChange = false;
+        m_videoFormat = *pVideoFormat;
+        if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) {
+            m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+            m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+            reconfigParams.ulTargetWidth = pVideoFormat->coded_width;
+            reconfigParams.ulTargetHeight = pVideoFormat->coded_height;
+        } else {
+            if (m_resizeDim.w && m_resizeDim.h) {
+                reconfigParams.display_area.left = pVideoFormat->display_area.left;
+                reconfigParams.display_area.top = pVideoFormat->display_area.top;
+                reconfigParams.display_area.right = pVideoFormat->display_area.right;
+                reconfigParams.display_area.bottom = pVideoFormat->display_area.bottom;
+                m_nWidth = m_resizeDim.w;
+                m_nLumaHeight = m_resizeDim.h;
+            }
+
+            if (m_cropRect.r && m_cropRect.b) { // runs after the resize branch, so a crop overrides its values
+                reconfigParams.display_area.left = m_cropRect.l;
+                reconfigParams.display_area.top = m_cropRect.t;
+                reconfigParams.display_area.right = m_cropRect.r;
+                reconfigParams.display_area.bottom = m_cropRect.b;
+                m_nWidth = m_cropRect.r - m_cropRect.l;
+                m_nLumaHeight = m_cropRect.b - m_cropRect.t;
+            }
+            reconfigParams.ulTargetWidth = m_nWidth;
+            reconfigParams.ulTargetHeight = m_nLumaHeight;
+        }
+
+        m_nChromaHeight = (int)ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat));
+        m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+        m_nSurfaceHeight = reconfigParams.ulTargetHeight; // remembered as the defaults for the next reconfigure
+        m_nSurfaceWidth = reconfigParams.ulTargetWidth;
+        m_displayRect.b = reconfigParams.display_area.bottom;
+        m_displayRect.t = reconfigParams.display_area.top;
+        m_displayRect.l = reconfigParams.display_area.left;
+        m_displayRect.r = reconfigParams.display_area.right;
+    }
+
+    reconfigParams.ulNumDecodeSurfaces = nDecodeSurface;
+
+    START_TIMER
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidReconfigureDecoder(m_hDecoder, &reconfigParams));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    STOP_TIMER("Session Reconfigure Time: ");
+
+    return nDecodeSurface;
+}
+
+// Stores the crop/resize settings for the next reconfigure and releases every cached output frame; returns 1.
+int NvDecoder::setReconfigParams(const Rect *pCropRect, const Dim *pResizeDim) {
+    m_bReconfigExternal = true;
+    m_bReconfigExtPPChange = false;
+    if (pCropRect) {
+        if (!((pCropRect->t == m_cropRect.t) && (pCropRect->l == m_cropRect.l) && (pCropRect->b == m_cropRect.b) &&
+              (pCropRect->r == m_cropRect.r))) {
+            m_bReconfigExtPPChange = true; // crop rectangle differs from the stored one
+            m_cropRect = *pCropRect;
+        }
+    }
+    if (pResizeDim) {
+        if (!((pResizeDim->w == m_resizeDim.w) && (pResizeDim->h == m_resizeDim.h))) {
+            m_bReconfigExtPPChange = true; // resize dimensions differ from the stored ones
+            m_resizeDim = *pResizeDim;
+        }
+    }
+
+    // Free every cached output buffer; HandlePictureDisplay() allocates new ones when the pool runs short
+    while (!m_vpFrame.empty()) {
+        uint8_t *pFrame = m_vpFrame.back();
+        m_vpFrame.pop_back();
+        if (m_bUseDeviceFrame) {
+            CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+            CUDA_DRVAPI_CALL(cuMemFree((CUdeviceptr)pFrame));
+            CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+        } else {
+            delete[] pFrame; // host frames are allocated with new uint8_t[], so array delete is required
+        }
+    }
+
+    return 1;
+}
+
+/* Submits one picture to the decoder via cuvidDecodePicture(). Return value from HandlePictureDecode() are
+ * interpreted as:  0: fail, >=1: succeeded. This implementation returns 1 or throws NVDECException.
+ */
+int NvDecoder::HandlePictureDecode(CUVIDPICPARAMS *pPicParams) {
+    if (!m_hDecoder) {
+        NVDEC_THROW_ERROR("Decoder not initialized.", CUDA_ERROR_NOT_INITIALIZED);
+        return false; // not reached: NVDEC_THROW_ERROR always throws
+    }
+    m_nPicNumInDecodeOrder[pPicParams->CurrPicIdx] = m_nDecodePicCnt++; // decode-order number, printed on errors
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidDecodePicture(m_hDecoder, pPicParams));
+    if (m_bForce_zero_latency && ((!pPicParams->field_pic_flag) || (pPicParams->second_field))) {
+        CUVIDPARSERDISPINFO dispInfo; // built by hand: the parser display callback is disabled in this mode
+        memset(&dispInfo, 0, sizeof(dispInfo));
+        dispInfo.picture_index = pPicParams->CurrPicIdx;
+        dispInfo.progressive_frame = !pPicParams->field_pic_flag;
+        dispInfo.top_field_first = pPicParams->bottom_field_flag ^ 1;
+        HandlePictureDisplay(&dispInfo); // display immediately after a full frame or a second field is submitted
+    }
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    return 1;
+}
+
+/* Maps the decoded picture, copies its planes into an output frame buffer and records its timestamp.
+ * Return value from HandlePictureDisplay() are interpreted as: 0: fail, >=1: succeeded (returns 1 or throws).
+ */
+int NvDecoder::HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo) {
+    CUVIDPROCPARAMS videoProcessingParameters = {};
+    videoProcessingParameters.progressive_frame = pDispInfo->progressive_frame;
+    videoProcessingParameters.second_field = pDispInfo->repeat_first_field + 1;
+    videoProcessingParameters.top_field_first = pDispInfo->top_field_first;
+    videoProcessingParameters.unpaired_field = pDispInfo->repeat_first_field < 0;
+    videoProcessingParameters.output_stream = m_cuvidStream;
+
+    if (m_bExtractSEIMessage) {
+        if (m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData) {
+            // Write SEI Message (H.264/HEVC: only time code and unregistered user data; AV1: every message)
+            uint8_t *seiBuffer = (uint8_t *)(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData);
+            uint32_t seiNumMessages = m_SEIMessagesDisplayOrder[pDispInfo->picture_index].sei_message_count;
+            CUSEIMESSAGE *seiMessagesInfo = m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIMessage;
+            if (m_fpSEI) {
+                for (uint32_t i = 0; i < seiNumMessages; i++) {
+                    if (m_eCodec == cudaVideoCodec_H264 || m_eCodec == cudaVideoCodec_H264_SVC ||
+                        m_eCodec == cudaVideoCodec_H264_MVC || m_eCodec == cudaVideoCodec_HEVC) {
+                        switch (seiMessagesInfo[i].sei_message_type) {
+                        case SEI_TYPE_TIME_CODE:
+                            fwrite(seiBuffer, sizeof(HEVCSEITIMECODE), 1, m_fpSEI);
+                            break;
+                        case SEI_TYPE_USER_DATA_UNREGISTERED:
+                            fwrite(seiBuffer, seiMessagesInfo[i].sei_message_size, 1, m_fpSEI);
+                            break;
+                        }
+                    }
+                    if (m_eCodec == cudaVideoCodec_AV1) {
+                        fwrite(seiBuffer, seiMessagesInfo[i].sei_message_size, 1, m_fpSEI);
+                    }
+                    seiBuffer += seiMessagesInfo[i].sei_message_size;
+                }
+            }
+            free(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData);
+            free(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIMessage);
+            // Clear the slot so a later picture that reuses this index without SEI data never sees freed pointers
+            m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData = NULL;
+            m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIMessage = NULL;
+        }
+    }
+
+    CUdeviceptr dpSrcFrame = 0;
+    unsigned int nSrcPitch = 0;
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(
+        cuvidMapVideoFrame(m_hDecoder, pDispInfo->picture_index, &dpSrcFrame, &nSrcPitch, &videoProcessingParameters));
+
+    CUVIDGETDECODESTATUS DecodeStatus = {};
+    CUresult result = cuvidGetDecodeStatus(m_hDecoder, pDispInfo->picture_index, &DecodeStatus);
+    if (result == CUDA_SUCCESS && (DecodeStatus.decodeStatus == cuvidDecodeStatus_Error ||
+                                   DecodeStatus.decodeStatus == cuvidDecodeStatus_Error_Concealed)) {
+        printf("Decode Error occurred for picture %d\n", m_nPicNumInDecodeOrder[pDispInfo->picture_index]);
+    }
+
+    uint8_t *pDecodedFrame = nullptr;
+    {
+        std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+        if ((unsigned)++m_nDecodedFrame > m_vpFrame.size()) {
+            // Not enough frames in stock
+            m_nFrameAlloc++;
+            uint8_t *pFrame = NULL;
+            if (m_bUseDeviceFrame) {
+                if (m_bDeviceFramePitched) {
+                    CUDA_DRVAPI_CALL(cuMemAllocPitch((CUdeviceptr *)&pFrame, &m_nDeviceFramePitch, GetWidth() * m_nBPP,
+                                                     m_nLumaHeight + (m_nChromaHeight * m_nNumChromaPlanes), 16));
+                } else {
+                    CUDA_DRVAPI_CALL(cuMemAlloc((CUdeviceptr *)&pFrame, GetFrameSize()));
+                }
+            } else {
+                pFrame = new uint8_t[GetFrameSize()];
+            }
+            m_vpFrame.push_back(pFrame);
+        }
+        pDecodedFrame = m_vpFrame[m_nDecodedFrame - 1];
+    }
+
+    // Copy luma plane
+    CUDA_MEMCPY2D m = {0};
+    m.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    m.srcDevice = dpSrcFrame;
+    m.srcPitch = nSrcPitch;
+    m.dstMemoryType = m_bUseDeviceFrame ? CU_MEMORYTYPE_DEVICE : CU_MEMORYTYPE_HOST;
+    m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame);
+    m.dstPitch = m_nDeviceFramePitch ? m_nDeviceFramePitch : GetWidth() * m_nBPP;
+    m.WidthInBytes = GetWidth() * m_nBPP;
+    m.Height = m_nLumaHeight;
+    CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream));
+
+    // Copy chroma plane
+    // NVDEC output has luma height aligned by 2. Adjust chroma offset by aligning height
+    m.srcDevice = (CUdeviceptr)((uint8_t *)dpSrcFrame + m.srcPitch * ((m_nSurfaceHeight + 1) & ~1));
+    m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame + m.dstPitch * m_nLumaHeight);
+    m.Height = m_nChromaHeight;
+    CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream));
+
+    if (m_nNumChromaPlanes == 2) { // second chroma plane; m.Height is still m_nChromaHeight from the copy above
+        m.srcDevice = (CUdeviceptr)((uint8_t *)dpSrcFrame + m.srcPitch * ((m_nSurfaceHeight + 1) & ~1) * 2);
+        m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame + m.dstPitch * m_nLumaHeight * 2);
+        CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream));
+    }
+    CUDA_DRVAPI_CALL(cuStreamSynchronize(m_cuvidStream));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+
+    if ((int)m_vTimestamp.size() < m_nDecodedFrame) {
+        m_vTimestamp.resize(m_vpFrame.size());
+    }
+    m_vTimestamp[m_nDecodedFrame - 1] = pDispInfo->timestamp;
+
+    NVDEC_API_CALL(cuvidUnmapVideoFrame(m_hDecoder, dpSrcFrame));
+    return 1;
+}
+
+int NvDecoder::GetSEIMessage(CUVIDSEIMESSAGEINFO *pSEIMessageInfo) {
+    uint32_t seiNumMessages = pSEIMessageInfo->sei_message_count;
+    CUSEIMESSAGE *seiMessagesInfo = pSEIMessageInfo->pSEIMessage;
+    size_t totalSEIBufferSize = 0; // sum of all sei_message_size values, i.e. bytes to copy from pSEIData
+    if ((pSEIMessageInfo->picIdx < 0) || (pSEIMessageInfo->picIdx >= MAX_FRM_CNT)) {
+        printf("Invalid picture index (%d)\n", pSEIMessageInfo->picIdx);
+        return 0; // picIdx is used below as an index into m_SEIMessagesDisplayOrder
+    }
+    for (uint32_t i = 0; i < seiNumMessages; i++) {
+        totalSEIBufferSize += seiMessagesInfo[i].sei_message_size;
+    }
+    if (!m_pCurrSEIMessage) { // the constructor allocates it only when SEI extraction is enabled
+        printf("Out of Memory, Allocation failed for m_pCurrSEIMessage\n");
+        return 0;
+    }
+    m_pCurrSEIMessage->pSEIData = malloc(totalSEIBufferSize); // private copy of the payload bytes
+    if (!m_pCurrSEIMessage->pSEIData) {
+        printf("Out of Memory, Allocation failed for SEI Buffer\n");
+        return 0;
+    }
+    memcpy(m_pCurrSEIMessage->pSEIData, pSEIMessageInfo->pSEIData, totalSEIBufferSize);
+    m_pCurrSEIMessage->pSEIMessage = (CUSEIMESSAGE *)malloc(sizeof(CUSEIMESSAGE) * seiNumMessages);
+    if (!m_pCurrSEIMessage->pSEIMessage) {
+        free(m_pCurrSEIMessage->pSEIData); // undo the first allocation so nothing leaks on this failure path
+        m_pCurrSEIMessage->pSEIData = NULL;
+        return 0;
+    }
+    memcpy(m_pCurrSEIMessage->pSEIMessage, pSEIMessageInfo->pSEIMessage, sizeof(CUSEIMESSAGE) * seiNumMessages);
+    m_pCurrSEIMessage->sei_message_count = pSEIMessageInfo->sei_message_count;
+    m_SEIMessagesDisplayOrder[pSEIMessageInfo->picIdx] = *m_pCurrSEIMessage; // slot now refers to both copies
+    return 1; // HandlePictureDisplay() later writes out and frees the two buffers stored in the slot
+}
+
+NvDecoder::NvDecoder(CUcontext cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, bool bLowLatency,
+                     bool bDeviceFramePitched, const Rect *pCropRect, const Dim *pResizeDim,
+                     bool extract_user_SEI_Message, int maxWidth, int maxHeight, unsigned int clkRate,
+                     bool force_zero_latency)
+    : m_cuContext(cuContext), m_bUseDeviceFrame(bUseDeviceFrame), m_eCodec(eCodec),
+      m_bDeviceFramePitched(bDeviceFramePitched), m_bExtractSEIMessage(extract_user_SEI_Message), m_nMaxWidth(maxWidth),
+      m_nMaxHeight(maxHeight), m_bForce_zero_latency(force_zero_latency) {
+    if (pCropRect)
+        m_cropRect = *pCropRect;
+    if (pResizeDim)
+        m_resizeDim = *pResizeDim;
+    // Create the context lock, the copy stream and (optionally) the SEI dump state, then the bitstream parser.
+    NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext));
+
+    ck(cuStreamCreate(&m_cuvidStream, CU_STREAM_DEFAULT)); // stream used for the copies in HandlePictureDisplay()
+
+    decoderSessionID = 0;
+
+    if (m_bExtractSEIMessage) {
+        m_fpSEI = fopen("sei_message.txt", "wb"); // may be NULL on failure; the writer checks m_fpSEI before use
+        m_pCurrSEIMessage = new CUVIDSEIMESSAGEINFO;
+        memset(&m_SEIMessagesDisplayOrder, 0, sizeof(m_SEIMessagesDisplayOrder));
+    }
+    CUVIDPARSERPARAMS videoParserParameters = {};
+    videoParserParameters.CodecType = eCodec;
+    videoParserParameters.ulMaxNumDecodeSurfaces = 1;
+    videoParserParameters.ulClockRate = clkRate;
+    videoParserParameters.ulMaxDisplayDelay = bLowLatency ? 0 : 1; // no display delay in low-latency mode
+    videoParserParameters.pUserData = this; // NOTE(review): presumably handed back to the static *Proc callbacks
+    videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc;
+    videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc;
+    videoParserParameters.pfnDisplayPicture = m_bForce_zero_latency ? NULL : HandlePictureDisplayProc;
+    videoParserParameters.pfnGetOperatingPoint = HandleOperatingPointProc;
+    videoParserParameters.pfnGetSEIMsg = m_bExtractSEIMessage ? HandleSEIMessagesProc : NULL;
+    NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &videoParserParameters));
+}
+
+NvDecoder::~NvDecoder() {
+    // Releases what the constructor and the decode callbacks created; return codes below are not checked.
+    START_TIMER
+
+    if (m_pCurrSEIMessage) {
+        delete m_pCurrSEIMessage;
+        m_pCurrSEIMessage = NULL;
+    }
+
+    if (m_fpSEI) {
+        fclose(m_fpSEI);
+        m_fpSEI = NULL;
+    }
+
+    if (m_hParser) {
+        cuvidDestroyVideoParser(m_hParser);
+    }
+    cuCtxPushCurrent(m_cuContext);
+    if (m_hDecoder) {
+        cuvidDestroyDecoder(m_hDecoder);
+    }
+
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+    for (uint8_t *pFrame : m_vpFrame) {
+        if (m_bUseDeviceFrame) {
+            cuMemFree((CUdeviceptr)pFrame);
+        } else {
+            delete[] pFrame;
+        }
+    }
+    if (m_cuvidStream) { // created by cuStreamCreate() in the constructor and otherwise never released
+        cuStreamDestroy(m_cuvidStream);
+    }
+    cuCtxPopCurrent(NULL);
+    cuvidCtxLockDestroy(m_ctxLock);
+
+    STOP_TIMER("Session Deinitialization Time: ");
+    NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime);
+}
+
+int NvDecoder::Decode(const uint8_t *pData, int nSize, int nFlags, int64_t nTimestamp) {
+    // Feeds one chunk of bitstream to the parser and returns how many frames became available.
+    // A NULL or empty buffer is flagged as end of stream.
+    m_nDecodedFrame = 0;
+    m_nDecodedFrameReturned = 0;
+    const bool bEndOfStream = !pData || nSize == 0;
+    CUVIDSOURCEDATAPACKET packet = {0};
+    packet.flags = nFlags | CUVID_PKT_TIMESTAMP | (bEndOfStream ? CUVID_PKT_ENDOFSTREAM : 0);
+    packet.payload_size = nSize;
+    packet.payload = pData;
+    packet.timestamp = nTimestamp;
+    NVDEC_API_CALL(cuvidParseVideoData(m_hParser, &packet));
+
+    return m_nDecodedFrame;
+}
+
+uint8_t *NvDecoder::GetFrame(int64_t *pTimestamp) {
+    // Returns the next not-yet-returned frame of the last Decode() call, or NULL when none is left.
+    if (m_nDecodedFrame <= 0)
+        return NULL;
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+    m_nDecodedFrame--;
+    const auto nIdx = m_nDecodedFrameReturned++;
+    if (pTimestamp)
+        *pTimestamp = m_vTimestamp[nIdx];
+    return m_vpFrame[nIdx];
+}
+
+uint8_t *NvDecoder::GetLockedFrame(int64_t *pTimestamp) {
+    // Removes the first buffered frame from the pool and returns it (NULL if none); UnlockFrame() puts it back.
+    if (m_nDecodedFrame <= 0) {
+        return NULL;
+    }
+
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+    m_nDecodedFrame--;
+
+    // Detach the front frame and its timestamp so the two vectors stay in step
+    uint8_t *pFrame = m_vpFrame.front();
+    m_vpFrame.erase(m_vpFrame.begin());
+    uint64_t timestamp = m_vTimestamp.front();
+    m_vTimestamp.erase(m_vTimestamp.begin());
+
+    if (pTimestamp)
+        *pTimestamp = timestamp;
+
+    return pFrame;
+}
+
+void NvDecoder::UnlockFrame(uint8_t **pFrame) {
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+    // Put the single frame referenced by pFrame back at the end of the pool
+    m_vpFrame.push_back(pFrame[0]);
+
+    // add a dummy (zero) timestamp entry alongside the returned frame
+    m_vTimestamp.push_back(0);
+}
diff --git a/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h
new file mode 100644
index 00000000..886202bf
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h
@@ -0,0 +1,528 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "../../../Interface/nvcuvid.h"
+#include "../Utils/NvCodecUtils.h"
+#include <assert.h>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#define MAX_FRM_CNT 32
+
+typedef enum { SEI_TYPE_TIME_CODE = 136, SEI_TYPE_USER_DATA_UNREGISTERED = 5 } SEI_H264_HEVC_PAYLOAD_TYPE;
+
+/**
+ * @brief Exception carrying a message and the CUresult code of a failed CUDA driver / NVDEC call.
+ */
+class NVDECException : public std::exception {
+  public:
+    NVDECException(const std::string &errorStr, const CUresult errorCode)
+        : m_errorString(errorStr), m_errorCode(errorCode) {}
+
+    virtual ~NVDECException() noexcept {}
+    const char *what() const noexcept override { return m_errorString.c_str(); }
+    CUresult getErrorCode() const { return m_errorCode; }
+    const std::string &getErrorString() const { return m_errorString; }
+    static NVDECException makeNVDECException(const std::string &errorStr, const CUresult errorCode,
+                                             const std::string &functionName, const std::string &fileName, int lineNo);
+
+  private:
+    std::string m_errorString; // message returned by what() and getErrorString()
+    CUresult m_errorCode;      // result code passed to the constructor
+};
+
+inline NVDECException NVDECException::makeNVDECException(const std::string &errorStr, const CUresult errorCode,
+                                                         const std::string &functionName, const std::string &fileName,
+                                                         int lineNo) {
+    // Message layout: "<function> : <error> at <file>:<line>" followed by a line break
+    std::ostringstream message;
+    message << functionName << " : " << errorStr << " at " << fileName << ":" << lineNo << std::endl;
+    return NVDECException(message.str(), errorCode);
+}
+
+#define NVDEC_THROW_ERROR(errorStr, errorCode)                                                                         \
+    do {                                                                                                               \
+        throw NVDECException::makeNVDECException(errorStr, errorCode, __FUNCTION__, __FILE__, __LINE__);               \
+    } while (0)
+
+#define NVDEC_API_CALL(cuvidAPI)                                                                                       \
+    do {                                                                                                               \
+        CUresult errorCode = cuvidAPI;                                                                                 \
+        if (errorCode != CUDA_SUCCESS) {                                                                               \
+            std::ostringstream errorLog;                                                                               \
+            errorLog << #cuvidAPI << " returned error " << errorCode;                                                  \
+            throw NVDECException::makeNVDECException(errorLog.str(), errorCode, __FUNCTION__, __FILE__, __LINE__);     \
+        }                                                                                                              \
+    } while (0)
+
+struct Rect {
+    int l, t, r, b; // left, top, right, bottom; NvDecoder uses r - l as width and b - t as height
+};
+
+struct Dim {
+    int w, h; // width and height; NvDecoder assigns them to its output width and luma height
+};
+
+#define START_TIMER auto start = std::chrono::high_resolution_clock::now();
+
+#define STOP_TIMER(print_message)                                                                                      \
+    int64_t elapsedTime =                                                                                              \
+        std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start)       \
+            .count();                                                                                                  \
+    std::cout << print_message << elapsedTime << " ms " << std::endl;
+
+#define CUDA_DRVAPI_CALL(call)                                                                                         \
+    do {                                                                                                               \
+        CUresult err__ = (call);                                                                                       \
+        if (err__ != CUDA_SUCCESS) {                                                                                   \
+            const char *szErrName = NULL;                                                                              \
+            cuGetErrorName(err__, &szErrName); /* leaves szErrName NULL for unrecognized codes */                     \
+            std::ostringstream errorLog;                                                                               \
+            errorLog << "CUDA driver API error " << (szErrName ? szErrName : "unknown");                               \
+            throw NVDECException::makeNVDECException(errorLog.str(), err__, __FUNCTION__, __FILE__, __LINE__);         \
+        }                                                                                                              \
+    } while (0)
+
+// Maps a cudaVideoCodec value to a printable name; returns "Unknown" when the value is not in the table.
+// Every entry is matched by its eCodec field, so the result does not depend on the order of the table
+// and entries listed after cudaVideoCodec_NumCodecs are found with their own name.
+static const char *GetVideoCodecString(cudaVideoCodec eCodec) {
+    static struct {
+        cudaVideoCodec eCodec;
+        const char *name;
+    } aCodecName[] = {
+        {cudaVideoCodec_MPEG1, "MPEG-1"},
+        {cudaVideoCodec_MPEG2, "MPEG-2"},
+        {cudaVideoCodec_MPEG4, "MPEG-4 (ASP)"},
+        {cudaVideoCodec_VC1, "VC-1/WMV"},
+        {cudaVideoCodec_H264, "AVC/H.264"},
+        {cudaVideoCodec_JPEG, "M-JPEG"},
+        {cudaVideoCodec_H264_SVC, "H.264/SVC"},
+        {cudaVideoCodec_H264_MVC, "H.264/MVC"},
+        {cudaVideoCodec_HEVC, "H.265/HEVC"},
+        {cudaVideoCodec_VP8, "VP8"},
+        {cudaVideoCodec_VP9, "VP9"},
+        {cudaVideoCodec_AV1, "AV1"},
+        {cudaVideoCodec_NumCodecs, "Invalid"},
+        {cudaVideoCodec_YUV420, "YUV  4:2:0"},
+        {cudaVideoCodec_YV12, "YV12 4:2:0"},
+        {cudaVideoCodec_NV12, "NV12 4:2:0"},
+        {cudaVideoCodec_YUYV, "YUYV 4:2:2"},
+        {cudaVideoCodec_UYVY, "UYVY 4:2:2"},
+    };
+
+    for (size_t i = 0; i < sizeof(aCodecName) / sizeof(aCodecName[0]); i++) {
+        if (eCodec == aCodecName[i].eCodec) {
+            return aCodecName[i].name;
+        }
+    }
+    return "Unknown";
+}
+
+static const char *GetVideoChromaFormatString(cudaVideoChromaFormat eChromaFormat) {
+    // Printable names, indexed directly by the cudaVideoChromaFormat value.
+    static const char *const aName[] = {
+        "YUV 400 (Monochrome)", // cudaVideoChromaFormat_Monochrome
+        "YUV 420",              // cudaVideoChromaFormat_420
+        "YUV 422",              // cudaVideoChromaFormat_422
+        "YUV 444",              // cudaVideoChromaFormat_444
+    };
+    const size_t nNames = sizeof(aName) / sizeof(aName[0]);
+
+    // Values outside the table (negative or too large) have no name.
+    if (eChromaFormat < 0 || eChromaFormat >= nNames) {
+        return "Unknown";
+    }
+    return aName[eChromaFormat];
+}
+
+static float GetChromaHeightFactor(cudaVideoSurfaceFormat eSurfaceFormat) {
+    // Factor by which the luma height is multiplied to obtain the chroma height
+    // for the given output surface format.
+    switch (eSurfaceFormat) {
+    case cudaVideoSurfaceFormat_YUV444:
+    case cudaVideoSurfaceFormat_YUV444_16Bit:
+        // both YUV444 variants: chroma is as tall as luma
+        return 1.0f;
+    case cudaVideoSurfaceFormat_NV12:
+    case cudaVideoSurfaceFormat_P016:
+    default:
+        // NV12, P016 and any format not listed above: chroma is half as tall as luma
+        return 0.5f;
+    }
+}
+
+static int GetChromaPlaneCount(cudaVideoSurfaceFormat eSurfaceFormat) {
+    // Number of chroma planes reported for the given output surface format;
+    // callers multiply it with the chroma height when sizing a frame.
+    switch (eSurfaceFormat) {
+    case cudaVideoSurfaceFormat_YUV444:
+    case cudaVideoSurfaceFormat_YUV444_16Bit:
+        // both YUV444 variants report two chroma planes
+        return 2;
+    case cudaVideoSurfaceFormat_NV12:
+    case cudaVideoSurfaceFormat_P016:
+    default:
+        // NV12, P016 and any format not listed above report a single chroma plane
+        return 1;
+    }
+}
+
+/**
+ * @brief Base class for decoder interface.
+ */
+class NvDecoder {
+
+  public:
+    NvDecoder() {}
+    /**
+     *  @brief This function is used to initialize the decoder session.
+     *  Application must call this function to initialize the decoder, before
+     *  starting to decode any frames.
+     */
+    NvDecoder(CUcontext cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, bool bLowLatency = false,
+              bool bDeviceFramePitched = false, const Rect *pCropRect = NULL, const Dim *pResizeDim = NULL,
+              bool extract_user_SEI_Message = false, int maxWidth = 0, int maxHeight = 0, unsigned int clkRate = 1000,
+              bool force_zero_latency = false);
+    ~NvDecoder();
+
+    /**
+     *  @brief  This function is used to get the current CUDA context.
+     */
+    CUcontext GetContext() { return m_cuContext; }
+
+    /**
+     *  @brief  This function is used to get the output frame width.
+     *  NV12/P016 output format width is 2 byte aligned because of U and V interleave
+     */
+    int GetWidth() {
+        assert(m_nWidth);
+        return (m_eOutputFormat == cudaVideoSurfaceFormat_NV12 || m_eOutputFormat == cudaVideoSurfaceFormat_P016)
+                   ? (m_nWidth + 1) & ~1
+                   : m_nWidth;
+    }
+
+    /**
+     *  @brief  This function is used to get the actual decode width
+     */
+    int GetDecodeWidth() {
+        assert(m_nWidth);
+        return m_nWidth;
+    }
+
+    /**
+     *  @brief  This function is used to get the output frame height (Luma height).
+     */
+    int GetHeight() {
+        assert(m_nLumaHeight);
+        return m_nLumaHeight;
+    }
+
+    /**
+     *  @brief  This function is used to get the current chroma height.
+     */
+    int GetChromaHeight() {
+        assert(m_nChromaHeight);
+        return m_nChromaHeight;
+    }
+
+    /**
+     *  @brief  This function is used to get the number of chroma planes.
+     */
+    int GetNumChromaPlanes() {
+        assert(m_nNumChromaPlanes);
+        return m_nNumChromaPlanes;
+    }
+
+    /**
+     *   @brief  This function is used to get the current frame size based on pixel format.
+     */
+    int GetFrameSize() {
+        assert(m_nWidth);
+        return GetWidth() * (m_nLumaHeight + (m_nChromaHeight * m_nNumChromaPlanes)) * m_nBPP;
+    }
+
+    /**
+     *   @brief  This function is used to get the current frame Luma plane size.
+     */
+    int GetLumaPlaneSize() {
+        assert(m_nWidth);
+        return GetWidth() * m_nLumaHeight * m_nBPP;
+    }
+
+    /**
+     *   @brief  This function is used to get the current frame chroma plane size.
+     */
+    int GetChromaPlaneSize() {
+        assert(m_nWidth);
+        return GetWidth() * (m_nChromaHeight * m_nNumChromaPlanes) * m_nBPP;
+    }
+
+    /** @brief  Returns m_nDeviceFramePitch when it is non-zero, otherwise GetWidth() x bytes per pixel. */
+    int GetDeviceFramePitch() {
+        assert(m_nWidth != 0);
+        if (m_nDeviceFramePitch != 0)
+            return static_cast<int>(m_nDeviceFramePitch);
+        return GetWidth() * m_nBPP;
+    }
+
+    /**
+     *   @brief  Returns the bit depth associated with the pixel format, computed as m_nBitDepthMinus8 + 8.
+     */
+    int GetBitDepth() {
+        assert(m_nWidth != 0);
+        return 8 + m_nBitDepthMinus8;
+    }
+
+    /**
+     *   @brief  Returns the number of bytes used per pixel (m_nBPP); asserts that the width is already known.
+     */
+    int GetBPP() {
+        assert(m_nWidth != 0);
+        return m_nBPP;
+    }
+
+    /** @brief  Returns the output surface format (m_eOutputFormat) of the decoded frames. */
+    cudaVideoSurfaceFormat GetOutputFormat() {
+        return m_eOutputFormat;
+    }
+
+    /**
+     *   @brief  Returns a copy of the stored CUVIDEOFORMAT (codec, display parameters etc. of the video stream).
+     */
+    CUVIDEOFORMAT GetVideoFormatInfo() {
+        assert(m_nWidth != 0);
+        return m_videoFormat;
+    }
+
+    /**
+     *   @brief  This function is used to get codec string from codec id
+     */
+    const char *GetCodecString(cudaVideoCodec eCodec);
+
+    /** @brief  Returns the text accumulated in m_videoInfo about the video stream (nothing is printed here). */
+    std::string GetVideoInfo() const {
+        return m_videoInfo.str();
+    }
+
+    /**
+     *   @brief  This function decodes a frame and returns the number of frames that are available for
+     *   display. All frames that are available for display should be read before making a subsequent decode call.
+     *   @param  pData - pointer to the data buffer that is to be decoded
+     *   @param  nSize - size of the data buffer in bytes
+     *   @param  nFlags - CUvideopacketflags for setting decode options
+     *   @param  nTimestamp - presentation timestamp
+     */
+    int Decode(const uint8_t *pData, int nSize, int nFlags = 0, int64_t nTimestamp = 0);
+
+    /**
+     *   @brief  This function returns a decoded frame and timestamp. This function should be called in a loop for
+     *   fetching all the frames that are available for display.
+     */
+    uint8_t *GetFrame(int64_t *pTimestamp = nullptr);
+
+    /**
+     *   @brief  This function decodes a frame and returns the locked frame buffers
+     *   This makes the buffers available for use by the application without the buffers
+     *   getting overwritten, even if subsequent decode calls are made. The frame buffers
+     *   remain locked, until UnlockFrame() is called
+     */
+    uint8_t *GetLockedFrame(int64_t *pTimestamp = nullptr);
+
+    /**
+     *   @brief  This function unlocks the frame buffer and makes the frame buffers available for write again
+     *   @param  pFrame - pointer to the frame pointer(s) that are to be unlocked; no frame count is passed
+     *   NOTE(review): confirm against the definition whether exactly one frame is unlocked per call
+     */
+    void UnlockFrame(uint8_t **pFrame);
+
+    /**
+     *   @brief  This function allows app to set decoder reconfig params
+     *   @param  pCropRect - cropping rectangle coordinates
+     *   @param  pResizeDim - width and height of resized output
+     */
+    int setReconfigParams(const Rect *pCropRect, const Dim *pResizeDim);
+
+    /**
+     *   @brief  Stores the AV1 SVC operating point selection in m_nOperatingPoint and m_bDispAllLayers
+     *   @param  opPoint - operating point of an AV1 scalable bitstream
+     *   @param  bDispAllLayers - Output all decoded frames of an AV1 scalable bitstream
+     */
+    void SetOperatingPoint(const uint32_t opPoint, const bool bDispAllLayers) {
+        m_bDispAllLayers = bDispAllLayers;
+        m_nOperatingPoint = opPoint;
+    }
+
+    // Starts the m_stDecode_time stopwatch.
+    void startTimer() { m_stDecode_time.Start(); }
+
+    // Stops the m_stDecode_time stopwatch and returns whatever its Stop() call reports.
+    double stopTimer() { return m_stDecode_time.Stop(); }
+
+    void setDecoderSessionID(int sessionID) { decoderSessionID = sessionID; } // stores the session identifier
+    int getDecoderSessionID() { return decoderSessionID; } // returns the identifier set by setDecoderSessionID()
+
+    // Session overhead = decoder init + deinit time. Reading an unknown id default-inserts it and returns 0.
+    static void addDecoderSessionOverHead(int sessionID, int64_t duration) { sessionOverHead[sessionID] += duration; }
+    static int64_t getDecoderSessionOverHead(int sessionID) { return sessionOverHead[sessionID]; }
+
+  protected:
+    int decoderSessionID;                          // Decoder session identifier. Used to gather session level stats.
+    static std::map<int, int64_t> sessionOverHead; // Records session overhead of initialization+deinitialization time.
+                                                   // Format is (session id, duration)
+
+    /** @brief  Callback function to be registered for getting a callback when decoding of sequence starts;
+     *   pUserData carries the NvDecoder instance and the call is forwarded to its HandleVideoSequence(). */
+    static int CUDAAPI HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat) {
+        NvDecoder *const pDecoder = static_cast<NvDecoder *>(pUserData);
+        return pDecoder->HandleVideoSequence(pVideoFormat);
+    }
+
+    /** @brief  Callback function to be registered for getting a callback when a picture is ready to be decoded;
+     *   pUserData carries the NvDecoder instance and the call is forwarded to its HandlePictureDecode(). */
+    static int CUDAAPI HandlePictureDecodeProc(void *pUserData, CUVIDPICPARAMS *pPicParams) {
+        NvDecoder *const pDecoder = static_cast<NvDecoder *>(pUserData);
+        return pDecoder->HandlePictureDecode(pPicParams);
+    }
+
+    /** @brief  Callback function to be registered for getting a callback when a decoded frame is available for
+     *   display; pUserData carries the NvDecoder instance and the call goes to its HandlePictureDisplay(). */
+    static int CUDAAPI HandlePictureDisplayProc(void *pUserData, CUVIDPARSERDISPINFO *pDispInfo) {
+        NvDecoder *const pDecoder = static_cast<NvDecoder *>(pUserData);
+        return pDecoder->HandlePictureDisplay(pDispInfo);
+    }
+
+    /**
+     *   @brief  Callback function to be registered for getting a callback to get the operating point when an AV1
+     * SVC sequence header starts; the call is forwarded to GetOperatingPoint() of the NvDecoder in pUserData. */
+    static int CUDAAPI HandleOperatingPointProc(void *pUserData, CUVIDOPERATINGPOINTINFO *pOPInfo) {
+        NvDecoder *const pDecoder = static_cast<NvDecoder *>(pUserData);
+        return pDecoder->GetOperatingPoint(pOPInfo);
+    }
+
+    /**
+     *   @brief  Callback function to be registered for getting a callback when all the unregistered user SEI
+     * messages are parsed for a frame; the call is forwarded to GetSEIMessage() of the NvDecoder in pUserData. */
+    static int CUDAAPI HandleSEIMessagesProc(void *pUserData, CUVIDSEIMESSAGEINFO *pSEIMessageInfo) {
+        NvDecoder *const pDecoder = static_cast<NvDecoder *>(pUserData);
+        return pDecoder->GetSEIMessage(pSEIMessageInfo);
+    }
+
+    /**
+    *   @brief  This function gets called when a sequence is ready to be decoded. The function also gets called
+        when there is format change
+    */
+    int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat);
+
+    /**
+     *   @brief  This function gets called when a picture is ready to be decoded. cuvidDecodePicture is called from this
+     * function to decode the picture
+     */
+    int HandlePictureDecode(CUVIDPICPARAMS *pPicParams);
+
+    /**
+    *   @brief  This function gets called after a picture is decoded and available for display. Frames are fetched and
+    stored in internal buffer
+    */
+    int HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo);
+
+    /**
+     *   @brief  This function gets called when AV1 sequence encounter more than one operating points
+     */
+    int GetOperatingPoint(CUVIDOPERATINGPOINTINFO *pOPInfo);
+
+    /**
+     *   @brief  This function gets called when all unregistered user SEI messages are parsed for a frame
+     */
+    int GetSEIMessage(CUVIDSEIMESSAGEINFO *pSEIMessageInfo);
+
+    /**
+     *   @brief  This function reconfigure decoder if there is a change in sequence params.
+     */
+    int ReconfigureDecoder(CUVIDEOFORMAT *pVideoFormat);
+
+  public:
+    CUcontext m_cuContext = NULL;
+    CUvideoctxlock m_ctxLock; // NOTE(review): no default initializer; presumably set in the constructor - confirm
+    CUvideoparser m_hParser = NULL;
+    CUvideodecoder m_hDecoder = NULL;
+    bool m_bUseDeviceFrame; // NOTE(review): no default initializer; presumably set in the constructor - confirm
+    // dimension of the output (all zero until filled in; the getters above assert on that)
+    unsigned int m_nWidth = 0, m_nLumaHeight = 0, m_nChromaHeight = 0;
+    unsigned int m_nNumChromaPlanes = 0;
+    // height of the mapped surface
+    int m_nSurfaceHeight = 0;
+    int m_nSurfaceWidth = 0;
+    cudaVideoCodec m_eCodec = cudaVideoCodec_NumCodecs;
+    cudaVideoChromaFormat m_eChromaFormat = cudaVideoChromaFormat_420;
+    cudaVideoSurfaceFormat m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+    int m_nBitDepthMinus8 = 0; // GetBitDepth() adds 8 to this value
+    int m_nBPP = 1;            // bytes per pixel, see GetBPP()
+    CUVIDEOFORMAT m_videoFormat = {};
+    Rect m_displayRect = {};
+    // stock of frames
+    std::vector<uint8_t *> m_vpFrame;
+    // timestamps of decoded frames
+    std::vector<int64_t> m_vTimestamp;
+    int m_nDecodedFrame = 0, m_nDecodedFrameReturned = 0;
+    int m_nDecodePicCnt = 0, m_nPicNumInDecodeOrder[MAX_FRM_CNT]; // NOTE(review): the array has no initializer here
+    CUVIDSEIMESSAGEINFO *m_pCurrSEIMessage = NULL;
+    CUVIDSEIMESSAGEINFO m_SEIMessagesDisplayOrder[MAX_FRM_CNT];
+    FILE *m_fpSEI = NULL;
+    bool m_bEndDecodeDone = false;
+    std::mutex m_mtxVPFrame; // presumably guards m_vpFrame - TODO confirm against the implementation file
+    int m_nFrameAlloc = 0;
+    CUstream m_cuvidStream = 0;
+    bool m_bDeviceFramePitched = false;
+    size_t m_nDeviceFramePitch = 0; // 0 makes GetDeviceFramePitch() fall back to GetWidth() * m_nBPP
+    Rect m_cropRect = {};
+    Dim m_resizeDim = {};
+
+    std::ostringstream m_videoInfo; // text returned by GetVideoInfo()
+    unsigned int m_nMaxWidth = 0, m_nMaxHeight = 0;
+    bool m_bReconfigExternal = false;
+    bool m_bReconfigExtPPChange = false;
+    StopWatch m_stDecode_time; // driven by startTimer() / stopTimer()
+
+    unsigned int m_nOperatingPoint = 0; // written by SetOperatingPoint()
+    bool m_bDispAllLayers = false;      // written by SetOperatingPoint()
+    // In H.264, there is an inherent display latency for video contents
+    // which do not have num_reorder_frames=0 in the VUI. This applies to
+    // All-Intra and IPPP sequences as well. If the user wants zero display
+    // latency for All-Intra and IPPP sequences, the below flag will enable
+    // the display callback immediately after the decode callback.
+    bool m_bForce_zero_latency = false;
+    bool m_bExtractSEIMessage = false;
+};
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h
new file mode 100644
index 00000000..bd1881db
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h
@@ -0,0 +1,379 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavformat/avformat.h>
+#include <libavformat/avio.h>
+/* Explicitly include bsf.h when building against FFmpeg 4.3 (libavcodec 58.45.100) or later for backward compatibility
+ */
+#if LIBAVCODEC_VERSION_INT >= 3824484
+#include <libavcodec/bsf.h>
+#endif
+}
+#include "NvCodecUtils.h"
+#include "nvcuvid.h"
+
+//---------------------------------------------------------------------------
+//! \file FFmpegDemuxer.h
+//! \brief Provides functionality for stream demuxing
+//!
+//! This header file is used by Decode/Transcode apps to demux input video clips before decoding frames from it.
+//---------------------------------------------------------------------------
+
+/**
+ * @brief libavformat wrapper class. Retrieves the elementary encoded stream from the container format.
+ */
+class FFmpegDemuxer {
+  private:
+    AVFormatContext *fmtc = NULL;
+    AVIOContext *avioc = NULL;
+    AVPacket *pkt = NULL; /*!< AVPacket stores compressed data typically exported by demuxers and then passed as input
+                             to decoders */
+    AVPacket *pktFiltered = NULL;
+    AVBSFContext *bsfc = NULL;
+
+    // Stream properties carry defaults so the getters stay well-defined when construction bails out early.
+    int iVideoStream = -1;
+    bool bMp4H264 = false, bMp4HEVC = false, bMp4MPEG4 = false;
+    AVCodecID eVideoCodec = AV_CODEC_ID_NONE;
+    AVPixelFormat eChromaFormat = AV_PIX_FMT_NONE;
+    int nWidth = 0, nHeight = 0, nBitDepth = 0, nBPP = 0, nChromaHeight = 0;
+    double timeBase = 0.0;
+    int64_t userTimeScale = 0;
+
+    uint8_t *pDataWithHeader = NULL; // first MPEG4 packet with the stream extradata prepended, see Demux()
+    unsigned int frameCount = 0;
+
+  public:
+    class DataProvider {
+      public:
+        virtual ~DataProvider() {}
+        virtual int GetData(uint8_t *pBuf, int nBuf) = 0;
+    };
+
+  private:
+    /**
+     *   @brief  Private constructor to initialize libavformat resources.
+     *   @param  fmtc - Pointer to AVFormatContext allocated inside avformat_open_input()
+     *   @param  timeScale - Rate (Hz) that Demux() scales the reported timestamps to
+     */
+    FFmpegDemuxer(AVFormatContext *fmtc, int64_t timeScale = 1000 /*Hz*/) : fmtc(fmtc) {
+        if (!fmtc) {
+            LOG(ERROR) << "No AVFormatContext provided.";
+            return;
+        }
+
+        // Allocate the AVPackets and initialize to default values
+        pkt = av_packet_alloc();
+        pktFiltered = av_packet_alloc();
+        if (!pkt || !pktFiltered) {
+            LOG(ERROR) << "AVPacket allocation failed";
+            return;
+        }
+
+        LOG(INFO) << "Media format: " << fmtc->iformat->long_name << " (" << fmtc->iformat->name << ")";
+
+        ck(avformat_find_stream_info(fmtc, NULL));
+        iVideoStream = av_find_best_stream(fmtc, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
+        if (iVideoStream < 0) {
+            LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " "
+                       << "Could not find stream in input file";
+            av_packet_free(&pkt);
+            av_packet_free(&pktFiltered);
+            return;
+        }
+
+        // fmtc->streams[iVideoStream]->need_parsing = AVSTREAM_PARSE_NONE;
+        eVideoCodec = fmtc->streams[iVideoStream]->codecpar->codec_id;
+        nWidth = fmtc->streams[iVideoStream]->codecpar->width;
+        nHeight = fmtc->streams[iVideoStream]->codecpar->height;
+        eChromaFormat = (AVPixelFormat)fmtc->streams[iVideoStream]->codecpar->format;
+        AVRational rTimeBase = fmtc->streams[iVideoStream]->time_base;
+        timeBase = av_q2d(rTimeBase);
+        userTimeScale = timeScale;
+
+        // Set bit depth, chroma height, bits per pixel based on eChromaFormat of input
+        switch (eChromaFormat) {
+        case AV_PIX_FMT_YUV420P10LE:
+        case AV_PIX_FMT_GRAY10LE: // monochrome is treated as 420 with chroma filled with 0x0
+            nBitDepth = 10;
+            nChromaHeight = (nHeight + 1) >> 1;
+            nBPP = 2;
+            break;
+        case AV_PIX_FMT_YUV420P12LE:
+            nBitDepth = 12;
+            nChromaHeight = (nHeight + 1) >> 1;
+            nBPP = 2;
+            break;
+        case AV_PIX_FMT_YUV444P10LE:
+            nBitDepth = 10;
+            nChromaHeight = nHeight << 1;
+            nBPP = 2;
+            break;
+        case AV_PIX_FMT_YUV444P12LE:
+            nBitDepth = 12;
+            nChromaHeight = nHeight << 1;
+            nBPP = 2;
+            break;
+        case AV_PIX_FMT_YUV444P:
+            nBitDepth = 8;
+            nChromaHeight = nHeight << 1;
+            nBPP = 1;
+            break;
+        case AV_PIX_FMT_YUV420P:
+        case AV_PIX_FMT_YUVJ420P:
+        case AV_PIX_FMT_YUVJ422P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420
+        case AV_PIX_FMT_YUVJ444P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420
+        case AV_PIX_FMT_GRAY8:    // monochrome is treated as 420 with chroma filled with 0x0
+            nBitDepth = 8;
+            nChromaHeight = (nHeight + 1) >> 1;
+            nBPP = 1;
+            break;
+        default:
+            LOG(WARNING) << "ChromaFormat not recognized. Assuming 420";
+            eChromaFormat = AV_PIX_FMT_YUV420P;
+            nBitDepth = 8;
+            nChromaHeight = (nHeight + 1) >> 1;
+            nBPP = 1;
+        }
+
+        // The three containers below are the ones whose packets get filtered / get a header prepended in Demux().
+        const char *szFormatName = fmtc->iformat->long_name;
+        const bool bMp4LikeContainer = !strcmp(szFormatName, "QuickTime / MOV") ||
+                                       !strcmp(szFormatName, "FLV (Flash Video)") ||
+                                       !strcmp(szFormatName, "Matroska / WebM");
+        bMp4H264 = bMp4LikeContainer && eVideoCodec == AV_CODEC_ID_H264;
+        bMp4HEVC = bMp4LikeContainer && eVideoCodec == AV_CODEC_ID_HEVC;
+        bMp4MPEG4 = bMp4LikeContainer && eVideoCodec == AV_CODEC_ID_MPEG4;
+
+        // Initialize bitstream filter and its required resources
+        if (bMp4H264 || bMp4HEVC) {
+            const AVBitStreamFilter *bsf = av_bsf_get_by_name(bMp4H264 ? "h264_mp4toannexb" : "hevc_mp4toannexb");
+            if (!bsf) {
+                LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " "
+                           << "av_bsf_get_by_name() failed";
+                av_packet_free(&pkt);
+                av_packet_free(&pktFiltered);
+                return;
+            }
+            ck(av_bsf_alloc(bsf, &bsfc));
+            if (!bsfc) {
+                // Without a filter context Demux() cannot convert packets; freeing the packets makes it return false.
+                LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " "
+                           << "av_bsf_alloc() failed";
+                av_packet_free(&pkt);
+                av_packet_free(&pktFiltered);
+                return;
+            }
+            ck(avcodec_parameters_copy(bsfc->par_in, fmtc->streams[iVideoStream]->codecpar));
+            ck(av_bsf_init(bsfc));
+        }
+    }
+
+    /**
+     *   @brief  Allocate an AVFormatContext that reads through a custom AVIOContext fed by pDataProvider.
+     *   @return Pointer to the opened AVFormatContext, or NULL on failure (partially created resources are freed)
+     */
+    AVFormatContext *CreateFormatContext(DataProvider *pDataProvider) {
+        AVFormatContext *ctx = NULL;
+        if (!(ctx = avformat_alloc_context())) {
+            LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
+            return NULL;
+        }
+
+        uint8_t *avioc_buffer = NULL;
+        int avioc_buffer_size = 8 * 1024 * 1024;
+        avioc_buffer = (uint8_t *)av_malloc(avioc_buffer_size);
+        if (!avioc_buffer) {
+            LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
+            avformat_free_context(ctx);
+            return NULL;
+        }
+        avioc = avio_alloc_context(avioc_buffer, avioc_buffer_size, 0, pDataProvider, &ReadPacket, NULL, NULL);
+        if (!avioc) {
+            LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
+            av_free(avioc_buffer);
+            avformat_free_context(ctx);
+            return NULL;
+        }
+        ctx->pb = avioc;
+
+        ck(avformat_open_input(&ctx, NULL, NULL, NULL));
+        if (!ctx) {
+            // avformat_open_input() frees a caller-supplied context on failure, but not the custom I/O context.
+            av_freep(&avioc->buffer);
+            av_freep(&avioc);
+        }
+        return ctx;
+    }
+
+    /**
+     *   @brief  Allocate and return AVFormatContext*.
+     *   @param  szFilePath - Filepath pointing to input stream.
+     *   @return Pointer to AVFormatContext
+     */
+    AVFormatContext *CreateFormatContext(const char *szFilePath) {
+        avformat_network_init();
+
+        AVFormatContext *ctx = NULL;
+        ck(avformat_open_input(&ctx, szFilePath, NULL, NULL));
+        return ctx;
+    }
+
+  public:
+    FFmpegDemuxer(const char *szFilePath, int64_t timescale = 1000 /*Hz*/)
+        : FFmpegDemuxer(CreateFormatContext(szFilePath), timescale) {}
+    FFmpegDemuxer(DataProvider *pDataProvider) : FFmpegDemuxer(CreateFormatContext(pDataProvider)) {
+        // The delegated constructor re-initialises avioc to NULL; restore it only if a context was really opened.
+        if (fmtc) {
+            avioc = fmtc->pb;
+        }
+    }
+    ~FFmpegDemuxer() {
+        if (!fmtc) {
+            return;
+        }
+
+        // The FFmpeg free helpers below are no-ops on NULL, so the pointers need no individual guards.
+        av_packet_free(&pkt);
+        av_packet_free(&pktFiltered);
+        av_bsf_free(&bsfc);
+        avformat_close_input(&fmtc);
+
+        if (avioc) {
+            av_freep(&avioc->buffer);
+            av_freep(&avioc);
+        }
+        av_free(pDataWithHeader);
+    }
+    AVCodecID GetVideoCodec() { return eVideoCodec; }
+    AVPixelFormat GetChromaFormat() { return eChromaFormat; }
+    int GetWidth() { return nWidth; }
+    int GetHeight() { return nHeight; }
+    int GetBitDepth() { return nBitDepth; }
+    int GetFrameSize() { return nWidth * (nHeight + nChromaHeight) * nBPP; }
+    /**
+     *   @brief  Read the next packet of the selected video stream.
+     *   @param  ppVideo - receives a pointer to the packet bytes (owned by the demuxer, not to be freed)
+     *   @param  pnVideoBytes - receives the packet size in bytes
+     *   @param  pts - optional; receives the packet timestamp scaled to the user time scale
+     *   @return true when a packet was produced, false on end of stream, read error or failed construction
+     */
+    bool Demux(uint8_t **ppVideo, int *pnVideoBytes, int64_t *pts = NULL) {
+        // Construction may have failed after fmtc was opened, in which case the packets are not allocated.
+        if (!fmtc || !pkt || !pktFiltered) {
+            return false;
+        }
+
+        *pnVideoBytes = 0;
+
+        if (pkt->data) {
+            av_packet_unref(pkt);
+        }
+
+        int e = 0;
+        while ((e = av_read_frame(fmtc, pkt)) >= 0 && pkt->stream_index != iVideoStream) {
+            av_packet_unref(pkt);
+        }
+        if (e < 0) {
+            return false;
+        }
+
+        if (bMp4H264 || bMp4HEVC) {
+            if (pktFiltered->data) {
+                av_packet_unref(pktFiltered);
+            }
+            ck(av_bsf_send_packet(bsfc, pkt));
+            ck(av_bsf_receive_packet(bsfc, pktFiltered));
+            *ppVideo = pktFiltered->data;
+            *pnVideoBytes = pktFiltered->size;
+            if (pts)
+                *pts = (int64_t)(pktFiltered->pts * userTimeScale * timeBase);
+        } else {
+            // extradata contains start codes 00 00 01, so the first 3 bytes of the packet are skipped when the
+            // extradata is prepended to the very first MPEG4 packet.
+            const int nStartCode = 3;
+            const int extraDataSize = fmtc->streams[iVideoStream]->codecpar->extradata_size;
+            if (bMp4MPEG4 && (frameCount == 0) && extraDataSize > 0 && pkt->size >= nStartCode) {
+                const int nPayload = pkt->size - nStartCode;
+                pDataWithHeader = (uint8_t *)av_malloc(extraDataSize + nPayload);
+                if (!pDataWithHeader) {
+                    LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
+                    return false;
+                }
+                memcpy(pDataWithHeader, fmtc->streams[iVideoStream]->codecpar->extradata, extraDataSize);
+                memcpy(pDataWithHeader + extraDataSize, pkt->data + nStartCode, nPayload);
+                *ppVideo = pDataWithHeader;
+                *pnVideoBytes = extraDataSize + nPayload;
+            } else {
+                // Nothing to prepend: pass the packet through instead of reporting an empty frame.
+                *ppVideo = pkt->data;
+                *pnVideoBytes = pkt->size;
+            }
+
+            if (pts)
+                *pts = (int64_t)(pkt->pts * userTimeScale * timeBase);
+        }
+
+        frameCount++;
+        return true;
+    }
+
+    static int ReadPacket(void *opaque, uint8_t *pBuf, int nBuf) {
+        return ((DataProvider *)opaque)->GetData(pBuf, nBuf);
+    }
+};
+
+/** @brief  Translate an FFmpeg codec id into the corresponding NVDEC codec id.
+ *   @return cudaVideoCodec_NumCodecs when the id is not one of the codecs listed in the table below. */
+inline cudaVideoCodec FFmpeg2NvCodecId(AVCodecID id) {
+    // Each row pairs an FFmpeg id with its NVDEC id; WMV3 and VC1 both map to cudaVideoCodec_VC1.
+    static const struct {
+        AVCodecID eAvCodec;
+        cudaVideoCodec eNvCodec;
+    } kCodecTable[] = {
+        {AV_CODEC_ID_MPEG1VIDEO, cudaVideoCodec_MPEG1},
+        {AV_CODEC_ID_MPEG2VIDEO, cudaVideoCodec_MPEG2},
+        {AV_CODEC_ID_MPEG4, cudaVideoCodec_MPEG4},
+        {AV_CODEC_ID_WMV3, cudaVideoCodec_VC1},
+        {AV_CODEC_ID_VC1, cudaVideoCodec_VC1},
+        {AV_CODEC_ID_H264, cudaVideoCodec_H264},
+        {AV_CODEC_ID_HEVC, cudaVideoCodec_HEVC},
+        {AV_CODEC_ID_VP8, cudaVideoCodec_VP8},
+        {AV_CODEC_ID_VP9, cudaVideoCodec_VP9},
+        {AV_CODEC_ID_MJPEG, cudaVideoCodec_JPEG},
+        {AV_CODEC_ID_AV1, cudaVideoCodec_AV1},
+    };
+
+    for (const auto &entry : kCodecTable) {
+        if (entry.eAvCodec == id)
+            return entry.eNvCodec;
+    }
+    return cudaVideoCodec_NumCodecs;
+}
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h
new file mode 100644
index 00000000..08e43e60
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h
@@ -0,0 +1,148 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+
+#include <mutex>
+#include <thread>
+extern "C" {
+#include <libavformat/avformat.h>
+#include <libavutil/opt.h>
+#include <libswresample/swresample.h>
+};
+#include "Logger.h"
+
+using namespace std;
+
+extern simplelogger::Logger *logger;
+
+/**
+ * @brief  Convert an FFmpeg error code into a human-readable message.
+ * @param  av_error_code - error value returned by an FFmpeg call
+ * @return The av_strerror() description, or "Unknown error with code <n>" when av_strerror() reports failure.
+ */
+static string AvErrorToString(int av_error_code) {
+    // A fixed stack buffer replaces the calloc/free pair: nothing to leak and no allocation-failure path.
+    char err_string[1024] = {0};
+
+    // Passing size - 1 leaves the final zero byte untouched, so the buffer is always terminated.
+    if (0 != av_strerror(av_error_code, err_string, sizeof(err_string) - 1)) {
+        stringstream ss;
+        ss << "Unknown error with code " << av_error_code;
+        return ss.str();
+    }
+
+    return string(err_string);
+}
+
+class FFmpegStreamer {
+  private:
+    AVFormatContext *oc = NULL;
+    AVStream *vs = NULL;
+    int nFps = 0;
+    bool bHeaderWritten = false; // true once the container header is written; gates Stream() and the trailer
+
+  public:
+    // Opens szInFilePath for writing and emits the container header; failures are logged and leave Stream() inert.
+    FFmpegStreamer(AVCodecID eCodecId, int nWidth, int nHeight, int nFps, const char *szInFilePath) : nFps(nFps) {
+        avformat_network_init();
+        int ret = 0;
+        if ((eCodecId == AV_CODEC_ID_H264) || (eCodecId == AV_CODEC_ID_HEVC))
+            ret = avformat_alloc_output_context2(&oc, NULL, "mpegts", NULL);
+        else if (eCodecId == AV_CODEC_ID_AV1)
+            ret = avformat_alloc_output_context2(&oc, NULL, "ivf", NULL);
+        // oc is also still NULL, with ret == 0, for any codec other than the three handled above.
+        if (ret < 0 || !oc) {
+            LOG(ERROR) << "FFmpeg: failed to allocate an AVFormatContext. Error message: " << AvErrorToString(ret);
+            return;
+        }
+        oc->url = av_strdup(szInFilePath);
+        LOG(INFO) << "Streaming destination: " << oc->url;
+        // Add video stream to oc
+        vs = avformat_new_stream(oc, NULL);
+        if (!vs) {
+            LOG(ERROR) << "FFMPEG: Could not alloc video stream";
+            return;
+        }
+        vs->id = 0;
+
+        // Set video parameters
+        AVCodecParameters *vpar = vs->codecpar;
+        vpar->codec_id = eCodecId;
+        vpar->codec_type = AVMEDIA_TYPE_VIDEO;
+        vpar->width = nWidth;
+        vpar->height = nHeight;
+
+        // Everything is ready. Now open the output stream.
+        if (avio_open(&oc->pb, oc->url, AVIO_FLAG_WRITE) < 0) {
+            LOG(ERROR) << "FFMPEG: Could not open " << oc->url;
+            return;
+        }
+
+        // Write the container header; only negative results are errors (success may be reported as 0 or 1).
+        if (avformat_write_header(oc, NULL) < 0) {
+            LOG(ERROR) << "FFMPEG: avformat_write_header error!";
+            return;
+        }
+        bHeaderWritten = true;
+    }
+    ~FFmpegStreamer() {
+        if (oc) {
+            if (bHeaderWritten) // a trailer may only follow a successfully written header
+                av_write_trailer(oc);
+            avio_close(oc->pb);
+            avformat_free_context(oc);
+        }
+    }
+
+    // Writes one encoded packet; returns false if construction failed, the input is empty or the write fails.
+    bool Stream(uint8_t *pData, int nBytes, int nPts) {
+        if (!bHeaderWritten || !pData || nBytes <= 0)
+            return false;
+        AVPacket *pkt = av_packet_alloc();
+        if (!pkt) {
+            LOG(ERROR) << "AVPacket allocation failed !";
+            return false;
+        }
+        pkt->pts = av_rescale_q(nPts, AVRational{1, nFps}, vs->time_base);
+        // No B-frames
+        pkt->dts = pkt->pts;
+        pkt->stream_index = vs->index;
+        pkt->data = pData;
+        pkt->size = nBytes;
+        if (nBytes >= 5 && !memcmp(pData, "\x00\x00\x00\x01\x67", 5)) { // never read past a short buffer
+            pkt->flags |= AV_PKT_FLAG_KEY;
+        }
+        // Write the compressed frame into the output
+        int ret = av_write_frame(oc, pkt);
+        av_write_frame(oc, NULL);
+        if (ret < 0) {
+            LOG(ERROR) << "FFMPEG: Error while writing video frame";
+        }
+        av_packet_free(&pkt);
+        return ret >= 0;
+    }
+};
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/Logger.h b/third_party/Video_Codec_SDK/Samples/Utils/Logger.h
new file mode 100644
index 00000000..5d2f069c
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/Logger.h
@@ -0,0 +1,235 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <fstream>
+#include <iostream>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <time.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#include <winsock.h>
+
+#pragma comment(lib, "ws2_32.lib")
+#undef ERROR
+#else
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#define SOCKET int
+#define INVALID_SOCKET -1
+#endif
+
+enum LogLevel { TRACE, INFO, WARNING, ERROR, FATAL }; // ascending order; Logger::ShouldLogFor() passes levels >= the logger's threshold
+
+namespace simplelogger {
+class Logger { // Abstract log sink: subclasses supply the stream; this base does level filtering, the line prefix and locking
+  public:
+    Logger(LogLevel level, bool bPrintTimeStamp) : level(level), bPrintTimeStamp(bPrintTimeStamp) {}
+    virtual ~Logger() {}
+    virtual std::ostream &GetStream() = 0;
+    virtual void FlushStream() {}
+    bool ShouldLogFor(LogLevel l) { return l >= level; }
+    // Builds the "[LEVEL] " or "[LEVEL][hh:mm:ss] " prefix in the member buffer szLead and returns it (overwritten by the next call).
+    char *GetLead(LogLevel l, const char *szFile, int nLine, const char *szFunc) {
+        if (l < TRACE || l > FATAL) {
+            snprintf(szLead, sizeof(szLead), "[?????] ");
+            return szLead;
+        }
+        const char *szLevels[] = {"TRACE", "INFO", "WARN", "ERROR", "FATAL"};
+        time_t t = time(NULL);
+        struct tm *ptm = bPrintTimeStamp ? localtime(&t) : NULL; // localtime() may return NULL; then fall back to the plain prefix
+        if (ptm) {
+            snprintf(szLead, sizeof(szLead), "[%-5s][%02d:%02d:%02d] ", szLevels[l], ptm->tm_hour, ptm->tm_min, ptm->tm_sec);
+        } else {
+            snprintf(szLead, sizeof(szLead), "[%-5s] ", szLevels[l]);
+        }
+        return szLead;
+    }
+    void EnterCriticalSection() { mtx.lock(); }
+    void LeaveCriticalSection() { mtx.unlock(); }
+
+  private:
+    LogLevel level;
+    char szLead[80];
+    bool bPrintTimeStamp;
+    std::mutex mtx;
+};
+
+class LoggerFactory { // Static factory; every Create* function returns a logger allocated with new
+  public:
+    static Logger *CreateFileLogger(std::string strFilePath, LogLevel level = INFO, bool bPrintTimeStamp = true) {
+        return new FileLogger(strFilePath, level, bPrintTimeStamp);
+    }
+    static Logger *CreateConsoleLogger(LogLevel level = INFO, bool bPrintTimeStamp = true) {
+        return new ConsoleLogger(level, bPrintTimeStamp);
+    }
+    static Logger *CreateUdpLogger(char *szHost, unsigned uPort, LogLevel level = INFO, bool bPrintTimeStamp = true) {
+        return new UdpLogger(szHost, uPort, level, bPrintTimeStamp);
+    }
+
+  private:
+    LoggerFactory() {}
+    // Logger writing to a file that is opened on construction and closed on destruction.
+    class FileLogger : public Logger {
+      public:
+        FileLogger(std::string strFilePath, LogLevel level, bool bPrintTimeStamp) : Logger(level, bPrintTimeStamp) {
+            pFileOut = new std::ofstream();
+            pFileOut->open(strFilePath.c_str());
+        }
+        ~FileLogger() { pFileOut->close(); delete pFileOut; } // also free the stream object allocated in the constructor
+        std::ostream &GetStream() { return *pFileOut; }
+
+      private:
+        std::ofstream *pFileOut;
+    };
+    // Logger writing to std::cout.
+    class ConsoleLogger : public Logger {
+      public:
+        ConsoleLogger(LogLevel level, bool bPrintTimeStamp) : Logger(level, bPrintTimeStamp) {}
+        std::ostream &GetStream() { return std::cout; }
+    };
+    // Logger that collects each message in a string buffer and sends it as one UDP datagram per FlushStream().
+    class UdpLogger : public Logger {
+      private:
+        class UdpOstream : public std::ostream {
+          public:
+            UdpOstream(char *szHost, unsigned short uPort) : std::ostream(&sb), socket(INVALID_SOCKET) {
+#ifdef _WIN32
+                WSADATA w;
+                if (WSAStartup(0x0101, &w) != 0) {
+                    fprintf(stderr, "WSAStartup() failed.\n");
+                    return;
+                }
+#endif
+                socket = ::socket(AF_INET, SOCK_DGRAM, 0);
+                if (socket == INVALID_SOCKET) {
+#ifdef _WIN32
+                    WSACleanup();
+#endif
+                    fprintf(stderr, "socket() failed.\n");
+                    return;
+                }
+#ifdef _WIN32
+                unsigned int b1, b2, b3, b4;
+                sscanf(szHost, "%u.%u.%u.%u", &b1, &b2, &b3, &b4);
+                struct in_addr addr = {(unsigned char)b1, (unsigned char)b2, (unsigned char)b3, (unsigned char)b4};
+#else
+                struct in_addr addr = {inet_addr(szHost)};
+#endif
+                struct sockaddr_in s = {AF_INET, htons(uPort), addr};
+                server = s;
+            }
+            ~UdpOstream() throw() {
+                if (socket == INVALID_SOCKET) { // nothing was opened, so there is nothing to close
+                    return;
+                }
+#ifdef _WIN32
+                closesocket(socket);
+                WSACleanup();
+#else
+                close(socket);
+#endif
+            }
+            void Flush() { // sends the buffered text (length + 1, i.e. including the terminating NUL) and empties the buffer
+                if (sendto(socket, sb.str().c_str(), (int)sb.str().length() + 1, 0, (struct sockaddr *)&server,
+                           (int)sizeof(sockaddr_in)) == -1) {
+                    fprintf(stderr, "sendto() failed.\n");
+                }
+                sb.str("");
+            }
+
+          private:
+            std::stringbuf sb;
+            SOCKET socket;
+            struct sockaddr_in server;
+        };
+
+      public:
+        UdpLogger(char *szHost, unsigned uPort, LogLevel level, bool bPrintTimeStamp)
+            : Logger(level, bPrintTimeStamp), udpOut(szHost, (unsigned short)uPort) {}
+        UdpOstream &GetStream() { return udpOut; }
+        virtual void FlushStream() { udpOut.Flush(); }
+
+      private:
+        UdpOstream udpOut;
+    };
+};
+
+class LogTransaction { // One log statement: the constructor writes the prefix, the destructor ends the line
+  public:
+    LogTransaction(Logger *pLogger, LogLevel level, const char *szFile, const int nLine, const char *szFunc)
+        : pLogger(pLogger), level(level) {
+        if (!pLogger) { // no logger installed: fall back to std::cout with a placeholder prefix
+            std::cout << "[-----] ";
+            return;
+        }
+        if (!pLogger->ShouldLogFor(level)) { // filtered out: take no lock and write nothing
+            return;
+        }
+        pLogger->EnterCriticalSection(); // released by the destructor via LeaveCriticalSection()
+        pLogger->GetStream() << pLogger->GetLead(level, szFile, nLine, szFunc);
+    }
+    ~LogTransaction() {
+        if (!pLogger) {
+            std::cout << std::endl;
+            return;
+        }
+        if (!pLogger->ShouldLogFor(level)) { // mirrors the constructor: nothing was locked or written
+            return;
+        }
+        pLogger->GetStream() << std::endl;
+        pLogger->FlushStream();
+        pLogger->LeaveCriticalSection();
+        if (level == FATAL) { // FATAL ends the process, but only on this path, i.e. with a logger installed
+            exit(1);
+        }
+    }
+    std::ostream &GetStream() { // stream that receives the message body
+        if (!pLogger) {
+            return std::cout;
+        }
+        if (!pLogger->ShouldLogFor(level)) {
+            return ossNull; // sink for suppressed messages; this class never reads it back
+        }
+        return pLogger->GetStream();
+    }
+
+  private:
+    Logger *pLogger;
+    LogLevel level;
+    std::ostringstream ossNull;
+};
+
+} // namespace simplelogger
+
+extern simplelogger::Logger *logger;
+#define LOG(level) simplelogger::LogTransaction(logger, level, __FILE__, __LINE__, __FUNCTION__).GetStream()
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h b/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h
new file mode 100644
index 00000000..065a7cd9
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h
@@ -0,0 +1,547 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+//---------------------------------------------------------------------------
+//! \file NvCodecUtils.h
+//! \brief Miscellaneous classes and error checking functions.
+//!
+//! Used by Transcode/Encode sample apps for reading input files, multithreading, performance measurement or colorspace
+//! conversion while decoding.
+//---------------------------------------------------------------------------
+
+#pragma once
+#include "Logger.h"
+#include <assert.h>
+#include <chrono>
+#include <condition_variable>
+#include <iomanip>
+#include <ios>
+#include <list>
+#include <sstream>
+#include <stdint.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <thread>
+#include <vector>
+
+extern simplelogger::Logger *logger;
+
+#ifdef __cuda_cuda_h__
+inline bool check(CUresult e, int iLine, const char *szFile) { // true on CUDA_SUCCESS; otherwise logs at FATAL and yields false
+    if (e != CUDA_SUCCESS) {
+        const char *szErrName = NULL;
+        cuGetErrorName(e, &szErrName); // leaves szErrName NULL for codes the driver does not recognize
+        LOG(FATAL) << "CUDA driver API error " << (szErrName ? szErrName : "<unknown>") << " at line " << iLine << " in file " << szFile; // never stream a NULL char pointer
+        return false;
+    }
+    return true;
+}
+#endif
+
+#ifdef __CUDA_RUNTIME_H__
+inline bool check(cudaError_t e, int iLine, const char *szFile) { // true on cudaSuccess; otherwise logs at FATAL and yields false
+    const bool bSucceeded = (e == cudaSuccess);
+    if (!bSucceeded) {
+        LOG(FATAL) << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine << " in file " << szFile;
+    }
+    return bSucceeded;
+}
+#endif
+
+#ifdef _NV_ENCODEAPI_H_
+inline bool check(NVENCSTATUS e, int iLine, const char *szFile) { // true on NV_ENC_SUCCESS; otherwise logs the status name at FATAL
+    static const char *const aszErrName[] = {
+        "NV_ENC_SUCCESS",
+        "NV_ENC_ERR_NO_ENCODE_DEVICE",
+        "NV_ENC_ERR_UNSUPPORTED_DEVICE",
+        "NV_ENC_ERR_INVALID_ENCODERDEVICE",
+        "NV_ENC_ERR_INVALID_DEVICE",
+        "NV_ENC_ERR_DEVICE_NOT_EXIST",
+        "NV_ENC_ERR_INVALID_PTR",
+        "NV_ENC_ERR_INVALID_EVENT",
+        "NV_ENC_ERR_INVALID_PARAM",
+        "NV_ENC_ERR_INVALID_CALL",
+        "NV_ENC_ERR_OUT_OF_MEMORY",
+        "NV_ENC_ERR_ENCODER_NOT_INITIALIZED",
+        "NV_ENC_ERR_UNSUPPORTED_PARAM",
+        "NV_ENC_ERR_LOCK_BUSY",
+        "NV_ENC_ERR_NOT_ENOUGH_BUFFER",
+        "NV_ENC_ERR_INVALID_VERSION",
+        "NV_ENC_ERR_MAP_FAILED",
+        "NV_ENC_ERR_NEED_MORE_INPUT",
+        "NV_ENC_ERR_ENCODER_BUSY",
+        "NV_ENC_ERR_EVENT_NOT_REGISTERED",
+        "NV_ENC_ERR_GENERIC",
+        "NV_ENC_ERR_INCOMPATIBLE_CLIENT_KEY",
+        "NV_ENC_ERR_UNIMPLEMENTED",
+        "NV_ENC_ERR_RESOURCE_REGISTER_FAILED",
+        "NV_ENC_ERR_RESOURCE_NOT_REGISTERED",
+        "NV_ENC_ERR_RESOURCE_NOT_MAPPED",
+    };
+    if (e != NV_ENC_SUCCESS) { // index the name table only when e lies inside it; other values get a placeholder
+        LOG(FATAL) << "NVENC error " << ((size_t)e < sizeof(aszErrName) / sizeof(aszErrName[0]) ? aszErrName[e] : "<unknown NVENCSTATUS>") << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+#endif
+
+#ifdef _WINERROR_
+inline bool check(HRESULT e, int iLine, const char *szFile) { // true on S_OK; otherwise logs the code in upper-case hex at FATAL
+    if (e == S_OK) {
+        return true;
+    }
+    std::stringstream ssHex;
+    ssHex << std::hex << std::uppercase << e;
+    LOG(FATAL) << "HRESULT error 0x" << ssHex.str() << " at line " << iLine << " in file " << szFile;
+    return false;
+}
+#endif
+
+#if defined(__gl_h_) || defined(__GL_H__)
+inline bool check(GLenum e, int iLine, const char *szFile) { // a non-zero GLenum is logged at ERROR and reported as failure
+    const bool bNoError = (e == 0);
+    if (!bNoError) {
+        LOG(ERROR) << "GLenum error " << e << " at line " << iLine << " in file " << szFile;
+    }
+    return bNoError;
+}
+#endif
+
+inline bool check(int e, int iLine, const char *szFile) { // a negative value is logged at ERROR and reported as failure
+    if (e >= 0) {
+        return true;
+    }
+    LOG(ERROR) << "General error " << e << " at line " << iLine << " in file " << szFile;
+    return false;
+}
+
+#define ck(call) check(call, __LINE__, __FILE__) // passes the call site's line and file to the check() overload matching the result type
+#define MAKE_FOURCC(ch0, ch1, ch2, ch3)                                                                                \
+    ((uint32_t)(uint8_t)(ch0) | ((uint32_t)(uint8_t)(ch1) << 8) | ((uint32_t)(uint8_t)(ch2) << 16) |                   \
+     ((uint32_t)(uint8_t)(ch3) << 24))
+
+/**
+ * @brief Wrapper class around std::thread
+ */
+class NvThread {
+  public:
+    NvThread() = default;
+    NvThread(const NvThread &) = delete;
+    NvThread &operator=(const NvThread &other) = delete;
+    NvThread(std::thread &&thread) : t(std::move(thread)) {}
+    NvThread(NvThread &&thread) : t(std::move(thread.t)) {}
+
+    // Joins the thread currently held before adopting other's: assigning to a joinable std::thread calls std::terminate.
+    NvThread &operator=(NvThread &&other) {
+        if (this != &other) {
+            join();
+            t = std::move(other.t);
+        }
+        return *this;
+    }
+    ~NvThread() { join(); } // blocks until the owned thread, if any, has finished
+    // Waits for the owned thread when it is joinable; otherwise does nothing, so repeated calls are safe.
+    void join() {
+        if (t.joinable()) {
+            t.join();
+        }
+    }
+  private:
+    std::thread t;
+};
+
+#ifndef _WIN32
+#define _stricmp strcasecmp
+#define _stat64 stat64
+#endif
+
+/**
+ * @brief Utility class to allocate buffer memory. Helps avoid I/O during the encode/decode loop in case of performance
+ * tests.
+ */
+class BufferedFileReader {
+  public:
+    /**
+     * @brief Loads the file into a heap buffer. With bPartial set, every failed allocation is retried with a buffer
+     * 10% smaller, so only the leading part of a large file may end up loaded. On failure no buffer is exposed.
+     */
+    BufferedFileReader(const char *szFileName, bool bPartial = false) {
+        struct _stat64 st;
+        if (_stat64(szFileName, &st) != 0) {
+            return;
+        }
+
+        nSize = st.st_size;
+        while (nSize) {
+            try {
+                pBuf = new uint8_t[(size_t)nSize];
+                if (nSize != (uint64_t)st.st_size) {
+                    LOG(WARNING) << "File is too large - only " << std::setprecision(4) << 100.0 * nSize / st.st_size
+                                 << "% is loaded";
+                }
+                break;
+            } catch (const std::bad_alloc &) {
+                if (!bPartial) {
+                    LOG(ERROR) << "Failed to allocate memory in BufferedReader";
+                    return;
+                }
+                nSize = (uint64_t)(nSize * 0.9); // keep 64 bits: a 32-bit cast cannot represent sizes of 4 GiB and above
+            }
+        }
+
+        std::ifstream fpIn(szFileName, std::ifstream::in | std::ifstream::binary);
+        if (!fpIn) {
+            LOG(ERROR) << "Unable to open input file: " << szFileName;
+            delete[] pBuf; // do not expose a buffer that was never filled from the file
+            pBuf = NULL;
+            return;
+        }
+
+        std::streamsize nRead = fpIn.read(reinterpret_cast<char *>(pBuf), nSize).gcount();
+        fpIn.close();
+        assert((uint64_t)nRead == nSize);
+        nSize = (uint64_t)nRead; // when asserts are compiled out, report only the bytes actually read
+    }
+    // NOTE(review): owns pBuf via a raw pointer and declares no copy operations, so instances must not be copied.
+    ~BufferedFileReader() { delete[] pBuf; }
+    /** @brief Hands out the loaded buffer and its size; returns false (outputs untouched) when nothing is loaded. */
+    bool GetBuffer(uint8_t **ppBuf, uint64_t *pnSize) {
+        if (!pBuf) {
+            return false;
+        }
+
+        *ppBuf = pBuf;
+        *pnSize = nSize;
+        return true;
+    }
+
+  private:
+    uint8_t *pBuf = NULL;
+    uint64_t nSize = 0;
+};
+
+/**
+ * @brief Template class to facilitate color space conversion
+ */
+template <typename T> class YuvConverter { // Converts half-resolution chroma between two planes and interleaved pairs, in place
+  public:
+    YuvConverter(int nWidth, int nHeight) : nWidth(nWidth), nHeight(nHeight) {
+        pQuad = new T[((nWidth + 1) / 2) * ((nHeight + 1) / 2)]; // scratch for one half-width x half-height (rounded up) plane
+    }
+    ~YuvConverter() { delete[] pQuad; }
+    void PlanarToUVInterleaved(T *pFrame, int nPitch = 0) { // two chroma planes -> interleaved pairs, written back into pFrame
+        if (nPitch == 0) { // a pitch of 0 means the rows are tightly packed
+            nPitch = nWidth;
+        }
+
+        // sizes of source surface plane
+        int nSizePlaneY = nPitch * nHeight;
+        int nSizePlaneU = ((nPitch + 1) / 2) * ((nHeight + 1) / 2);
+        int nSizePlaneV = nSizePlaneU;
+        // Chroma starts right after the nPitch * nHeight luma elements; save the first chroma plane in pQuad
+        T *puv = pFrame + nSizePlaneY;
+        if (nPitch == nWidth) {
+            memcpy(pQuad, puv, nSizePlaneU * sizeof(T));
+        } else {
+            for (int i = 0; i < (nHeight + 1) / 2; i++) { // padded rows: copy only the visible half-width of each row
+                memcpy(pQuad + ((nWidth + 1) / 2) * i, puv + ((nPitch + 1) / 2) * i, ((nWidth + 1) / 2) * sizeof(T));
+            }
+        }
+        T *pv = puv + nSizePlaneU; // second chroma plane, still read from its original place in the frame
+        for (int y = 0; y < (nHeight + 1) / 2; y++) {
+            for (int x = 0; x < (nWidth + 1) / 2; x++) {
+                puv[y * nPitch + x * 2] = pQuad[y * ((nWidth + 1) / 2) + x]; // even slot: saved first-plane sample
+                puv[y * nPitch + x * 2 + 1] = pv[y * ((nPitch + 1) / 2) + x]; // odd slot: second-plane sample
+            }
+        }
+    }
+    void UVInterleavedToPlanar(T *pFrame, int nPitch = 0) { // interleaved pairs -> two chroma planes, written back into pFrame
+        if (nPitch == 0) { // a pitch of 0 means the rows are tightly packed
+            nPitch = nWidth;
+        }
+
+        // sizes of source surface plane
+        int nSizePlaneY = nPitch * nHeight;
+        int nSizePlaneU = ((nPitch + 1) / 2) * ((nHeight + 1) / 2);
+        int nSizePlaneV = nSizePlaneU;
+        // pu aliases the interleaved data (puv); pv is where the second plane will finally be stored
+        T *puv = pFrame + nSizePlaneY, *pu = puv, *pv = puv + nSizePlaneU;
+
+        // split chroma from interleave to planar
+        for (int y = 0; y < (nHeight + 1) / 2; y++) {
+            for (int x = 0; x < (nWidth + 1) / 2; x++) {
+                pu[y * ((nPitch + 1) / 2) + x] = puv[y * nPitch + x * 2]; // even slot goes straight into the first plane
+                pQuad[y * ((nWidth + 1) / 2) + x] = puv[y * nPitch + x * 2 + 1]; // odd slot is parked in the scratch buffer
+            }
+        }
+        if (nPitch == nWidth) { // copy the parked samples into the second plane
+            memcpy(pv, pQuad, nSizePlaneV * sizeof(T));
+        } else {
+            for (int i = 0; i < (nHeight + 1) / 2; i++) {
+                memcpy(pv + ((nPitch + 1) / 2) * i, pQuad + ((nWidth + 1) / 2) * i, ((nWidth + 1) / 2) * sizeof(T));
+            }
+        }
+    }
+
+  private:
+    T *pQuad;
+    int nWidth, nHeight;
+};
+
+/**
+ * @brief Class for writing IVF format header for AV1 codec
+ */
+class IVFUtils {
+  public:
+    // Appends the 32-byte IVF file header ("DKIF" signature followed by little-endian fields) to vPacket.
+    void WriteFileHeader(std::vector<uint8_t> &vPacket, uint32_t nFourCC, uint32_t nWidth, uint32_t nHeight,
+                         uint32_t nFrameRateNum, uint32_t nFrameRateDen, uint32_t nFrameCnt) {
+        const char szSignature[4] = {'D', 'K', 'I', 'F'};
+        char header[32];
+        for (int i = 0; i < 4; i++) {
+            header[i] = szSignature[i];
+        }
+        mem_put_le16(&header[4], 0);              // version
+        mem_put_le16(&header[6], 32);             // header size
+        mem_put_le32(&header[8], nFourCC);        // fourcc
+        mem_put_le16(&header[12], nWidth);        // width
+        mem_put_le16(&header[14], nHeight);       // height
+        mem_put_le32(&header[16], nFrameRateNum); // rate
+        mem_put_le32(&header[20], nFrameRateDen); // scale
+        mem_put_le32(&header[24], nFrameCnt);     // length
+        mem_put_le32(&header[28], 0);             // unused
+        vPacket.insert(vPacket.end(), header, header + sizeof(header));
+    }
+    // Appends the 12-byte IVF frame header: 32-bit frame size, then the 64-bit pts as low and high words.
+    void WriteFrameHeader(std::vector<uint8_t> &vPacket, size_t nFrameSize, int64_t pts) {
+        const int nPtsLow = (int)(pts & 0xFFFFFFFF);
+        const int nPtsHigh = (int)(pts >> 32);
+        char header[12];
+        mem_put_le32(&header[0], (int)nFrameSize);
+        mem_put_le32(&header[4], nPtsLow);
+        mem_put_le32(&header[8], nPtsHigh);
+        vPacket.insert(vPacket.end(), header, header + sizeof(header));
+    }
+
+  private:
+    // Stores the low 32 bits of val at vmem, least significant byte first.
+    static inline void mem_put_le32(void *vmem, int val) {
+        unsigned char *pOut = (unsigned char *)vmem;
+        for (int iByte = 0; iByte < 4; iByte++) {
+            pOut[iByte] = (unsigned char)((val >> (8 * iByte)) & 0xff);
+        }
+    }
+    // Stores the low 16 bits of val at vmem, least significant byte first.
+    static inline void mem_put_le16(void *vmem, int val) {
+        unsigned char *pOut = (unsigned char *)vmem;
+        pOut[0] = (unsigned char)(val & 0xff);
+        pOut[1] = (unsigned char)((val >> 8) & 0xff);
+    }
+};
+
+/**
+ * @brief Utility class to measure elapsed time in seconds between the block of executed code
+ */
+class StopWatch {
+  public:
+    // Records the current time as the reference point.
+    void Start() { t0 = std::chrono::high_resolution_clock::now(); }
+    // Returns the seconds elapsed since the recorded reference point, which is left unchanged.
+    double Stop() {
+        const auto tNow = std::chrono::high_resolution_clock::now();
+        const auto nElapsedNs = std::chrono::duration_cast<std::chrono::nanoseconds>(tNow - t0).count();
+        return nElapsedNs / 1.0e9;
+    }
+  private:
+    std::chrono::high_resolution_clock::time_point t0;
+};
+
+template <typename T> class ConcurrentQueue {
+  public:
+    ConcurrentQueue() {}
+    ConcurrentQueue(size_t size) : maxSize(size) {}
+    ConcurrentQueue(const ConcurrentQueue &) = delete;
+    ConcurrentQueue &operator=(const ConcurrentQueue &) = delete;
+
+    // Changes the size limit (0 = unbounded) and wakes waiters so blocked producers re-check it.
+    void setSize(size_t s) {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        maxSize = s;
+        m_cond.notify_all();
+    }
+
+    // Appends a copy of value, blocking while the queue is at its size limit.
+    void push_back(const T &value) {
+        // Do not use a std::lock_guard here. We unlock explicitly before
+        // notifying so that woken threads can take the mutex right away.
+        std::unique_lock<std::mutex> lock(m_mutex);
+
+        while (full()) {
+            m_cond.wait(lock);
+        }
+
+        m_List.push_back(value);
+        lock.unlock();
+        // Producers and consumers wait on the same condition variable, so wake all of them on every
+        // change: a conditional notify_one can be consumed by the wrong waiter and strand the rest.
+        m_cond.notify_all();
+    }
+
+    // Removes and returns the oldest element, blocking while the queue is empty.
+    T pop_front() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+
+        while (m_List.empty()) {
+            m_cond.wait(lock);
+        }
+        T data = std::move(m_List.front());
+        m_List.pop_front();
+
+        lock.unlock();
+        m_cond.notify_all(); // space became available; every waiter re-checks its own condition
+
+        return data;
+    }
+
+    // Returns a copy of the oldest element without removing it, blocking while the queue is empty.
+    T front() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+
+        while (m_List.empty()) {
+            m_cond.wait(lock);
+        }
+
+        return m_List.front();
+    }
+
+    size_t size() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        return m_List.size();
+    }
+
+    bool empty() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        return m_List.empty();
+    }
+    // Drops every queued element and wakes waiters, since producers may have been blocked on a full queue.
+    void clear() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        m_List.clear();
+        m_cond.notify_all();
+    }
+
+  private:
+    // Must be called with m_mutex held. ">=" keeps the queue full if setSize() lowers the limit below the current size.
+    bool full() const { return maxSize > 0 && m_List.size() >= maxSize; }
+
+    std::list<T> m_List;
+    std::mutex m_mutex;
+    std::condition_variable m_cond;
+    size_t maxSize = 0; // 0 means unbounded; initialized here so the default constructor yields a defined limit
+};
+
+inline void CheckInputFile(const char *szInFilePath) { // throws std::invalid_argument when the file cannot be opened for reading
+    std::ifstream fpProbe(szInFilePath, std::ios::in | std::ios::binary);
+    if (!fpProbe) {
+        std::ostringstream ssMsg;
+        ssMsg << "Unable to open input file: " << szInFilePath << std::endl;
+        throw std::invalid_argument(ssMsg.str());
+    }
+}
+
+inline void ValidateResolution(int nWidth, int nHeight) {
+    // Accept only strictly positive dimensions; anything else is reported through std::invalid_argument.
+    if (nWidth > 0 && nHeight > 0) {
+        return;
+    }
+    std::ostringstream ssMsg;
+    ssMsg << "Please specify positive non zero resolution as -s WxH. Current resolution is " << nWidth << "x" << nHeight << std::endl;
+    throw std::invalid_argument(ssMsg.str());
+}
+
+template <class COLOR32>
+void Nv12ToColor32(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 0);
+template <class COLOR64>
+void Nv12ToColor64(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 0);
+
+template <class COLOR32>
+void P016ToColor32(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 4);
+template <class COLOR64>
+void P016ToColor64(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 4);
+
+template <class COLOR32>
+void YUV444ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                     int iMatrix = 0);
+template <class COLOR64>
+void YUV444ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                     int iMatrix = 0);
+
+template <class COLOR32>
+void YUV444P16ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                        int iMatrix = 4);
+template <class COLOR64>
+void YUV444P16ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                        int iMatrix = 4);
+
+template <class COLOR32>
+void Nv12ToColorPlanar(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                       int iMatrix = 0);
+template <class COLOR32>
+void P016ToColorPlanar(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                       int iMatrix = 4);
+
+template <class COLOR32>
+void YUV444ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                         int iMatrix = 0);
+template <class COLOR32>
+void YUV444P16ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                            int iMatrix = 4);
+
+void Bgra64ToP016(uint8_t *dpBgra, int nBgraPitch, uint8_t *dpP016, int nP016Pitch, int nWidth, int nHeight,
+                  int iMatrix = 4);
+
+void ConvertUInt8ToUInt16(uint8_t *dpUInt8, uint16_t *dpUInt16, int nSrcPitch, int nDestPitch, int nWidth, int nHeight);
+void ConvertUInt16ToUInt8(uint16_t *dpUInt16, uint8_t *dpUInt8, int nSrcPitch, int nDestPitch, int nWidth, int nHeight);
+
+void ResizeNv12(unsigned char *dpDstNv12, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcNv12,
+                int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char *dpDstNv12UV = nullptr);
+void ResizeP016(unsigned char *dpDstP016, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcP016,
+                int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char *dpDstP016UV = nullptr);
+
+void ScaleYUV420(unsigned char *dpDstY, unsigned char *dpDstU, unsigned char *dpDstV, int nDstPitch,
+                 int nDstChromaPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcY, unsigned char *dpSrcU,
+                 unsigned char *dpSrcV, int nSrcPitch, int nSrcChromaPitch, int nSrcWidth, int nSrcHeight,
+                 bool bSemiplanar);
+
+#ifdef __cuda_cuda_h__
+void ComputeCRC(uint8_t *pBuffer, uint32_t *crcValue, CUstream_st *outputCUStream);
+#endif