From c2cad0997bb68d5963f912d23f6aa747d0d0f52e Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Wed, 2 Oct 2024 08:08:47 +0300 Subject: [PATCH] First version with cuda 12.6, trt 10 --- src/Detector/tensorrt_yolo/CMakeLists.txt | 14 +- src/Detector/tensorrt_yolo/YoloONNX.cpp | 29 +- .../tensorrt_yolo/common/BatchStream.h | 47 +- .../tensorrt_yolo/common/EntropyCalibrator.h | 18 +- .../tensorrt_yolo/common/ErrorRecorder.h | 9 +- .../tensorrt_yolo/common/argsParser.h | 162 + .../tensorrt_yolo/common/bfloat16.cpp | 60 + src/Detector/tensorrt_yolo/common/bfloat16.h | 46 + src/Detector/tensorrt_yolo/common/buffers.h | 164 +- src/Detector/tensorrt_yolo/common/common.h | 330 +- .../tensorrt_yolo/common/dumpTFWts.py | 124 + .../tensorrt_yolo/common/fileLock.cpp | 100 + src/Detector/tensorrt_yolo/common/fileLock.h | 86 + .../tensorrt_yolo/common/getOptions.cpp | 248 + .../tensorrt_yolo/common/getOptions.h | 128 + src/Detector/tensorrt_yolo/common/getopt.c | 568 +++ src/Detector/tensorrt_yolo/common/getoptWin.h | 124 + src/Detector/tensorrt_yolo/common/half.h | 9 +- src/Detector/tensorrt_yolo/common/logger.cpp | 7 +- src/Detector/tensorrt_yolo/common/logger.h | 5 +- src/Detector/tensorrt_yolo/common/logging.h | 16 +- .../tensorrt_yolo/common/parserOnnxConfig.h | 56 +- .../tensorrt_yolo/common/safeCommon.h | 321 +- .../tensorrt_yolo/common/sampleConfig.h | 50 +- .../tensorrt_yolo/common/sampleDevice.cpp | 133 + .../tensorrt_yolo/common/sampleDevice.h | 142 +- .../tensorrt_yolo/common/sampleEngines.cpp_ | 1688 +++++++ .../tensorrt_yolo/common/sampleEngines.h | 296 +- .../tensorrt_yolo/common/sampleEntrypoints.h | 101 + .../tensorrt_yolo/common/sampleInference.cpp_ | 1622 +++++++ .../tensorrt_yolo/common/sampleInference.h | 226 +- .../tensorrt_yolo/common/sampleOptions.cpp | 2081 ++++++-- .../tensorrt_yolo/common/sampleOptions.h | 236 +- .../tensorrt_yolo/common/sampleReporting.cpp | 300 +- .../tensorrt_yolo/common/sampleReporting.h | 124 +- .../tensorrt_yolo/common/sampleUtils.cpp | 587 +++ .../tensorrt_yolo/common/sampleUtils.h | 528 +- .../tensorrt_yolo/common/streamReader.h | 78 + .../tensorrt_yolo/common/timingCache.cpp | 157 + .../tensorrt_yolo/common/timingCache.h | 38 + .../common_deprecated/BatchStream.h | 388 ++ .../common_deprecated/EntropyCalibrator.h | 134 + .../common_deprecated/ErrorRecorder.h | 137 + .../tensorrt_yolo/common_deprecated/buffers.h | 478 ++ .../tensorrt_yolo/common_deprecated/common.h | 963 ++++ .../tensorrt_yolo/common_deprecated/half.h | 4302 +++++++++++++++++ .../common_deprecated/logger.cpp | 40 + .../tensorrt_yolo/common_deprecated/logger.h | 36 + .../tensorrt_yolo/common_deprecated/logging.h | 578 +++ .../common_deprecated/parserOnnxConfig.h | 153 + .../common_deprecated/safeCommon.h | 71 + .../common_deprecated/sampleConfig.h | 337 ++ .../common_deprecated/sampleDevice.h | 494 ++ .../sampleEngines.cpp | 0 .../common_deprecated/sampleEngines.h | 183 + .../sampleInference.cpp | 0 .../common_deprecated/sampleInference.h | 92 + .../common_deprecated/sampleOptions.cpp | 1778 +++++++ .../common_deprecated/sampleOptions.h | 355 ++ .../common_deprecated/sampleReporting.cpp | 445 ++ .../common_deprecated/sampleReporting.h | 222 + .../common_deprecated/sampleUtils.h | 543 +++ src/Detector/tensorrt_yolo/yolo.cpp | 63 +- src/Detector/tensorrt_yolo/yolo.h | 1 + 64 files changed, 21126 insertions(+), 1725 deletions(-) create mode 100644 src/Detector/tensorrt_yolo/common/argsParser.h create mode 100644 src/Detector/tensorrt_yolo/common/bfloat16.cpp create mode 100644 
src/Detector/tensorrt_yolo/common/bfloat16.h create mode 100644 src/Detector/tensorrt_yolo/common/dumpTFWts.py create mode 100644 src/Detector/tensorrt_yolo/common/fileLock.cpp create mode 100644 src/Detector/tensorrt_yolo/common/fileLock.h create mode 100644 src/Detector/tensorrt_yolo/common/getOptions.cpp create mode 100644 src/Detector/tensorrt_yolo/common/getOptions.h create mode 100644 src/Detector/tensorrt_yolo/common/getopt.c create mode 100644 src/Detector/tensorrt_yolo/common/getoptWin.h create mode 100644 src/Detector/tensorrt_yolo/common/sampleDevice.cpp create mode 100644 src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ create mode 100644 src/Detector/tensorrt_yolo/common/sampleEntrypoints.h create mode 100644 src/Detector/tensorrt_yolo/common/sampleInference.cpp_ create mode 100644 src/Detector/tensorrt_yolo/common/sampleUtils.cpp create mode 100644 src/Detector/tensorrt_yolo/common/streamReader.h create mode 100644 src/Detector/tensorrt_yolo/common/timingCache.cpp create mode 100644 src/Detector/tensorrt_yolo/common/timingCache.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/buffers.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/common.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/half.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logger.cpp create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logger.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logging.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h rename src/Detector/tensorrt_yolo/{common => common_deprecated}/sampleEngines.cpp (100%) create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h rename src/Detector/tensorrt_yolo/{common => common_deprecated}/sampleInference.cpp (100%) create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h diff --git a/src/Detector/tensorrt_yolo/CMakeLists.txt b/src/Detector/tensorrt_yolo/CMakeLists.txt index 30509d0e..30f916bf 100644 --- a/src/Detector/tensorrt_yolo/CMakeLists.txt +++ b/src/Detector/tensorrt_yolo/CMakeLists.txt @@ -58,13 +58,20 @@ file(GLOB TENSORRT_CUDA_FILES *.cu) cuda_add_library(${libname_rt} SHARED ${TENSORRT_CUDA_FILES} ${TENSORRT_SOURCE_FILES} - ${TENSORRT_HEADER_FILES} -) + ${TENSORRT_HEADER_FILES}) #message("TensorRT OpenCV libraries:") #message("${OpenCV_LIBS}") #message(${OpenCV_DIR}) +if (MSVC) + file(GLOB TensorRT_LIBRARIES ${TensorRT_LIBRARY}) +endif() + +message("TensorRT_LIBRARY: ${TensorRT_LIBRARY}") +message("TensorRT_LIBRARIES: ${TensorRT_LIBRARIES}") + + set(TENSORRT_LIBS ${OpenCV_LIBS} 
#${CUDA_LIBRARIES} @@ -74,8 +81,7 @@ set(TENSORRT_LIBS ${CUDA_curand_LIBRARY} ${CUDNN_LIBRARY} # ${LIB_PTHREAD} - ${TensorRT_LIBRARIES} -) + ${TensorRT_LIBRARIES}) if (CMAKE_COMPILER_IS_GNUCXX) set(TENSORRT_LIBS ${TENSORRT_LIBS} stdc++fs nvinfer_plugin nvonnxparser) diff --git a/src/Detector/tensorrt_yolo/YoloONNX.cpp b/src/Detector/tensorrt_yolo/YoloONNX.cpp index b016c4b3..0b19d5cc 100644 --- a/src/Detector/tensorrt_yolo/YoloONNX.cpp +++ b/src/Detector/tensorrt_yolo/YoloONNX.cpp @@ -22,14 +22,13 @@ bool YoloONNX::Init(const SampleYoloParams& params) auto GetBindings = [&]() { - auto numBindings = m_engine->getNbBindings(); + auto numBindings = m_engine->getNbIOTensors(); std::cout << "** Bindings: " << numBindings << " **" << std::endl; for (int32_t i = 0; i < numBindings; ++i) { - nvinfer1::Dims dim = m_engine->getBindingDimensions(i); - - std::string bindName = m_engine->getBindingName(i); + std::string bindName = m_engine->getIOTensorName(i); + nvinfer1::Dims dim = m_engine->getTensorShape(bindName.c_str()); for (const auto& outName : m_params.outputTensorNames) { if (bindName == outName) @@ -77,27 +76,17 @@ bool YoloONNX::Init(const SampleYoloParams& params) delete infer; #endif - sample::gLogInfo << "TRT Engine loaded from: " << m_params.engineFileName << std::endl; - - GetBindings(); - - if (!m_engine) + if (m_engine) { - res = false; + GetBindings(); + m_inputDims = m_engine->getTensorShape(m_engine->getIOTensorName(0)); + res = true; } else { -#if 1 - m_inputDims = m_engine->getBindingDimensions(0); -#else - m_inputDims.nbDims = 4; - m_inputDims.d[0] = m_params.explicitBatchSize; - m_inputDims.d[1] = 3; - m_inputDims.d[2] = m_params.width; - m_inputDims.d[3] = m_params.height; -#endif res = true; } + sample::gLogInfo << "TRT Engine loaded from: " << m_params.engineFileName << " with res = " << res << std::endl; } else { @@ -177,7 +166,7 @@ bool YoloONNX::ConstructNetwork(YoloONNXUniquePtr& builder, size_t dlaGlobalDRAMSize = config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM); std::cout << "workspaceSize = " << workspaceSize << ", dlaManagedSRAMSize = " << dlaManagedSRAMSize << ", dlaLocalDRAMSize = " << dlaLocalDRAMSize << ", dlaGlobalDRAMSize = " << dlaGlobalDRAMSize << std::endl; - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? m_params.videoMemory : 4096_MiB); + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? m_params.videoMemory : (1 << 20)); #endif config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); diff --git a/src/Detector/tensorrt_yolo/common/BatchStream.h b/src/Detector/tensorrt_yolo/common/BatchStream.h index a8da9923..c4ab9de0 100644 --- a/src/Detector/tensorrt_yolo/common/BatchStream.h +++ b/src/Detector/tensorrt_yolo/common/BatchStream.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -119,7 +120,7 @@ class MNISTBatchStream : public IBatchStream file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); mData.resize(numElements); std::transform( - rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); + rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.F; }); } void readLabelsFile(const std::string& labelsFilePath) @@ -152,42 +153,39 @@ class MNISTBatchStream : public IBatchStream class BatchStream : public IBatchStream { public: - BatchStream( - int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector directories) + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::string const& suffix, + std::vector const& directories) : mBatchSize(batchSize) , mMaxBatches(maxBatches) , mPrefix(prefix) , mSuffix(suffix) , mDataDir(directories) { - FILE* file = fopen(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), "rb"); - ASSERT(file != nullptr); + std::ifstream file(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), std::ios::binary); + ASSERT(file.good()); int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); mDims.nbDims = 4; // The number of dimensions. mDims.d[0] = d[0]; // Batch Size mDims.d[1] = d[1]; // Channels mDims.d[2] = d[2]; // Height mDims.d[3] = d[3]; // Width ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); - fclose(file); mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; mBatch.resize(mBatchSize * mImageSize, 0); mLabels.resize(mBatchSize, 0); mFileBatch.resize(mDims.d[0] * mImageSize, 0); mFileLabels.resize(mDims.d[0], 0); - reset(0); } - BatchStream(int batchSize, int maxBatches, std::string prefix, std::vector directories) + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::vector const& directories) : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) { } - BatchStream( - int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector directories) + BatchStream(int batchSize, int maxBatches, nvinfer1::Dims const& dims, std::string const& listFile, + std::vector const& directories) : mBatchSize(batchSize) , mMaxBatches(maxBatches) , mDims(dims) @@ -199,7 +197,6 @@ class BatchStream : public IBatchStream mLabels.resize(mBatchSize, 0); mFileBatch.resize(mDims.d[0] * mImageSize, 0); mFileLabels.resize(mDims.d[0], 0); - reset(0); } // Resets data members @@ -219,7 +216,7 @@ class BatchStream : public IBatchStream return false; } - for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) + for (int64_t csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) { ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); if (mFileBatchPos == mDims.d[0] && !update()) @@ -228,7 +225,7 @@ class BatchStream : public IBatchStream } // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. 
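            // (Worked example, for illustration only: with mBatchSize == 100 and file
            // batches of mDims.d[0] == 64, the first pass copies 64 elements, update()
            // loads the next .batch file, and the second pass copies the remaining 36,
            // so a single getBatch() call may span file boundaries.)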
- csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); std::copy_n( getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); @@ -295,22 +292,16 @@ class BatchStream : public IBatchStream if (mListFile.empty()) { std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); - FILE* file = fopen(inputFileName.c_str(), "rb"); + std::ifstream file(inputFileName.c_str(), std::ios::binary); if (!file) { return false; } - int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); - size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); - ASSERT(readInputCount == size_t(mDims.d[0] * mImageSize)); - size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file); - ASSERT(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0])); - - fclose(file); + file.read(reinterpret_cast(getFileBatch()), sizeof(float) * mDims.d[0] * mImageSize); + file.read(reinterpret_cast(getFileLabels()), sizeof(float) * mDims.d[0]); } else { @@ -368,7 +359,7 @@ class BatchStream : public IBatchStream return true; } - int mBatchSize{0}; + int64_t mBatchSize{0}; int mMaxBatches{0}; int mBatchCount{0}; int mFileCount{0}; diff --git a/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h b/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h index f31789bf..67a0130e 100644 --- a/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h +++ b/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -28,8 +29,8 @@ template class EntropyCalibratorImpl { public: - EntropyCalibratorImpl( - TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true) + EntropyCalibratorImpl(TBatchStream const& stream, int firstBatch, std::string const& networkName, + const char* inputBlobName, bool readCache = true) : mStream{stream} , mCalibrationTableName("CalibrationTable" + networkName) , mInputBlobName(inputBlobName) @@ -51,11 +52,12 @@ class EntropyCalibratorImpl return mStream.getBatchSize(); } - bool getBatch(void* bindings[], const char* names[], int /*nbBindings*/) noexcept + bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept { if (!mStream.next()) + { return false; - + } CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice)); ASSERT(!strcmp(names[0], mInputBlobName)); bindings[0] = mDeviceInput; @@ -101,8 +103,8 @@ template class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: - Int8EntropyCalibrator2( - TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true) + Int8EntropyCalibrator2(TBatchStream const& stream, int32_t firstBatch, const char* networkName, + const char* inputBlobName, bool readCache = true) : mImpl(stream, firstBatch, networkName, inputBlobName, readCache) { } diff --git a/src/Detector/tensorrt_yolo/common/ErrorRecorder.h b/src/Detector/tensorrt_yolo/common/ErrorRecorder.h index 40b35fb5..bfb857c5 100644 --- a/src/Detector/tensorrt_yolo/common/ErrorRecorder.h +++ b/src/Detector/tensorrt_yolo/common/ErrorRecorder.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,7 +17,7 @@ #ifndef ERROR_RECORDER_H #define ERROR_RECORDER_H -#include "NvInferRuntimeCommon.h" +#include "NvInferRuntime.h" #include "logger.h" #include #include @@ -44,7 +45,7 @@ class SampleErrorRecorder : public IErrorRecorder public: SampleErrorRecorder() = default; - virtual ~SampleErrorRecorder() noexcept {} + ~SampleErrorRecorder() noexcept override {} int32_t getNbErrors() const noexcept final { return mErrorStack.size(); diff --git a/src/Detector/tensorrt_yolo/common/argsParser.h b/src/Detector/tensorrt_yolo/common/argsParser.h new file mode 100644 index 00000000..1f0b9025 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/argsParser.h @@ -0,0 +1,162 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_ARGS_PARSER_H +#define TENSORRT_ARGS_PARSER_H + +#ifdef _MSC_VER +#include "getOptWin.h" +#else +#include +#endif +#include +#include +#include + +namespace samplesCommon +{ + +//! +//! \brief The SampleParams structure groups the basic parameters required by +//! all sample networks. +//! +struct SampleParams +{ + int32_t batchSize{1}; //!< Number of inputs in a batch + int32_t dlaCore{-1}; //!< Specify the DLA core to run network on. + bool int8{false}; //!< Allow runnning the network in Int8 mode. + bool fp16{false}; //!< Allow running the network in FP16 mode. + bool bf16{false}; //!< Allow running the network in BF16 mode. + std::vector dataDirs; //!< Directory paths where sample data files are stored + std::vector inputTensorNames; + std::vector outputTensorNames; + std::string timingCacheFile; //!< Path to timing cache file +}; + +//! +//! \brief The OnnxSampleParams structure groups the additional parameters required by +//! networks that use ONNX +//! +struct OnnxSampleParams : public SampleParams +{ + std::string onnxFileName; //!< Filename of ONNX file of a network +}; + +//! +//! /brief Struct to maintain command-line arguments. +//! +struct Args +{ + bool runInInt8{false}; + bool runInFp16{false}; + bool runInBf16{false}; + bool help{false}; + int32_t useDLACore{-1}; + int32_t batch{1}; + std::vector dataDirs; + std::string saveEngine; + std::string loadEngine; + bool rowOrder{true}; + std::string timingCacheFile; +}; + +//! +//! \brief Populates the Args struct with the provided command-line parameters. +//! +//! \throw invalid_argument if any of the arguments are not valid +//! +//! \return boolean If return value is true, execution can continue, otherwise program should exit +//! 
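//! A minimal usage sketch (hypothetical caller; printHelpInfo() is an assumed helper,
//! not part of this header):
//!
//!     samplesCommon::Args args;
//!     if (!samplesCommon::parseArgs(args, argc, argv))
//!     {
//!         printHelpInfo();
//!         return EXIT_FAILURE;
//!     }
//!     if (args.help)
//!     {
//!         printHelpInfo();
//!         return EXIT_SUCCESS;
//!     }
//!     // ...then forward args.dataDirs, args.runInFp16, args.useDLACore into SampleParams.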
+inline bool parseArgs(Args& args, int32_t argc, char* argv[]) +{ + while (1) + { + int32_t arg; + static struct option long_options[] + = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'}, {"int8", no_argument, 0, 'i'}, + {"fp16", no_argument, 0, 'f'}, {"bf16", no_argument, 0, 'z'}, {"columnOrder", no_argument, 0, 'c'}, + {"saveEngine", required_argument, 0, 's'}, {"loadEngine", required_argument, 0, 'o'}, + {"useDLACore", required_argument, 0, 'u'}, {"batch", required_argument, 0, 'b'}, + {"timingCacheFile", required_argument, 0, 't'}, {nullptr, 0, nullptr, 0}}; + int32_t option_index = 0; + arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index); + if (arg == -1) + { + break; + } + + switch (arg) + { + case 'h': args.help = true; return true; + case 'd': + if (optarg) + { + args.dataDirs.push_back(optarg); + } + else + { + std::cerr << "ERROR: --datadir requires option argument" << std::endl; + return false; + } + break; + case 's': + if (optarg) + { + args.saveEngine = optarg; + } + break; + case 'o': + if (optarg) + { + args.loadEngine = optarg; + } + break; + case 'i': args.runInInt8 = true; break; + case 'f': args.runInFp16 = true; break; + case 'z': args.runInBf16 = true; break; + case 'c': args.rowOrder = false; break; + case 'u': + if (optarg) + { + args.useDLACore = std::stoi(optarg); + } + break; + case 'b': + if (optarg) + { + args.batch = std::stoi(optarg); + } + break; + case 't': + if (optarg) + { + args.timingCacheFile = optarg; + } + else + { + std::cerr << "ERROR: --timingCacheFile requires option argument" << std::endl; + return false; + } + break; + default: return false; + } + } + return true; +} + +} // namespace samplesCommon + +#endif // TENSORRT_ARGS_PARSER_H diff --git a/src/Detector/tensorrt_yolo/common/bfloat16.cpp b/src/Detector/tensorrt_yolo/common/bfloat16.cpp new file mode 100644 index 00000000..8222826a --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/bfloat16.cpp @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "bfloat16.h" +#include <cstring> + +namespace sample +{ + +BFloat16::operator float() const +{ + static_assert(sizeof(uint32_t) == sizeof(float), ""); + float val{0.F}; + auto bits = static_cast<uint32_t>(mRep) << 16; + std::memcpy(&val, &bits, sizeof(uint32_t)); + return val; +} + +BFloat16::BFloat16(float x) +{ + static_assert(sizeof(uint32_t) == sizeof(float), ""); + uint32_t bits{0}; + std::memcpy(&bits, &x, sizeof(float)); + + // FP32 format: 1 sign bit, 8 bit exponent, 23 bit mantissa + // BF16 format: 1 sign bit, 8 bit exponent, 7 bit mantissa + + // Mask for exponent + constexpr uint32_t exponent = 0xFFU << 23; + + // Check if exponent is all 1s (NaN or infinite) + if ((bits & exponent) != exponent) + { + // x is finite - round to even + bits += 0x7FFFU + (bits >> 16 & 1); + } + + mRep = static_cast<uint16_t>(bits >> 16); +} + +BFloat16 operator+(BFloat16 x, BFloat16 y) +{ + return BFloat16(static_cast<float>(x) + static_cast<float>(y)); +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/bfloat16.h b/src/Detector/tensorrt_yolo/common/bfloat16.h new file mode 100644 index 00000000..0d0ab922 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/bfloat16.h @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <cstdint> + +namespace sample +{ + +//! Implements "Brain Floating Point": like an IEEE FP32, +//! but the significand is only 7 bits instead of 23 bits. +class BFloat16 +{ +public: + BFloat16() + : mRep(0) + { + } + + // Rounds to even if there is a tie. + BFloat16(float x); + + operator float() const; + +private: + //! Value stored in BFloat16 representation. + uint16_t mRep; +}; +BFloat16 operator+(BFloat16 x, BFloat16 y); + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/buffers.h b/src/Detector/tensorrt_yolo/common/buffers.h index ef673b2b..e58f2f5c 100644 --- a/src/Detector/tensorrt_yolo/common/buffers.h +++ b/src/Detector/tensorrt_yolo/common/buffers.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -238,28 +239,53 @@ class BufferManager public: static const size_t kINVALID_SIZE_VALUE = ~size_t(0); + //! + //! \brief Create a BufferManager for handling buffer interactions with engine, when the I/O tensor volumes + //! are provided + //!
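//! A sketch of the intended call pattern (assumed from the signature; the tensor
//! name "output" is a placeholder, and volume() is the three-argument
//! samplesCommon::volume(dims, start, stop) defined in common.h):
//!
//!     std::vector<int64_t> volumes;
//!     for (int32_t i = 0; i < engine->getNbIOTensors(); ++i)
//!     {
//!         auto dims = context->getTensorShape(engine->getIOTensorName(i));
//!         volumes.push_back(samplesCommon::volume(dims, 0, dims.nbDims));
//!     }
//!     samplesCommon::BufferManager buffers(engine, volumes);
//!     // ...fill input host buffers, then:
//!     buffers.copyInputToDevice();
//!     context->executeV2(buffers.getDeviceBindings().data());
//!     buffers.copyOutputToHost();
//!     auto const* out = static_cast<float const*>(buffers.getHostBuffer("output"));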
+ BufferManager( + std::shared_ptr engine, std::vector const& volumes, int32_t batchSize = 0) + : mEngine(engine) + , mBatchSize(batchSize) + { + // Create host and device buffers + for (int32_t i = 0; i < mEngine->getNbIOTensors(); i++) + { + auto const name = engine->getIOTensorName(i); + mNames[name] = i; + + nvinfer1::DataType type = mEngine->getTensorDataType(name); + + std::unique_ptr manBuf{new ManagedBuffer()}; + manBuf->deviceBuffer = DeviceBuffer(volumes[i], type); + manBuf->hostBuffer = HostBuffer(volumes[i], type); + void* deviceBuffer = manBuf->deviceBuffer.data(); + mDeviceBindings.emplace_back(deviceBuffer); + mManagedBuffers.emplace_back(std::move(manBuf)); + } + } + //! //! \brief Create a BufferManager for handling buffer interactions with engine. //! - BufferManager(std::shared_ptr engine, const int batchSize, - const nvinfer1::IExecutionContext* context = nullptr) + BufferManager(std::shared_ptr engine, int32_t const batchSize = 0, + nvinfer1::IExecutionContext const* context = nullptr) : mEngine(engine) , mBatchSize(batchSize) { - // Full Dims implies no batch size. - auto impbs = engine->hasImplicitBatchDimension(); - std::cout << "hasImplicitBatchDimension: " << impbs << ", mBatchSize = " << mBatchSize << std::endl; - assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); // Create host and device buffers - for (int i = 0; i < mEngine->getNbBindings(); i++) + for (int32_t i = 0, e = mEngine->getNbIOTensors(); i < e; i++) { - auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); + auto const name = engine->getIOTensorName(i); + mNames[name] = i; + + auto dims = context ? context->getTensorShape(name) : mEngine->getTensorShape(name); size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); - nvinfer1::DataType type = mEngine->getBindingDataType(i); - int vecDim = mEngine->getBindingVectorizedDim(i); + nvinfer1::DataType type = mEngine->getTensorDataType(name); + int32_t vecDim = mEngine->getTensorVectorizedDim(name); if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector { - int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + int32_t scalarsPerVec = mEngine->getTensorComponentsPerElement(name); dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); vol *= scalarsPerVec; } @@ -267,7 +293,8 @@ class BufferManager std::unique_ptr manBuf{new ManagedBuffer()}; manBuf->deviceBuffer = DeviceBuffer(vol, type); manBuf->hostBuffer = HostBuffer(vol, type); - mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); + void* deviceBuffer = manBuf->deviceBuffer.data(); + mDeviceBindings.emplace_back(deviceBuffer); mManagedBuffers.emplace_back(std::move(manBuf)); } } @@ -284,7 +311,7 @@ class BufferManager //! //! \brief Returns a vector of device buffers. //! - const std::vector& getDeviceBindings() const + std::vector const& getDeviceBindings() const { return mDeviceBindings; } @@ -293,7 +320,7 @@ class BufferManager //! \brief Returns the device buffer corresponding to tensorName. //! Returns nullptr if no such tensor can be found. //! - void* getDeviceBuffer(const std::string& tensorName) const + void* getDeviceBuffer(std::string const& tensorName) const { return getBuffer(false, tensorName); } @@ -302,72 +329,21 @@ class BufferManager //! \brief Returns the host buffer corresponding to tensorName. //! Returns nullptr if no such tensor can be found. //! - void* getHostBuffer(const std::string& tensorName) const + void* getHostBuffer(std::string const& tensorName) const { return getBuffer(true, tensorName); } - //! 
- //! \brief Returns the host buffer corresponding to tensorName. - //! Returns nullptr if no such tensor can be found. - //! - void* getHostBuffer(int bindingIndex) const - { - return getBuffer(true, bindingIndex); - } - //! //! \brief Returns the size of the host and device buffers that correspond to tensorName. //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. //! - size_t size(const std::string& tensorName) const + size_t size(std::string const& tensorName) const { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) + auto record = mNames.find(tensorName); + if (record == mNames.end()) return kINVALID_SIZE_VALUE; - return mManagedBuffers[index]->hostBuffer.nbBytes(); - } - - //! - //! \brief Dump host buffer with specified tensorName to ostream. - //! Prints error message to std::ostream if no such tensor can be found. - //! - void dumpBuffer(std::ostream& os, const std::string& tensorName) - { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) - { - os << "Invalid tensor name" << std::endl; - return; - } - void* buf = mManagedBuffers[index]->hostBuffer.data(); - size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); - nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); - size_t rowCount = static_cast(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); - int leadDim = mBatchSize; - int* trailDims = bufDims.d; - int nbDims = bufDims.nbDims; - - // Fix explicit Dimension networks - if (!leadDim && nbDims > 0) - { - leadDim = bufDims.d[0]; - ++trailDims; - --nbDims; - } - - os << "[" << leadDim; - for (int i = 0; i < nbDims; i++) - os << ", " << trailDims[i]; - os << "]" << std::endl; - switch (mEngine->getBindingDataType(index)) - { - case nvinfer1::DataType::kINT32: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kFLOAT: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kHALF: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break; - case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break; - } + return mManagedBuffers[record->second]->hostBuffer.nbBytes(); } //! @@ -382,7 +358,7 @@ class BufferManager assert(bufSize % sizeof(T) == 0); T* typedBuf = static_cast(buf); size_t numItems = bufSize / sizeof(T); - for (int i = 0; i < static_cast(numItems); i++) + for (int32_t i = 0; i < static_cast(numItems); i++) { // Handle rowCount == 1 case if (rowCount == 1 && i != static_cast(numItems) - 1) @@ -404,7 +380,7 @@ class BufferManager //! void copyInputToDevice() { - memcpyBuffers(true, false, false, 0); + memcpyBuffers(true, false, false); } //! @@ -412,13 +388,13 @@ class BufferManager //! void copyOutputToHost() { - memcpyBuffers(false, true, false, 0); + memcpyBuffers(false, true, false); } //! //! \brief Copy the contents of input host buffers to input device buffers asynchronously. //! - void copyInputToDeviceAsync(const cudaStream_t& stream) + void copyInputToDeviceAsync(cudaStream_t const& stream = 0) { memcpyBuffers(true, false, true, stream); } @@ -426,7 +402,7 @@ class BufferManager //! //! \brief Copy the contents of output device buffers to output host buffers asynchronously. //! 
- void copyOutputToHostAsync(const cudaStream_t& stream) + void copyOutputToHostAsync(cudaStream_t const& stream = 0) { memcpyBuffers(false, true, true, stream); } @@ -434,30 +410,31 @@ class BufferManager ~BufferManager() = default; private: - void* getBuffer(const bool isHost, const std::string& tensorName) const + void* getBuffer(bool const isHost, std::string const& tensorName) const { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) + auto record = mNames.find(tensorName); + if (record == mNames.end()) return nullptr; - return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data()); + return (isHost ? mManagedBuffers[record->second]->hostBuffer.data() + : mManagedBuffers[record->second]->deviceBuffer.data()); } - void* getBuffer(const bool isHost, int bindingIndex) const + bool tenosrIsInput(const std::string& tensorName) const { - if (bindingIndex == -1) - return nullptr; - return (isHost ? mManagedBuffers[bindingIndex]->hostBuffer.data() : mManagedBuffers[bindingIndex]->deviceBuffer.data()); + return mEngine->getTensorIOMode(tensorName.c_str()) == nvinfer1::TensorIOMode::kINPUT; } - void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream) + void memcpyBuffers(bool const copyInput, bool const deviceToHost, bool const async, cudaStream_t const& stream = 0) { - for (int i = 0; i < mEngine->getNbBindings(); i++) + for (auto const& n : mNames) { - void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data(); - const void* srcPtr = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data(); - const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes(); + void* dstPtr = deviceToHost ? mManagedBuffers[n.second]->hostBuffer.data() + : mManagedBuffers[n.second]->deviceBuffer.data(); + void const* srcPtr = deviceToHost ? mManagedBuffers[n.second]->deviceBuffer.data() + : mManagedBuffers[n.second]->hostBuffer.data(); + size_t const byteSize = mManagedBuffers[n.second]->hostBuffer.nbBytes(); const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice; - if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i))) + if ((copyInput && tenosrIsInput(n.first)) || (!copyInput && !tenosrIsInput(n.first))) { if (async) CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream)); @@ -468,9 +445,10 @@ class BufferManager } std::shared_ptr mEngine; //!< The pointer to the engine - int mBatchSize = 0; //!< The batch size for legacy networks, 0 otherwise. + int mBatchSize; //!< The batch size for legacy networks, 0 otherwise. std::vector> mManagedBuffers; //!< The vector of pointers to managed buffers - std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution + std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution + std::unordered_map mNames; //!< The map of tensor name and index pairs }; } // namespace samplesCommon diff --git a/src/Detector/tensorrt_yolo/common/common.h b/src/Detector/tensorrt_yolo/common/common.h index 2270a2cd..538c6094 100644 --- a/src/Detector/tensorrt_yolo/common/common.h +++ b/src/Detector/tensorrt_yolo/common/common.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,22 +17,13 @@ #ifndef TENSORRT_COMMON_H #define TENSORRT_COMMON_H - -// For loadLibrary -#ifdef _MSC_VER -// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#undef NOMINMAX -#else -#include -#endif - #include "NvInfer.h" +#if !TRT_WINML #include "NvInferPlugin.h" +#endif #include "logger.h" +#include "safeCommon.h" +#include "timingCache.h" #include #include #include @@ -39,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +45,15 @@ #include #include -#include "safeCommon.h" +#ifdef _MSC_VER +// For loadLibrary +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. +#define NOMINMAX +#include +#undef NOMINMAX +#else +#include +#endif #ifdef _MSC_VER #define FN_NAME __FUNCTION__ @@ -82,7 +83,7 @@ if (!(condition)) \ { \ sample::gLogError << "Assertion failure: " << #condition << std::endl; \ - abort(); \ + exit(EXIT_FAILURE); \ } \ } while (0) @@ -96,7 +97,7 @@ OBJ_GUARD(T) makeObjGuard(T_* t) { CHECK(!(std::is_base_of::value || std::is_same::value)); - auto deleter = [](T* t) { t->destroy(); }; + auto deleter = [](T* t) { delete t; }; return std::unique_ptr{static_cast(t), deleter}; } @@ -113,21 +114,6 @@ constexpr long double operator"" _KiB(long double val) return val * (1 << 10); } -// These is necessary if we want to be able to write 1_GiB instead of 1.0_GiB. -// Since the return type is signed, -1_GiB will work as expected. -constexpr long long int operator"" _GiB(unsigned long long val) -{ - return val * (1 << 30); -} -constexpr long long int operator"" _MiB(unsigned long long val) -{ - return val * (1 << 20); -} -constexpr long long int operator"" _KiB(unsigned long long val) -{ - return val * (1 << 10); -} - struct SimpleProfiler : public nvinfer1::IProfiler { struct Record @@ -136,7 +122,7 @@ struct SimpleProfiler : public nvinfer1::IProfiler int count{0}; }; - virtual void reportLayerTime(const char* layerName, float ms) noexcept + void reportLayerTime(const char* layerName, float ms) noexcept override { mProfile[layerName].count++; mProfile[layerName].time += ms; @@ -183,7 +169,7 @@ struct SimpleProfiler : public nvinfer1::IProfiler auto old_precision = out.precision(); // Output header { - out << std::setw(maxLayerNameLength) << layerNameStr << " "; + out << std::setfill(' ') << std::setw(maxLayerNameLength) << layerNameStr << " "; out << std::setw(12) << "Runtime, " << "%" << " "; @@ -214,80 +200,12 @@ struct SimpleProfiler : public nvinfer1::IProfiler std::map mProfile; }; -//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. -//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. 
-inline std::string locateFile( - const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) -{ - const int MAX_DEPTH{10}; - bool found{false}; - std::string filepath; - - for (auto& dir : directories) - { - if (!dir.empty() && dir.back() != '/') - { -#ifdef _MSC_VER - filepath = dir + "\\" + filepathSuffix; -#else - filepath = dir + "/" + filepathSuffix; -#endif - } - else - { - filepath = dir + filepathSuffix; - } - - for (int i = 0; i < MAX_DEPTH && !found; i++) - { - const std::ifstream checkFile(filepath); - found = checkFile.is_open(); - if (found) - { - break; - } - - filepath = "../" + filepath; // Try again in parent dir - } - - if (found) - { - break; - } - - filepath.clear(); - } - - // Could not find the file - if (filepath.empty()) - { - const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), - [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); - std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; - - if (reportError) - { - std::cout << "&&&& FAILED" << std::endl; - exit(EXIT_FAILURE); - } - } - - return filepath; -} - -inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW) -{ - std::ifstream infile(fileName, std::ifstream::binary); - assert(infile.is_open() && "Attempting to read from a file that is not open."); - std::string magic, h, w, max; - infile >> magic >> h >> w >> max; - infile.seekg(1, infile.cur); - infile.read(reinterpret_cast(buffer), inH * inW); -} - namespace samplesCommon { - +using nvinfer1::utils::loadTimingCacheFile; +using nvinfer1::utils::buildTimingCacheFromFile; +using nvinfer1::utils::saveTimingCacheFile; +using nvinfer1::utils::updateTimingCacheFile; // Swaps endianness of an integral type. template ::value, int>::type = 0> inline T swapEndianness(const T& value) @@ -339,7 +257,7 @@ class TypedHostMemory : public HostMemory { mData = new ElemType[size]; }; - ~TypedHostMemory() noexcept + ~TypedHostMemory() noexcept override { delete[](ElemType*) mData; } @@ -360,7 +278,7 @@ inline void* safeCudaMalloc(size_t memSize) if (deviceMem == nullptr) { std::cerr << "Out of memory" << std::endl; - exit(1); + exit(EXIT_FAILURE); } return deviceMem; } @@ -375,25 +293,20 @@ struct InferDeleter template void operator()(T* obj) const { -#if (NV_TENSORRT_MAJOR < 8) - obj->destroy(); -#else delete obj; -#endif } }; template -using SampleUniquePtr = std::unique_ptr; +using SampleUniquePtr = std::unique_ptr; -static auto StreamDeleter = [](cudaStream_t* pStream) +static auto StreamDeleter = [](cudaStream_t* pStream) { + if (pStream) { - if (pStream) - { - cudaStreamDestroy(*pStream); - delete pStream; - } - }; + static_cast(cudaStreamDestroy(*pStream)); + delete pStream; + } +}; inline std::unique_ptr makeCudaStream() { @@ -531,7 +444,7 @@ inline float getMaxValue(const float* buffer, int64_t size) // // The default parameter values choosen arbitrarily. Range values should be choosen such that // we avoid underflow or overflow. Also range value should be non zero to avoid uniform zero scale tensor. -inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0f, float outRange = 4.0f) +inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0F, float outRange = 4.0F) { // Ensure that all layer inputs have a scale. 
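    // (The loop below backfills a symmetric range of [-inRange, inRange] on each
    // layer input that does not already have one; outRange plays the analogous
    // role for layer outputs.)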
for (int i = 0; i < network->getNbLayers(); i++) @@ -579,14 +492,15 @@ inline void setDummyInt8DynamicRanges(const nvinfer1::IBuilderConfig* c, nvinfer // Set dummy per-tensor dynamic range if Int8 mode is requested. if (c->getFlag(nvinfer1::BuilderFlag::kINT8)) { - sample::gLogWarning - << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy is not guaranteed." - << std::endl; + sample::gLogWarning << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy " + "is not guaranteed." + << std::endl; setAllDynamicRanges(n); } } -inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) +inline void enableDLA( + nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) { if (useDLACore >= 0) { @@ -627,18 +541,28 @@ inline uint32_t getElementSize(nvinfer1::DataType t) noexcept { switch (t) { - case nvinfer1::DataType::kINT32: return 4; + case nvinfer1::DataType::kINT64: return 8; + case nvinfer1::DataType::kINT32: case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kBF16: case nvinfer1::DataType::kHALF: return 2; case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: return 1; + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1; + case nvinfer1::DataType::kINT4: + ASSERT(false && "Element size is not implemented for sub-byte data-types"); } return 0; } -inline int64_t volume(const nvinfer1::Dims& d) +inline int64_t volume(nvinfer1::Dims const& dims, int32_t start, int32_t stop) { - return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + ASSERT(start >= 0); + ASSERT(start <= stop); + ASSERT(stop <= dims.nbDims); + ASSERT(std::all_of(dims.d + start, dims.d + stop, [](int32_t x) { return x >= 0; })); + return std::accumulate(dims.d + start, dims.d + stop, int64_t{1}, std::multiplies{}); } template @@ -698,7 +622,7 @@ void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const << ppm.w << " " << ppm.h << "\n" << ppm.max << "\n"; - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + auto round = [](float x) -> int { return int(std::floor(x + 0.5F)); }; const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); @@ -739,7 +663,7 @@ inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vec << "\n" << ppm.w << " " << ppm.h << "\n" << ppm.max << "\n"; - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + auto round = [](float x) -> int { return int(std::floor(x + 0.5F)); }; for (auto bbox : dets) { @@ -778,7 +702,7 @@ class TimerBase virtual void stop() {} float microseconds() const noexcept { - return mMs * 1000.f; + return mMs * 1000.F; } float milliseconds() const noexcept { @@ -786,15 +710,15 @@ class TimerBase } float seconds() const noexcept { - return mMs / 1000.f; + return mMs / 1000.F; } void reset() noexcept { - mMs = 0.f; + mMs = 0.F; } protected: - float mMs{0.0f}; + float mMs{0.0F}; }; class GpuTimer : public TimerBase @@ -811,14 +735,14 @@ class GpuTimer : public TimerBase CHECK(cudaEventDestroy(mStart)); CHECK(cudaEventDestroy(mStop)); } - void start() + void start() override { CHECK(cudaEventRecord(mStart, mStream)); } - void stop() + void stop() override { CHECK(cudaEventRecord(mStop, 
mStream)); - float ms{0.0f}; + float ms{0.0F}; CHECK(cudaEventSynchronize(mStop)); CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); mMs += ms; @@ -835,11 +759,11 @@ class CpuTimer : public TimerBase public: using clock_type = Clock; - void start() + void start() override { mStart = Clock::now(); } - void stop() + void stop() override { mStop = Clock::now(); mMs += std::chrono::duration{mStop - mStart}.count(); @@ -865,13 +789,7 @@ inline std::vector splitString(std::string str, char delimiter = ', return splitVect; } -// Return m rounded up to nearest multiple of n -inline int roundUp(int m, int n) -{ - return ((m + n - 1) / n) * n; -} - -inline int getC(const nvinfer1::Dims& d) +inline int getC(nvinfer1::Dims const& d) { return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; } @@ -886,54 +804,111 @@ inline int getW(const nvinfer1::Dims& d) return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; } -inline void loadLibrary(const std::string& path) +//! Platform-agnostic wrapper around dynamic libraries. +class DynamicLibrary { -#ifdef _MSC_VER - void* handle = LoadLibrary(path.c_str()); -#else - int32_t flags{RTLD_LAZY}; +public: + explicit DynamicLibrary(std::string const& name) + : mLibName{name} + { +#if defined(_WIN32) + mHandle = LoadLibraryA(name.c_str()); +#else // defined(_WIN32) + int32_t flags{RTLD_LAZY}; #if ENABLE_ASAN - // https://github.com/google/sanitizers/issues/89 - // asan doesn't handle module unloading correctly and there are no plans on doing - // so. In order to get proper stack traces, don't delete the shared library on - // close so that asan can resolve the symbols correctly. - flags |= RTLD_NODELETE; + // https://github.com/google/sanitizers/issues/89 + // asan doesn't handle module unloading correctly and there are no plans on doing + // so. In order to get proper stack traces, don't delete the shared library on + // close so that asan can resolve the symbols correctly. + flags |= RTLD_NODELETE; #endif // ENABLE_ASAN - void* handle = dlopen(path.c_str(), flags); + mHandle = dlopen(name.c_str(), flags); +#endif // defined(_WIN32) + + if (mHandle == nullptr) + { + std::string errorStr{}; +#if !defined(_WIN32) + errorStr = std::string{" due to "} + std::string{dlerror()}; #endif - if (handle == nullptr) + throw std::runtime_error("Unable to open library: " + name + errorStr); + } + } + + DynamicLibrary(DynamicLibrary const&) = delete; + DynamicLibrary(DynamicLibrary const&&) = delete; + + //! + //! Retrieve a function symbol from the loaded library. + //! + //! \return the loaded symbol on success + //! \throw std::invalid_argument if loading the symbol failed. + //! 
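//! Hypothetical example (library and symbol names are placeholders, not from this patch):
//!
//!     auto lib = samplesCommon::loadLibrary("libmy_trt_plugins.so");
//!     auto initPlugins = lib->symbolAddress<bool(void*, char const*)>("initLibNvInferPlugins");
//!     initPlugins(&sample::gLogger.getTRTLogger(), "");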
+ template + std::function symbolAddress(char const* name) { -#ifdef _MSC_VER - sample::gLogError << "Could not load plugin library: " << path << std::endl; + if (mHandle == nullptr) + { + throw std::runtime_error("Handle to library is nullptr."); + } + void* ret; +#if defined(_MSC_VER) + ret = static_cast(GetProcAddress(static_cast(mHandle), name)); #else - sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; + ret = dlsym(mHandle, name); #endif + if (ret == nullptr) + { + std::string const kERROR_MSG(mLibName + ": error loading symbol: " + std::string(name)); + throw std::invalid_argument(kERROR_MSG); + } + return reinterpret_cast(ret); } -} -inline int32_t getSMVersion() -{ - int32_t deviceIndex = 0; - CHECK(cudaGetDevice(&deviceIndex)); + ~DynamicLibrary() + { + try + { +#if defined(_WIN32) + ASSERT(static_cast(FreeLibrary(static_cast(mHandle)))); +#else + ASSERT(dlclose(mHandle) == 0); +#endif + } + catch (...) + { + sample::gLogError << "Unable to close library: " << mLibName << std::endl; + } + } - int32_t major, minor; - CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); - CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); +private: + std::string mLibName{}; //!< Name of the DynamicLibrary + void* mHandle{}; //!< Handle to the DynamicLibrary +}; - return ((major << 8) | minor); +inline std::unique_ptr loadLibrary(std::string const& path) +{ + // make_unique not available until C++14 - we still need to support C++11 builds. + return std::unique_ptr(new DynamicLibrary{path}); } -inline bool isSMSafe() +inline int32_t getMaxPersistentCacheSize() { - const int32_t smVersion = getSMVersion(); - return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || - smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; + int32_t deviceIndex{}; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t maxPersistentL2CacheSize{}; +#if CUDART_VERSION >= 11030 && !TRT_WINML + CHECK(cudaDeviceGetAttribute(&maxPersistentL2CacheSize, cudaDevAttrMaxPersistingL2CacheSize, deviceIndex)); +#endif + + return maxPersistentL2CacheSize; } inline bool isDataTypeSupported(nvinfer1::DataType dataType) { - auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); + auto builder = SampleUniquePtr(createBuilder()); if (!builder) { return false; @@ -947,7 +922,6 @@ inline bool isDataTypeSupported(nvinfer1::DataType dataType) return true; } - } // namespace samplesCommon inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) diff --git a/src/Detector/tensorrt_yolo/common/dumpTFWts.py b/src/Detector/tensorrt_yolo/common/dumpTFWts.py new file mode 100644 index 00000000..70770fbd --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/dumpTFWts.py @@ -0,0 +1,124 @@ +#!/usr/bin/python +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Script to dump TensorFlow weights in TRT v1 and v2 dump format. +# The V1 format is for TensorRT 4.0. The V2 format is for TensorRT 4.0 and later. + +import sys +import struct +import argparse + +try: + import tensorflow as tf + from tensorflow.python import pywrap_tensorflow +except ImportError as err: + sys.stderr.write("""Error: Failed to import module ({})""".format(err)) + sys.exit() + +parser = argparse.ArgumentParser(description="TensorFlow Weight Dumper") + +parser.add_argument( + "-m", + "--model", + required=True, + help="The checkpoint file basename, example basename(model.ckpt-766908.data-00000-of-00001) -> model.ckpt-766908", +) +parser.add_argument("-o", "--output", required=True, help="The weight file to dump all the weights to.") +parser.add_argument("-1", "--wtsv1", required=False, default=False, type=bool, help="Dump the weights in the wts v1.") + +opt = parser.parse_args() + +if opt.wtsv1: + print("Outputting the trained weights in TensorRT's wts v1 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [buffer size] ") +else: + print("Outputting the trained weights in TensorRT's wts v2 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [(buffer shape{e.g. (1, 2, 3)}] ") + +inputbase = opt.model +outputbase = opt.output + + +def float_to_hex(f): + return hex(struct.unpack(" +#include +#include + +namespace nvinfer1 +{ +namespace utils +{ +FileLock::FileLock(ILogger& logger, std::string const& fileName) + : mLogger(logger) + , mFileName(fileName) +{ + std::string lockFileName = mFileName + ".lock"; +#ifdef _MSC_VER + { + std::stringstream ss; + ss << "Trying to set exclusive file lock " << lockFileName << std::endl; + mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str()); + } + // MS docs said this is a blocking IO if "FILE_FLAG_OVERLAPPED" is not provided + mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL); + if (mHandle == INVALID_HANDLE_VALUE) + { + throw std::runtime_error("Failed to lock " + lockFileName + "!"); + } +#elif defined(__QNX__) + // We once enabled the file lock on QNX, lockf(F_TLOCK) return -1 and the reported error is + // The error generated was 89, which means that the function is not implemented. 
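    // (On POSIX, the branch below opens "<fileName>.lock" and takes a blocking
    // lockf(F_LOCK) on its descriptor; the destructor releases it via F_ULOCK.)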
diff --git a/src/Detector/tensorrt_yolo/common/fileLock.cpp b/src/Detector/tensorrt_yolo/common/fileLock.cpp
new file mode 100644
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/fileLock.cpp
@@ -0,0 +1,100 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fileLock.h"
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+namespace nvinfer1
+{
+namespace utils
+{
+FileLock::FileLock(ILogger& logger, std::string const& fileName)
+    : mLogger(logger)
+    , mFileName(fileName)
+{
+    std::string lockFileName = mFileName + ".lock";
+#ifdef _MSC_VER
+    {
+        std::stringstream ss;
+        ss << "Trying to set exclusive file lock " << lockFileName << std::endl;
+        mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
+    }
+    // MS docs say this is blocking IO if "FILE_FLAG_OVERLAPPED" is not provided
+    mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL);
+    if (mHandle == INVALID_HANDLE_VALUE)
+    {
+        throw std::runtime_error("Failed to lock " + lockFileName + "!");
+    }
+#elif defined(__QNX__)
+    // We once enabled the file lock on QNX, but lockf(F_TLOCK) returned -1 and the reported
+    // error was 89, which means the function is not implemented.
+#else
+    mHandle = fopen(lockFileName.c_str(), "wb+");
+    if (mHandle == nullptr)
+    {
+        throw std::runtime_error("Cannot open " + lockFileName + "!");
+    }
+    {
+        std::stringstream ss;
+        ss << "Trying to set exclusive file lock " << lockFileName << std::endl;
+        mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
+    }
+    mDescriptor = fileno(mHandle);
+    auto ret = lockf(mDescriptor, F_LOCK, 0);
+    if (ret != 0)
+    {
+        mDescriptor = -1;
+        fclose(mHandle);
+        throw std::runtime_error("Failed to lock " + lockFileName + "!");
+    }
+#endif
+}
+
+FileLock::~FileLock()
+{
+    std::string lockFileName = mFileName + ".lock";
+#ifdef _MSC_VER
+    if (mHandle != INVALID_HANDLE_VALUE)
+    {
+        CloseHandle(mHandle);
+    }
+#elif defined(__QNX__)
+    // We once enabled the file lock on QNX, but lockf(F_TLOCK) returned -1 and the reported
+    // error was 89, which means the function is not implemented.
+#else
+    if (mDescriptor != -1)
+    {
+        auto ret = lockf(mDescriptor, F_ULOCK, 0);
+        if (mHandle != nullptr)
+        {
+            fclose(mHandle);
+        }
+        if (ret != 0)
+        {
+            std::stringstream ss;
+            ss << "Failed to unlock " << lockFileName << ", please remove " << lockFileName << " manually!"
+               << std::endl;
+            mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
+        }
+    }
+#endif
+}
+} // namespace utils
+} // namespace nvinfer1
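fileLock.cpp above holds the per-platform implementation; the header follows. A minimal RAII sketch of the intended use (the cache filename is hypothetical):

    // Serialize cross-process updates to a shared file, e.g. a timing cache.
    {
        nvinfer1::utils::FileLock lock(sample::gLogger.getTRTLogger(), "model.timing.cache");
        // ... read, update, and rewrite model.timing.cache here ...
    } // the destructor releases the lock, even if the update throws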
diff --git a/src/Detector/tensorrt_yolo/common/fileLock.h b/src/Detector/tensorrt_yolo/common/fileLock.h
new file mode 100644
index 00000000..d0f64a5b
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/fileLock.h
@@ -0,0 +1,86 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_SAMPLES_COMMON_FILELOCK_H_
+#define TENSORRT_SAMPLES_COMMON_FILELOCK_H_
+#include "NvInfer.h"
+#ifdef _MSC_VER
+// Needed so that the max/min definitions in windows.h do not conflict with std::max/min.
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#else
+#include <stdio.h>  // fileno
+#include <unistd.h> // lockf
+#endif
+#include <string>
+
+namespace nvinfer1
+{
+namespace utils
+{
+//!
+//! \brief RAII object that locks the specified file.
+//!
+//! The FileLock class uses a lock file to signal that the
+//! underlying file is being used by a TensorRT tool or sample,
+//! so that artifacts like the TimingCache can be updated across
+//! processes without conflicts.
+//!
+class FileLock
+{
+public:
+    FileLock(nvinfer1::ILogger& logger, std::string const& fileName);
+    ~FileLock();
+    FileLock() = delete;                           // no default ctor
+    FileLock(FileLock const&) = delete;            // no copy ctor
+    FileLock& operator=(FileLock const&) = delete; // no copy assignment
+    FileLock(FileLock&&) = delete;                 // no move ctor
+    FileLock& operator=(FileLock&&) = delete;      // no move assignment
+
+private:
+    //!
+    //! The logger that emits any error messages that might show up.
+    //!
+    nvinfer1::ILogger& mLogger;
+
+    //!
+    //! The name of the file that the FileLock protects from concurrent
+    //! writes by multiple TensorRT processes.
+    //!
+    std::string const mFileName;
+
+#ifdef _MSC_VER
+    //!
+    //! The file handle on Windows for the file lock.
+    //!
+    HANDLE mHandle{};
+#else
+    //!
+    //! The file handle on Linux for the file lock.
+    //!
+    FILE* mHandle{};
+    //!
+    //! The file descriptor on Linux of the file lock.
+    //!
+    int32_t mDescriptor{-1};
+#endif
+}; // class FileLock
+} // namespace utils
+} // namespace nvinfer1
+
+#endif // TENSORRT_SAMPLES_COMMON_FILELOCK_H_
diff --git a/src/Detector/tensorrt_yolo/common/getOptions.cpp b/src/Detector/tensorrt_yolo/common/getOptions.cpp
new file mode 100644
index 00000000..19cd3281
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/getOptions.cpp
@@ -0,0 +1,248 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "getOptions.h"
+#include "logger.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <set>
+#include <string>
+
+namespace nvinfer1
+{
+namespace utility
+{
+
+//! Matching for TRTOptions is defined as follows:
+//!
+//! If A and B both have longName set, A matches B if and only if A.longName ==
+//! B.longName and (A.shortName == B.shortName if both have short name set).
+//!
+//! If A only has shortName set and B only has longName set, then A does not
+//! match B. It is assumed that when 2 TRTOptions are compared, one of them is
+//! the definition of a TRTOption in the input to getOptions. As such, if the
+//! definition only has shortName set, it will never be equal to a TRTOption
+//! that does not have shortName set (and same for longName).
+//!
+//! If A and B both have shortName set but B does not have longName set, A
+//! matches B if and only if A.shortName == B.shortName.
+//!
+//! If A has neither long nor short name set, A matches B if and only if B has
+//! neither long nor short name set.
+bool matches(const TRTOption& a, const TRTOption& b)
+{
+    if (!a.longName.empty() && !b.longName.empty())
+    {
+        if (a.shortName && b.shortName)
+        {
+            return (a.longName == b.longName) && (a.shortName == b.shortName);
+        }
+        return a.longName == b.longName;
+    }
+
+    // If only one of them is not set, this will return false anyway.
+    return a.shortName == b.shortName;
+}
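The matching rules in the comment above are easier to see with concrete values (illustrative only):

    using nvinfer1::utility::TRTOption;
    TRTOption def{'v', "verbose", false, "enable verbose output"};
    TRTOption byLong{0, "verbose", false, ""};  // matches(byLong, def): true, long names agree
    TRTOption byShort{'v', "", false, ""};      // matches(byShort, def): true, short names agree
    TRTOption other{'x', "other", false, ""};   // matches(other, def): false, neither name agrees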
+//! getTRTOptionIndex returns the index of a TRTOption in a vector of
+//! TRTOptions, -1 if not found.
+int getTRTOptionIndex(const std::vector<TRTOption>& options, const TRTOption& opt)
+{
+    for (size_t i = 0; i < options.size(); ++i)
+    {
+        if (matches(opt, options[i]))
+        {
+            return i;
+        }
+    }
+    return -1;
+}
+
+//! validateTRTOption will return a string containing an error message if the option
+//! contains non-alphanumeric characters, or if there are duplicate option names found.
+//! Otherwise, returns the empty string.
+std::string validateTRTOption(
+    const std::set<char>& seenShortNames, const std::set<std::string>& seenLongNames, const TRTOption& opt)
+{
+    if (opt.shortName != 0)
+    {
+        if (!std::isalnum(opt.shortName))
+        {
+            return "Short name '" + std::to_string(opt.shortName) + "' is non-alphanumeric";
+        }
+
+        if (seenShortNames.find(opt.shortName) != seenShortNames.end())
+        {
+            return "Short name '" + std::to_string(opt.shortName) + "' is a duplicate";
+        }
+    }
+
+    if (!opt.longName.empty())
+    {
+        for (const char& c : opt.longName)
+        {
+            if (!std::isalnum(c) && c != '-' && c != '_')
+            {
+                return "Long name '" + opt.longName + "' contains characters that are not '-', '_', or alphanumeric";
+            }
+        }
+
+        if (seenLongNames.find(opt.longName) != seenLongNames.end())
+        {
+            return "Long name '" + opt.longName + "' is a duplicate";
+        }
+    }
+    return "";
+}
+
+//! validateTRTOptions will return a string containing an error message if any
+//! options contain non-alphanumeric characters, or if there are duplicate option
+//! names found. Otherwise, returns the empty string.
+std::string validateTRTOptions(const std::vector<TRTOption>& options)
+{
+    std::set<char> seenShortNames;
+    std::set<std::string> seenLongNames;
+    for (size_t i = 0; i < options.size(); ++i)
+    {
+        const std::string errMsg = validateTRTOption(seenShortNames, seenLongNames, options[i]);
+        if (!errMsg.empty())
+        {
+            return "Error '" + errMsg + "' at TRTOption " + std::to_string(i);
+        }
+
+        seenShortNames.insert(options[i].shortName);
+        seenLongNames.insert(options[i].longName);
+    }
+    return "";
+}
+
+//! parseArgs parses an argument list and returns a TRTParsedArgs with the
+//! fields set accordingly. Assumes that options is validated.
+//! ErrMsg will be set if:
+//!     - an argument is null
+//!     - an argument is empty
+//!     - an argument is only hyphens with no option name (i.e. "-" or "--")
+//!     - a short argument has more than 1 character
+//!     - the last argument in the list requires a value
+TRTParsedArgs parseArgs(int argc, const char* const* argv, const std::vector<TRTOption>& options)
+{
+    TRTParsedArgs parsedArgs;
+    parsedArgs.values.resize(options.size());
+
+    for (int i = 1; i < argc; ++i) // index of current command-line argument
+    {
+        if (argv[i] == nullptr)
+        {
+            return TRTParsedArgs{"Null argument at index " + std::to_string(i)};
+        }
+
+        const std::string argStr(argv[i]);
+        if (argStr.empty())
+        {
+            return TRTParsedArgs{"Empty argument at index " + std::to_string(i)};
+        }
+
+        // No starting hyphen means it is a positional argument
+        if (argStr[0] != '-')
+        {
+            parsedArgs.positionalArgs.push_back(argStr);
+            continue;
+        }
+
+        if (argStr == "-" || argStr == "--")
+        {
+            return TRTParsedArgs{"Argument does not specify an option at index " + std::to_string(i)};
+        }
+
+        // If only 1 hyphen, char after is the flag.
+        TRTOption opt{' ', "", false, ""};
+        std::string value;
+        if (argStr[1] != '-')
+        {
+            // Must only have 1 char after the hyphen
+            if (argStr.size() > 2)
+            {
+                return TRTParsedArgs{"Short arg contains more than 1 character at index " + std::to_string(i)};
+            }
+            opt.shortName = argStr[1];
+        }
+        else
+        {
+            opt.longName = argStr.substr(2);
+
+            // We need to support --foo=bar syntax, so look for '='
+            const size_t eqIndex = opt.longName.find('=');
+            if (eqIndex < opt.longName.size())
+            {
+                value = opt.longName.substr(eqIndex + 1);
+                opt.longName = opt.longName.substr(0, eqIndex);
+            }
+        }
+
+        const int idx = getTRTOptionIndex(options, opt);
+        if (idx < 0)
+        {
+            continue;
+        }
+
+        if (options[idx].valueRequired)
+        {
+            if (!value.empty())
+            {
+                parsedArgs.values[idx].second.push_back(value);
+                parsedArgs.values[idx].first = parsedArgs.values[idx].second.size();
+                continue;
+            }
+
+            if (i + 1 >= argc)
+            {
+                return TRTParsedArgs{"Last argument requires value, but none given"};
+            }
+
+            const std::string nextArg(argv[i + 1]);
+            if (nextArg.size() >= 1 && nextArg[0] == '-')
+            {
+                sample::gLogWarning << "Warning: Using '" << nextArg << "' as a value for '" << argStr
+                                    << "'. Should this be its own flag?" << std::endl;
+            }
+
+            parsedArgs.values[idx].second.push_back(nextArg);
+            i += 1; // Next argument already consumed
+
+            parsedArgs.values[idx].first = parsedArgs.values[idx].second.size();
+        }
+        else
+        {
+            parsedArgs.values[idx].first += 1;
+        }
+    }
+    return parsedArgs;
+}
+
+TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector<TRTOption>& options)
+{
+    const std::string errMsg = validateTRTOptions(options);
+    if (!errMsg.empty())
+    {
+        return TRTParsedArgs{errMsg};
+    }
+    return parseArgs(argc, argv, options);
+}
+} // namespace utility
+} // namespace nvinfer1
diff --git a/src/Detector/tensorrt_yolo/common/getOptions.h b/src/Detector/tensorrt_yolo/common/getOptions.h
new file mode 100644
index 00000000..4bbf9e27
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/getOptions.h
@@ -0,0 +1,128 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_GET_OPTIONS_H
+#define TRT_GET_OPTIONS_H
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace nvinfer1
+{
+namespace utility
+{
+
+//! TRTOption defines a command line option. At least 1 of shortName and longName
+//! must be defined.
+//! If bool initialization is undefined behavior on your system, valueRequired
+//! must also be explicitly defined.
+//! helpText is optional.
+struct TRTOption
+{
+    char shortName;       //!< Option name in short (single hyphen) form (i.e. -a, -b)
+    std::string longName; //!< Option name in long (double hyphen) form (i.e. --foo, --bar)
+    bool valueRequired;   //!< True if a value is needed for an option (i.e. -N 4, --foo bar)
+    std::string helpText; //!< Text to show when printing out the command usage
+};
+
+//! TRTParsedArgs is returned by getOptions after it has parsed a command line
+//! argument list (argv).
+//!
+//! errMsg is a string containing an error message if any errors occurred. If it
+//! is empty, no errors occurred.
+//!
+//! values stores a vector of pairs for each option (ordered by order in the
+//! input). Each pair contains an int (the number of occurrences) and a vector
+//! of strings (a list of values). The user should know which of these to use,
+//! and which options required values. For non-value options, only occurrences is
+//! populated. For value-required options, occurrences == # of values. Values do
+//! not need to be unique.
+//!
+//! positionalArgs stores additional arguments that are passed in without an
+//! option (these must not start with a hyphen).
+struct TRTParsedArgs
+{
+    std::string errMsg;
+    std::vector<std::pair<int, std::vector<std::string>>> values;
+    std::vector<std::string> positionalArgs;
+};
+
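Putting TRTOption and TRTParsedArgs together, a caller's main() would look roughly like this (the option set and index are illustrative):

    // Sketch: parse -h/--help and -m/--model <path>.
    std::vector<nvinfer1::utility::TRTOption> options{
        {'h', "help", false, "show usage"},
        {'m', "model", true, "path to the ONNX model"}};
    nvinfer1::utility::TRTParsedArgs args = nvinfer1::utility::getOptions(argc, argv, options);
    if (!args.errMsg.empty())
    {
        sample::gLogError << args.errMsg << std::endl;
        return EXIT_FAILURE;
    }
    if (args.values[1].first > 0) // index 1 == "--model"; .first is the occurrence count
    {
        std::string const& modelPath = args.values[1].second.back();
        // ... use modelPath ...
    }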
+//! Parse the input arguments passed to main() and extract options as well as
+//! positional arguments.
+//!
+//! Options are supposed to be passed to main() with a preceding hyphen '-'.
+//!
+//! If there is a single preceding hyphen, there should be exactly 1 character
+//! after the hyphen, which is interpreted as the option.
+//!
+//! If there are 2 preceding hyphens, the entire argument (without the hyphens)
+//! is interpreted as the option.
+//!
+//! If the option requires a value, the next argument is used as the value.
+//!
+//! Positional arguments must not start with a hyphen.
+//!
+//! If an argument requires a value, the next argument is interpreted as the
+//! value, even if it is the form of a valid option (i.e. --foo --bar will store
+//! "--bar" as a value for option "foo" if "foo" requires a value).
+//! We also support --name=value syntax. In this case, 'value' would be used as
+//! the value, NOT the next argument.
+//!
+//! For options:
+//! { { 'a', "", false },
+//!   { 'b', "", false },
+//!   { 0, "cee", false },
+//!   { 'd', "", true },
+//!   { 'e', "", true },
+//!   { 'f', "foo", true } }
+//!
+//! ./main hello world -a -a --cee -d 12 -f 34
+//! and
+//! ./main hello world -a -a --cee -d 12 --foo 34
+//!
+//! will result in:
+//!
+//! TRTParsedArgs {
+//!      errMsg: "",
+//!      values: { { 2, {} },
+//!                { 0, {} },
+//!                { 1, {} },
+//!                { 1, {"12"} },
+//!                { 0, {} },
+//!                { 1, {"34"} } }
+//!      positionalArgs: {"hello", "world"},
+//! }
+//!
+//! Non-POSIX behavior:
+//!     - Does not support "-abcde" as a shorthand for "-a -b -c -d -e". Each
+//!       option must have its own hyphen prefix.
+//!     - Does not support -e12 as a shorthand for "-e 12". Values MUST be
+//!       whitespace-separated from the option it is for.
+//!
+//! @param[in] argc The number of arguments passed to main (including the
+//!                 file name, which is disregarded)
+//! @param[in] argv The arguments passed to main (including the file name,
+//!                 which is disregarded)
+//! @param[in] options List of TRTOptions to parse
+//! @return TRTParsedArgs. See TRTParsedArgs documentation for descriptions of
+//!         the fields.
+TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector<TRTOption>& options);
+} // namespace utility
+} // namespace nvinfer1
+
+#endif // TRT_GET_OPTIONS_H
diff --git a/src/Detector/tensorrt_yolo/common/getopt.c b/src/Detector/tensorrt_yolo/common/getopt.c
new file mode 100644
index 00000000..c1da08b5
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/getopt.c
@@ -0,0 +1,568 @@
+/* $OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $ */
+/* $NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $ */
+
+/*
+ * Copyright (c) 2002 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F39502-99-1-0512.
+ */
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include "getoptWin.h" +#include +#include +#include +#include +#include +#include + +#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ + +#ifdef REPLACE_GETOPT +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ +#undef optreset /* see getopt.h */ +#define optreset __mingw_optreset +int optreset; /* reset getopt */ +char* optarg; /* argument associated with option */ +#endif + +#define PRINT_ERROR ((opterr) && (*options != ':')) + +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ + +/* return values */ +#define BADCH (int) '?' +#define BADARG ((*options == ':') ? (int) ':' : (int) '?') +#define INORDER (int) 1 + +#ifndef __CYGWIN__ +#define __progname __argv[0] +#else +extern char __declspec(dllimport) * __progname; +#endif + +#ifdef __CYGWIN__ +static char EMSG[] = ""; +#else +#define EMSG "" +#endif + +static int getopt_internal(int, char* const*, char const*, const struct option*, int*, int); +static int parse_long_options(char* const*, char const*, const struct option*, int*, int); +static int gcd(int, int); +static void permute_args(int, int, int, char* const*); + +static char* place = EMSG; /* option letter processing */ + +/* XXX: set optreset to 1 rather than these two */ +static int nonopt_start = -1; /* first non option argument (for permute) */ +static int nonopt_end = -1; /* first option after non options (for permute) */ + +/* Error messages */ +static char const recargchar[] = "option requires an argument -- %c"; +static char const recargstring[] = "option requires an argument -- %s"; +static char const ambig[] = "ambiguous option -- %.*s"; +static char const noarg[] = "option doesn't take an argument -- %.*s"; +static char const illoptchar[] = "unknown option -- %c"; +static char const illoptstring[] = "unknown option -- %s"; + +static void _vwarnx(char const* fmt, va_list ap) +{ + (void) fprintf(stderr, "%s: ", __progname); + if (fmt != NULL) + (void) vfprintf(stderr, fmt, ap); + (void) fprintf(stderr, "\n"); +} + +static void warnx(char const* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + _vwarnx(fmt, ap); + va_end(ap); +} + +/* + * Compute the greatest common divisor of a and b. + */ +static int gcd(int a, int b) +{ + int c; + + c = a % b; + while (c != 0) + { + a = b; + b = c; + c = a % b; + } + + return (b); +} + +/* + * Exchange the block from nonopt_start to nonopt_end with the block + * from nonopt_end to opt_end (keeping the same order of arguments + * in each block). 
+ */ +static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char* const* nargv) +{ + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char* swap; + + /* + * compute lengths of blocks and number and size of cycles + */ + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) + { + cstart = panonopt_end + i; + pos = cstart; + for (j = 0; j < cyclelen; j++) + { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char**) nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char**) nargv)[cstart] = swap; + } + } +} + +/* + * parse_long_options -- + * Parse long options in argc/argv argument vector. + * Returns -1 if short_too is set and the option does not match long_options. + */ +static int parse_long_options( + char* const* nargv, char const* options, const struct option* long_options, int* idx, int short_too) +{ + char *current_argv, *has_equal; + size_t current_argv_len; + int i, ambiguous, match; + +#define IDENTICAL_INTERPRETATION(_x, _y) \ + (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag \ + && long_options[(_x)].val == long_options[(_y)].val) + + current_argv = place; + match = -1; + ambiguous = 0; + + optind++; + + if ((has_equal = strchr(current_argv, '=')) != NULL) + { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } + else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) + { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) + { + /* exact match */ + match = i; + ambiguous = 0; + break; + } + /* + * If this is a known short option, don't allow + * a partial match of a single character. + */ + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else if (!IDENTICAL_INTERPRETATION(i, match)) + ambiguous = 1; + } + if (ambiguous) + { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int) current_argv_len, current_argv); + optopt = 0; + return (BADCH); + } + if (match != -1) + { /* option found */ + if (long_options[match].has_arg == no_argument && has_equal) + { + if (PRINT_ERROR) + warnx(noarg, (int) current_argv_len, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return (BADARG); + } + if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument) + { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == required_argument) + { + /* + * optional argument doesn't use next nargv + */ + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) && (optarg == NULL)) + { + /* + * Missing argument; leading ':' indicates no error + * should be generated. 
+ */ + if (PRINT_ERROR) + warnx(recargstring, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return (BADARG); + } + } + else + { /* unknown option */ + if (short_too) + { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return (BADCH); + } + if (idx) + *idx = match; + if (long_options[match].flag) + { + *long_options[match].flag = long_options[match].val; + return (0); + } + else + return (long_options[match].val); +#undef IDENTICAL_INTERPRETATION +} + +/* + * getopt_internal -- + * Parse argc/argv argument vector. Called by user level routines. + */ +static int getopt_internal( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx, int flags) +{ + char const* oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; + + if (options == NULL) + return (-1); + + /* + * XXX Some GNU programs (like cvs) set optind to 0 instead of + * XXX using optreset. Work around this braindamage. + */ + if (optind == 0) + optind = optreset = 1; + + /* + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + * + * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or + * optreset != 0 for GNU compatibility. + */ + if (posixly_correct == -1 || optreset != 0) + posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); + if (*options == '-') + flags |= FLAG_ALLARGS; + else if (posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + if (*options == '+' || *options == '-') + options++; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; +start: + if (optreset || !*place) + { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) + { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) + { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + else if (nonopt_start != -1) + { + /* + * If we skipped non-options, set optind + * to the first of them. + */ + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + if (*(place = nargv[optind]) != '-' || (place[1] == '\0' && strchr(options, '-') == NULL)) + { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) + { + /* + * GNU extension: + * return non-option as argument to option 1 + */ + optarg = nargv[optind++]; + return (INORDER); + } + if (!(flags & FLAG_PERMUTE)) + { + /* + * If no permutation wanted, stop parsing + * at first non-option. + */ + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + nonopt_start = optind - (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* + * If we have "-" do nothing, if "--" we are done. + */ + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') + { + optind++; + place = EMSG; + /* + * We found an option (--), so if we skipped + * non-options, we have to permute. 
+ */ + if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ + if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY))) + { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, idx, short_too); + if (optchar != -1) + { + place = EMSG; + return (optchar); + } + } + + if ((optchar = (int) *place++) == (int) ':' || (optchar == (int) '-' && *place != '\0') + || (oli = strchr(options, optchar)) == NULL) + { + /* + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). + */ + if (optchar == (int) '-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return (BADCH); + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') + { + /* -W long-option */ + if (*place) /* no space */ + /* NOTHING */; + else if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, idx, 0); + place = EMSG; + return (optchar); + } + if (*++oli != ':') + { /* doesn't take argument */ + if (!*place) + ++optind; + } + else + { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') + { /* arg not optional */ + if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return (optchar); +} + +#ifdef REPLACE_GETOPT +/* + * getopt -- + * Parse argc/argv argument vector. + * + * [eventually this will replace the BSD getopt] + */ +int getopt(int nargc, char* const* nargv, char const* options) +{ + + /* + * We don't pass FLAG_PERMUTE to getopt_internal() since + * the BSD getopt(3) (unlike GNU) has never done this. + * + * Furthermore, since many privileged programs call getopt() + * before dropping privileges it makes sense to keep things + * as simple (and bug-free) as possible. + */ + return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); +} +#endif /* REPLACE_GETOPT */ + +/* + * getopt_long -- + * Parse argc/argv argument vector. + */ +int getopt_long(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE)); +} + +/* + * getopt_long_only -- + * Parse argc/argv argument vector. 
+ */ +int getopt_long_only(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE | FLAG_LONGONLY)); +} diff --git a/src/Detector/tensorrt_yolo/common/getoptWin.h b/src/Detector/tensorrt_yolo/common/getoptWin.h new file mode 100644 index 00000000..a1dc6ffa --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/getoptWin.h @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __GETOPT_H__ +/** + * DISCLAIMER + * This file has no copyright assigned and is placed in the Public Domain. + * This file is a part of the w64 mingw-runtime package. + * + * The w64 mingw-runtime package and its code is distributed in the hope that it + * will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR + * IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to + * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#define __GETOPT_H__ + +/* All the headers include this file. */ +#include + +#if defined(WINGETOPT_SHARED_LIB) +#if defined(BUILDING_WINGETOPT_DLL) +#define WINGETOPT_API __declspec(dllexport) +#else +#define WINGETOPT_API __declspec(dllimport) +#endif +#else +#define WINGETOPT_API +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + + WINGETOPT_API extern int optind; /* index of first non-option in argv */ + WINGETOPT_API extern int optopt; /* single option character, as parsed */ + WINGETOPT_API extern int opterr; /* flag to enable built-in diagnostics... */ + /* (user may set to zero, to suppress) */ + + WINGETOPT_API extern char* optarg; /* pointer to argument of current option */ + + extern int getopt(int nargc, char* const* nargv, char const* options); + +#ifdef _BSD_SOURCE +/* + * BSD adds the non-standard `optreset' feature, for reinitialisation + * of `getopt' parsing. We support this feature, for applications which + * proclaim their BSD heritage, before including this header; however, + * to maintain portability, developers are advised to avoid it. + */ +#define optreset __mingw_optreset + extern int optreset; +#endif +#ifdef __cplusplus +} +#endif +/* + * POSIX requires the `getopt' API to be specified in `unistd.h'; + * thus, `unistd.h' includes this header. However, we do not want + * to expose the `getopt_long' or `getopt_long_only' APIs, when + * included in this manner. Thus, close the standard __GETOPT_H__ + * declarations block, and open an additional __GETOPT_LONG_H__ + * specific block, only when *not* __UNISTD_H_SOURCED__, in which + * to declare the extended API. + */ +#endif /* !defined(__GETOPT_H__) */ + +#if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) +#define __GETOPT_LONG_H__ + +#ifdef __cplusplus +extern "C" +{ +#endif + + struct option /* specification for a long form option... 
*/ + { + char const* name; /* option name, without leading hyphens */ + int has_arg; /* does it take an argument? */ + int* flag; /* where to save its status, or NULL */ + int val; /* its associated status value */ + }; + + enum /* permitted values for its `has_arg' field... */ + { + no_argument = 0, /* option never takes an argument */ + required_argument, /* option always requires an argument */ + optional_argument /* option may take an argument */ + }; + + extern int getopt_long( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx); + extern int getopt_long_only( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx); +/* + * Previous MinGW implementation had... + */ +#ifndef HAVE_DECL_GETOPT +/* + * ...for the long form API only; keep this for compatibility. + */ +#define HAVE_DECL_GETOPT 1 +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */ diff --git a/src/Detector/tensorrt_yolo/common/half.h b/src/Detector/tensorrt_yolo/common/half.h index 0755c316..b997e7db 100644 --- a/src/Detector/tensorrt_yolo/common/half.h +++ b/src/Detector/tensorrt_yolo/common/half.h @@ -16,13 +16,14 @@ // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -1522,14 +1523,14 @@ class half /// \return incremented half value half& operator++() { - return *this += 1.0f; + return *this += 1.0F; } /// Prefix decrement. /// \return decremented half value half& operator--() { - return *this -= 1.0f; + return *this -= 1.0F; } /// Postfix increment. diff --git a/src/Detector/tensorrt_yolo/common/logger.cpp b/src/Detector/tensorrt_yolo/common/logger.cpp index 03c64398..909ec0bb 100644 --- a/src/Detector/tensorrt_yolo/common/logger.cpp +++ b/src/Detector/tensorrt_yolo/common/logger.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +18,7 @@ #include "logger.h" #include "ErrorRecorder.h" #include "logging.h" - +using namespace nvinfer1; SampleErrorRecorder gRecorder; namespace sample { diff --git a/src/Detector/tensorrt_yolo/common/logger.h b/src/Detector/tensorrt_yolo/common/logger.h index 3069e8e9..8205e457 100644 --- a/src/Detector/tensorrt_yolo/common/logger.h +++ b/src/Detector/tensorrt_yolo/common/logger.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/src/Detector/tensorrt_yolo/common/logging.h b/src/Detector/tensorrt_yolo/common/logging.h index 78732c10..69273a5e 100644 --- a/src/Detector/tensorrt_yolo/common/logging.h +++ b/src/Detector/tensorrt_yolo/common/logging.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +18,7 @@ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H -#include "NvInferRuntimeCommon.h" +#include "NvInferRuntime.h" #include "sampleOptions.h" #include #include @@ -162,7 +163,7 @@ class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream } LogStreamConsumer(const LogStreamConsumer& other) = delete; LogStreamConsumer() = delete; - ~LogStreamConsumer() = default; + ~LogStreamConsumer() override = default; LogStreamConsumer& operator=(const LogStreamConsumer&) = delete; LogStreamConsumer& operator=(LogStreamConsumer&&) = delete; @@ -291,7 +292,7 @@ class Logger : public nvinfer1::ILogger }; //! - //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, @@ -353,7 +354,7 @@ class Logger : public nvinfer1::ILogger //! //! \brief Define a test for logging //! - //! \param[in] name The name of the test. This should be a string starting with + //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! 
the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" @@ -379,7 +380,8 @@ class Logger : public nvinfer1::ILogger static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv) { // Append TensorRT version as info - const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]"; + const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "] [b" + + std::to_string(NV_TENSORRT_BUILD) + "]"; auto cmdline = genCmdlineString(argc, argv); return defineTest(vname, cmdline); } diff --git a/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h b/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h index c92a1420..67ee6c71 100644 --- a/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h +++ b/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -35,15 +36,13 @@ * */ -using namespace std; - class ParserOnnxConfig : public nvonnxparser::IOnnxConfig { protected: - string mModelFilename{}; - string mTextFilename{}; - string mFullTextFilename{}; + std::string mModelFilename{}; + std::string mTextFilename{}; + std::string mFullTextFilename{}; nvinfer1::DataType mModelDtype; nvonnxparser::IOnnxConfig::Verbosity mVerbosity; bool mPrintLayercInfo; @@ -62,8 +61,7 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig #endif } -protected: - ~ParserOnnxConfig() + ~ParserOnnxConfig() override { #ifdef ONNX_DEBUG if (isDebug()) @@ -74,62 +72,62 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig } public: - virtual void setModelDtype(const nvinfer1::DataType modelDtype) noexcept + void setModelDtype(const nvinfer1::DataType modelDtype) noexcept override { mModelDtype = modelDtype; } - virtual nvinfer1::DataType getModelDtype() const noexcept + nvinfer1::DataType getModelDtype() const noexcept override { return mModelDtype; } - virtual const char* getModelFileName() const noexcept + const char* getModelFileName() const noexcept override { return mModelFilename.c_str(); } - virtual void setModelFileName(const char* onnxFilename) noexcept + void setModelFileName(const char* onnxFilename) noexcept override { - mModelFilename = string(onnxFilename); + mModelFilename = std::string(onnxFilename); } - virtual nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept + nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept override { return mVerbosity; } - virtual void addVerbosity() noexcept + void addVerbosity() noexcept override { ++mVerbosity; } - virtual void reduceVerbosity() noexcept + void reduceVerbosity() noexcept override { --mVerbosity; } - virtual void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept + void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept override { mVerbosity = verbosity; } - virtual const char* getTextFileName() const noexcept + const char* 
getTextFileName() const noexcept override
     {
         return mTextFilename.c_str();
     }
 
-    virtual void setTextFileName(const char* textFilename) noexcept
+    void setTextFileName(const char* textFilename) noexcept override
     {
-        mTextFilename = string(textFilename);
+        mTextFilename = std::string(textFilename);
     }
 
-    virtual const char* getFullTextFileName() const noexcept
+    const char* getFullTextFileName() const noexcept override
     {
         return mFullTextFilename.c_str();
     }
 
-    virtual void setFullTextFileName(const char* fullTextFilename) noexcept
+    void setFullTextFileName(const char* fullTextFilename) noexcept override
     {
-        mFullTextFilename = string(fullTextFilename);
+        mFullTextFilename = std::string(fullTextFilename);
     }
 
-    virtual bool getPrintLayerInfo() const noexcept
+    bool getPrintLayerInfo() const noexcept override
     {
         return mPrintLayercInfo;
     }
 
-    virtual void setPrintLayerInfo(bool src) noexcept
+    void setPrintLayerInfo(bool src) noexcept override
     {
         mPrintLayercInfo = src;
     } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo()
 
@@ -142,12 +140,6 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig
         return false;
 #endif
     }
-
-    virtual void destroy() noexcept
-    {
-        delete this;
-    }
-
 }; // class ParserOnnxConfig
 
 #endif
diff --git a/src/Detector/tensorrt_yolo/common/safeCommon.h b/src/Detector/tensorrt_yolo/common/safeCommon.h
index 3d84b095..f10aad18 100644
--- a/src/Detector/tensorrt_yolo/common/safeCommon.h
+++ b/src/Detector/tensorrt_yolo/common/safeCommon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,13 +18,32 @@
 #ifndef TENSORRT_SAFE_COMMON_H
 #define TENSORRT_SAFE_COMMON_H
 
-#include "NvInferRuntimeCommon.h"
+#include "cuda_runtime.h"
+#include "sampleEntrypoints.h"
+#include <algorithm>
 #include <cstdlib>
+#include <fstream>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <string>
 #include <vector>
+
+// For safeLoadLibrary
+#ifdef _MSC_VER
+// Needed so that the max/min definitions in windows.h do not conflict with std::max/min.
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#else
+#include <dlfcn.h>
+#endif
+#if IS_QNX_SAFE
+#include
+#include <sys/procmgr.h>
+#endif // IS_QNX_SAFE
+
+#undef CHECK
 #define CHECK(status)                                        \
     do                                                       \
     {                                                        \
@@ -31,10 +51,92 @@
         if (ret != 0)                                        \
         {                                                    \
             std::cerr << "Cuda failure: " << ret << std::endl; \
-            abort();                                         \
+            exit(EXIT_FAILURE);                              \
         }                                                    \
     } while (0)
 
+#undef SAFE_ASSERT
+#define SAFE_ASSERT(condition)                                       \
+    do                                                               \
+    {                                                                \
+        if (!(condition))                                            \
+        {                                                            \
+            std::cerr << "Assertion failure: " << #condition << std::endl; \
+            exit(EXIT_FAILURE);                                      \
+        }                                                            \
+    } while (0)
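The redefined CHECK now exits instead of calling abort(), and SAFE_ASSERT is new; both are used throughout the code below. In short:

    // Sketch: the safe-common error macros in use.
    int32_t device{};
    CHECK(cudaGetDevice(&device)); // prints "Cuda failure: <code>" and exits on error
    SAFE_ASSERT(device >= 0 && "device index must be non-negative");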
+//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in.
+//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path.
+inline std::string locateFile(
+    const std::string& filepathSuffix, const std::vector<std::string>& directories, bool reportError = true)
+{
+    const int MAX_DEPTH{10};
+    bool found{false};
+    std::string filepath;
+
+    for (auto& dir : directories)
+    {
+        if (!dir.empty() && dir.back() != '/')
+        {
+#ifdef _MSC_VER
+            filepath = dir + "\\" + filepathSuffix;
+#else
+            filepath = dir + "/" + filepathSuffix;
+#endif
+        }
+        else
+        {
+            filepath = dir + filepathSuffix;
+        }
+
+        for (int i = 0; i < MAX_DEPTH && !found; i++)
+        {
+            const std::ifstream checkFile(filepath);
+            found = checkFile.is_open();
+            if (found)
+            {
+                break;
+            }
+
+            filepath = "../" + filepath; // Try again in parent dir
+        }
+
+        if (found)
+        {
+            break;
+        }
+
+        filepath.clear();
+    }
+
+    // Could not find the file
+    if (filepath.empty())
+    {
+        const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(),
+            [](const std::string& a, const std::string& b) { return a + "\n\t" + b; });
+        std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl;
+
+        if (reportError)
+        {
+            std::cout << "&&&& FAILED" << std::endl;
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    return filepath;
+}
+
+inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int32_t inH, int32_t inW)
+{
+    std::ifstream infile(fileName, std::ifstream::binary);
+    SAFE_ASSERT(infile.is_open() && "Attempting to read from a file that is not open.");
+    std::string magic, w, h, max;
+    infile >> magic >> w >> h >> max;
+    infile.seekg(1, infile.cur);
+    infile.read(reinterpret_cast<char*>(buffer), inH * inW);
+}
+
 namespace samplesCommon
 {
 template <typename T>
@@ -51,11 +153,17 @@ inline uint32_t elementSize(nvinfer1::DataType t)
 {
     switch (t)
     {
+    case nvinfer1::DataType::kINT64: return 8;
     case nvinfer1::DataType::kINT32:
     case nvinfer1::DataType::kFLOAT: return 4;
-    case nvinfer1::DataType::kHALF: return 2;
-    case nvinfer1::DataType::kINT8: return 1;
-    case nvinfer1::DataType::kBOOL: return 1;
+    case nvinfer1::DataType::kHALF:
+    case nvinfer1::DataType::kBF16: return 2;
+    case nvinfer1::DataType::kINT8:
+    case nvinfer1::DataType::kUINT8:
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kFP8: return 1;
+    case nvinfer1::DataType::kINT4:
+        SAFE_ASSERT(false && "Element size is not implemented for sub-byte data-types");
     }
     return 0;
 }
@@ -66,6 +174,205 @@ inline A divUp(A x, B n)
     return (x + n - 1) / n;
 }
 
+inline int64_t volume(nvinfer1::Dims const& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>{});
+}
+
+//! Return m rounded up to nearest multiple of n
+template <typename T1, typename T2>
+inline T1 roundUp(T1 m, T2 n)
+{
+    static_assert(std::is_integral<T1>::value && std::is_integral<T2>::value, "arguments must be integers");
+    static_assert(std::is_signed<T1>::value == std::is_signed<T2>::value, "mixed signedness not allowed");
+    static_assert(sizeof(T1) >= sizeof(T2), "first type must be at least as wide as second type");
+    return ((m + n - 1) / n) * n;
+}
+
+//! comps is the number of components in a vector. Ignored if vecDim < 0.
+inline int64_t volume(nvinfer1::Dims dims, int32_t vecDim, int32_t comps, int32_t batch) +{ + if (vecDim >= 0) + { + dims.d[vecDim] = roundUp(dims.d[vecDim], comps); + } + return samplesCommon::volume(dims) * std::max(batch, 1); +} + +inline int32_t getSMVersion() +{ +#if 0 + // Use default value for 4090 + int32_t major{8}; + int32_t minor{9}; +#else + int32_t major{}; + int32_t minor{}; + int32_t deviceIndex{}; + CHECK(cudaGetDevice(&deviceIndex)); + CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); + CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); +#endif + return ((major << 8) | minor); +} + +inline bool isSMSafe() +{ + const int32_t smVersion = getSMVersion(); + return smVersion == 0x0700 || smVersion == 0x0705 || smVersion == 0x0800 || smVersion == 0x0806 + || smVersion == 0x0807; +} + +inline int32_t calculateSoftmax(float* const prob, int32_t const numDigits) +{ + SAFE_ASSERT(prob != nullptr); + SAFE_ASSERT(numDigits == 10); + float sum{0.0F}; + std::transform(prob, prob + numDigits, prob, [&sum](float v) -> float { + sum += exp(v); + return exp(v); + }); + + SAFE_ASSERT(sum != 0.0F); + std::transform(prob, prob + numDigits, prob, [sum](float v) -> float { return v / sum; }); + int32_t idx = std::max_element(prob, prob + numDigits) - prob; + return idx; +} + +//! +//! \class TrtCudaGraphSafe +//! \brief Managed CUDA graph +//! +class TrtCudaGraphSafe +{ +public: + explicit TrtCudaGraphSafe() = default; + + TrtCudaGraphSafe(const TrtCudaGraphSafe&) = delete; + + TrtCudaGraphSafe& operator=(const TrtCudaGraphSafe&) = delete; + + TrtCudaGraphSafe(TrtCudaGraphSafe&&) = delete; + + TrtCudaGraphSafe& operator=(TrtCudaGraphSafe&&) = delete; + + ~TrtCudaGraphSafe() + { + if (mGraphExec) + { + cudaGraphExecDestroy(mGraphExec); + } + } + + void beginCapture(cudaStream_t& stream) + { + // cudaStreamCaptureModeGlobal is the only allowed mode in SAFE CUDA + CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + } + + bool launch(cudaStream_t& stream) + { + return cudaGraphLaunch(mGraphExec, stream) == cudaSuccess; + } + + void endCapture(cudaStream_t& stream) + { + CHECK(cudaStreamEndCapture(stream, &mGraph)); + CHECK(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); + CHECK(cudaGraphDestroy(mGraph)); + } + + void endCaptureOnError(cudaStream_t& stream) + { + // There are two possibilities why stream capture would fail: + // (1) stream is in cudaErrorStreamCaptureInvalidated state. + // (2) TRT reports a failure. + // In case (1), the returning mGraph should be nullptr. + // In case (2), the returning mGraph is not nullptr, but it should not be used. + const auto ret = cudaStreamEndCapture(stream, &mGraph); + if (ret == cudaErrorStreamCaptureInvalidated) + { + SAFE_ASSERT(mGraph == nullptr); + } + else + { + SAFE_ASSERT(ret == cudaSuccess); + SAFE_ASSERT(mGraph != nullptr); + CHECK(cudaGraphDestroy(mGraph)); + mGraph = nullptr; + } + // Clean up any CUDA error. + cudaGetLastError(); + sample::gLogError << "The CUDA graph capture on the stream has failed." 
<< std::endl;
+    }
+
+private:
+    cudaGraph_t mGraph{};
+    cudaGraphExec_t mGraphExec{};
+};
+
+inline void safeLoadLibrary(const std::string& path)
+{
+#ifdef _MSC_VER
+    void* handle = LoadLibraryA(path.c_str());
+#else
+    int32_t flags{RTLD_LAZY};
+    void* handle = dlopen(path.c_str(), flags);
+#endif
+    if (handle == nullptr)
+    {
+#ifdef _MSC_VER
+        sample::gLogError << "Could not load plugin library: " << path << std::endl;
+#else
+        sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl;
+#endif
+    }
+}
+
+inline std::vector<std::string> safeSplitString(std::string str, char delimiter = ',')
+{
+    std::vector<std::string> splitVect;
+    std::stringstream ss(str);
+    std::string substr;
+
+    while (ss.good())
+    {
+        getline(ss, substr, delimiter);
+        splitVect.emplace_back(std::move(substr));
+    }
+    return splitVect;
+}
+
 } // namespace samplesCommon
 
+namespace safetyCompliance
+{
+inline void initSafeCuda()
+{
+    // According to the CUDA initialization section of the NVIDIA CUDA SAFETY API REFERENCE FOR DRIVE OS,
+    // we need to do the following in order:
+    // 1. Initialize the calling thread with CUDA specific information (call any CUDA RT API identified as init)
+    // 2. Query/Configure and choose the desired CUDA device
+    // 3. CUDA context initialization (call cudaDeviceGetLimit or cuCtxCreate)
+    size_t stackSizeLimit = 0;
+    int32_t deviceIndex = 0;
+    CHECK(cudaGetDevice(&deviceIndex));
+    CHECK(cudaDeviceGetLimit(&stackSizeLimit, cudaLimitStackSize));
+#if IS_QNX_SAFE
+    CHECK(cudaSafeExSelectAPIMode(cudaSafeExAPIModeAsilB));
+#endif // IS_QNX_SAFE
+}
+
+inline void setPromgrAbility()
+{
+#if IS_QNX_SAFE
+    // Comply with DEEPLRN_RES_117 on QNX-safe by dropping the PROCMGR_AID_MEM_PHYS ability and locking out any
+    // further changes
+    procmgr_ability(
+        0, PROCMGR_ADN_NONROOT | PROCMGR_AOP_DENY | PROCMGR_AOP_LOCK | PROCMGR_AID_MEM_PHYS, PROCMGR_AID_EOL);
+#endif // IS_QNX_SAFE
+}
+
+} // namespace safetyCompliance
+
 #endif // TENSORRT_SAFE_COMMON_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleConfig.h b/src/Detector/tensorrt_yolo/common/sampleConfig.h
index 53a78331..801a268a 100644
--- a/src/Detector/tensorrt_yolo/common/sampleConfig.h
+++ b/src/Detector/tensorrt_yolo/common/sampleConfig.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -55,9 +56,9 @@ class SampleConfig : public nvonnxparser::IOnnxConfig bool mDebugBuilder{false}; InputDataFormat mInputDataFormat{InputDataFormat::kASCII}; uint64_t mTopK{0}; - float mFailurePercentage{-1.0f}; - float mTolerance{0.0f}; - float mAbsTolerance{1e-5f}; + float mFailurePercentage{-1.0F}; + float mTolerance{0.0F}; + float mAbsTolerance{1e-5F}; public: SampleConfig() @@ -70,8 +71,7 @@ class SampleConfig : public nvonnxparser::IOnnxConfig #endif } -protected: - ~SampleConfig() + ~SampleConfig() override { #ifdef ONNX_DEBUG if (isDebug()) @@ -82,12 +82,12 @@ class SampleConfig : public nvonnxparser::IOnnxConfig } public: - void setModelDtype(const nvinfer1::DataType mdt) noexcept + void setModelDtype(const nvinfer1::DataType mdt) noexcept override { mModelDtype = mdt; } - nvinfer1::DataType getModelDtype() const noexcept + nvinfer1::DataType getModelDtype() const noexcept override { return mModelDtype; } @@ -102,28 +102,28 @@ class SampleConfig : public nvonnxparser::IOnnxConfig mTF32 = enabled; } - const char* getModelFileName() const noexcept + const char* getModelFileName() const noexcept override { return mModelFilename.c_str(); } - void setModelFileName(const char* onnxFilename) noexcept + void setModelFileName(const char* onnxFilename) noexcept override { mModelFilename = std::string(onnxFilename); } - Verbosity getVerbosityLevel() const noexcept + Verbosity getVerbosityLevel() const noexcept override { return mVerbosity; } - void addVerbosity() noexcept + void addVerbosity() noexcept override { ++mVerbosity; } - void reduceVerbosity() noexcept + void reduceVerbosity() noexcept override { --mVerbosity; } - virtual void setVerbosityLevel(Verbosity v) noexcept + void setVerbosityLevel(Verbosity v) noexcept override { mVerbosity = v; } @@ -135,19 +135,19 @@ class SampleConfig : public nvonnxparser::IOnnxConfig { mEngineFilename = std::string(engineFilename); } - const char* getTextFileName() const noexcept + const char* getTextFileName() const noexcept override { return mTextFilename.c_str(); } - void setTextFileName(const char* textFilename) noexcept + void setTextFileName(const char* textFilename) noexcept override { mTextFilename = std::string(textFilename); } - const char* getFullTextFileName() const noexcept + const char* getFullTextFileName() const noexcept override { return mFullTextFilename.c_str(); } - void setFullTextFileName(const char* fullTextFilename) noexcept + void setFullTextFileName(const char* fullTextFilename) noexcept override { mFullTextFilename = std::string(fullTextFilename); } @@ -161,12 +161,12 @@ class SampleConfig : public nvonnxparser::IOnnxConfig return mLabel; } //!< get the Label - bool getPrintLayerInfo() const noexcept + bool getPrintLayerInfo() const noexcept override { return mPrintLayercInfo; } - void setPrintLayerInfo(bool b) noexcept + void setPrintLayerInfo(bool b) noexcept override { mPrintLayercInfo = b; } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo() @@ -312,7 +312,7 @@ class SampleConfig : public nvonnxparser::IOnnxConfig { return mTimingCacheFilename.c_str(); } - + void setTimingCacheFileName(const char* timingCacheFilename) noexcept { mTimingCacheFilename = std::string(timingCacheFilename); @@ -326,12 +326,6 @@ class 
SampleConfig : public nvonnxparser::IOnnxConfig return false; #endif } - - void destroy() noexcept - { - delete this; - } - }; // class SampleConfig #endif diff --git a/src/Detector/tensorrt_yolo/common/sampleDevice.cpp b/src/Detector/tensorrt_yolo/common/sampleDevice.cpp new file mode 100644 index 00000000..7964aeb5 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleDevice.cpp @@ -0,0 +1,133 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sampleDevice.h" + +#include + +namespace sample +{ + +void cudaCheck(cudaError_t ret, std::ostream& err) +{ + if (ret != cudaSuccess) + { + err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; + exit(EXIT_FAILURE); + } +} + +// Construct GPU UUID string in the same format as nvidia-smi does. +std::string getUuidString(cudaUUID_t uuid) +{ + constexpr int32_t kUUID_SIZE = sizeof(cudaUUID_t); + static_assert(kUUID_SIZE == 16, "Unexpected size for cudaUUID_t!"); + + std::ostringstream ss; + std::vector const splits = {0, 4, 6, 8, 10, kUUID_SIZE}; + + ss << "GPU" << std::hex << std::setfill('0'); + for (int32_t splitIdx = 0; splitIdx < static_cast(splits.size()) - 1; ++splitIdx) + { + ss << "-"; + for (int32_t byteIdx = splits[splitIdx]; byteIdx < splits[splitIdx + 1]; ++byteIdx) + { + ss << std::setw(2) << +static_cast(uuid.bytes[byteIdx]); + } + } + return ss.str(); +} + +void setCudaDevice(int32_t device, std::ostream& os) +{ +#if !TRT_WINML + os << "=== Device Information ===" << std::endl; + + // Get the number of visible GPUs. + int32_t nbDevices{-1}; + cudaCheck(cudaGetDeviceCount(&nbDevices)); + + if (nbDevices <= 0) + { + os << "Cannot find any available devices (GPUs)!" << std::endl; + exit(EXIT_FAILURE); + } + + // Print out the GPU name and PCIe bus ID of each GPU. + os << "Available Devices: " << std::endl; + cudaDeviceProp properties; + for (int32_t deviceIdx = 0; deviceIdx < nbDevices; ++deviceIdx) + { + cudaDeviceProp tempProperties; + cudaCheck(cudaGetDeviceProperties(&tempProperties, deviceIdx)); + + // clang-format off + os << " Device " << deviceIdx << ": \"" << tempProperties.name << "\" UUID: " + << getUuidString(tempProperties.uuid) << std::endl; + // clang-format on + + // Record the properties of the desired GPU. + if (deviceIdx == device) + { + properties = tempProperties; + } + } + + // Exit with error if the requested device ID does not exist. + if (device < 0 || device >= nbDevices) + { + os << "Cannot find device ID " << device << "!" << std::endl; + exit(EXIT_FAILURE); + } + + // Set to the corresponding GPU. + cudaCheck(cudaSetDevice(device)); + + // clang-format off + os << "Selected Device: " << properties.name << std::endl; + os << "Selected Device ID: " << device << std::endl; + os << "Selected Device UUID: " << getUuidString(properties.uuid) << std::endl; + os << "Compute Capability: " << properties.major << "." 
<< properties.minor << std::endl; + os << "SMs: " << properties.multiProcessorCount << std::endl; + os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; + os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; + os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" + << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl; + os << "Application Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; + os << "Application Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; + os << std::endl; + os << "Note: The application clock rates do not reflect the actual clock rates that the GPU is " + << "currently running at." << std::endl; + // clang-format on +#endif +} + +int32_t getCudaDriverVersion() +{ + int32_t version{-1}; + cudaCheck(cudaDriverGetVersion(&version)); + return version; +} + +int32_t getCudaRuntimeVersion() +{ + int32_t version{-1}; + cudaCheck(cudaRuntimeGetVersion(&version)); + return version; +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleDevice.h b/src/Detector/tensorrt_yolo/common/sampleDevice.h index 2053ac7c..986dccb4 100644 --- a/src/Detector/tensorrt_yolo/common/sampleDevice.h +++ b/src/Detector/tensorrt_yolo/common/sampleDevice.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -23,17 +24,13 @@ #include #include +#include "sampleUtils.h" + namespace sample { -inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) -{ - if (ret != cudaSuccess) - { - err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; - abort(); - } -} +//! Check if the CUDA return status shows any error. If so, exit the program immediately. +void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr); class TrtCudaEvent; @@ -238,16 +235,18 @@ class TrtCudaBuffer TrtCudaBuffer(TrtCudaBuffer&& rhs) { - reset(rhs.mPtr); + reset(rhs.mPtr, rhs.mSize); rhs.mPtr = nullptr; + rhs.mSize = 0; } TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs) { if (this != &rhs) { - reset(rhs.mPtr); + reset(rhs.mPtr, rhs.mSize); rhs.mPtr = nullptr; + rhs.mSize = 0; } return *this; } @@ -260,21 +259,24 @@ class TrtCudaBuffer TrtCudaBuffer(size_t size) { A()(&mPtr, size); + mSize = size; } void allocate(size_t size) { reset(); A()(&mPtr, size); + mSize = size; } - void reset(void* ptr = nullptr) + void reset(void* ptr = nullptr, size_t size = 0) { if (mPtr) { D()(mPtr); } mPtr = ptr; + mSize = size; } void* get() const @@ -282,8 +284,14 @@ class TrtCudaBuffer return mPtr; } + size_t getSize() const + { + return mSize; + } + private: void* mPtr{nullptr}; + size_t mSize{0}; }; struct DeviceAllocator @@ -383,39 +391,39 @@ class IMirroredBuffer }; // class IMirroredBuffer //! -//! Class to have a seperate memory buffer for discrete device and host allocations. +//! 
Class to have a separate memory buffer for discrete device and host allocations. //! class DiscreteMirroredBuffer : public IMirroredBuffer { public: - void allocate(size_t size) + void allocate(size_t size) override { mSize = size; mHostBuffer.allocate(size); mDeviceBuffer.allocate(size); } - void* getDeviceBuffer() const + void* getDeviceBuffer() const override { return mDeviceBuffer.get(); } - void* getHostBuffer() const + void* getHostBuffer() const override { return mHostBuffer.get(); } - void hostToDevice(TrtCudaStream& stream) + void hostToDevice(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get())); } - void deviceToHost(TrtCudaStream& stream) + void deviceToHost(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get())); } - size_t getSize() const + size_t getSize() const override { return mSize; } @@ -432,33 +440,33 @@ class DiscreteMirroredBuffer : public IMirroredBuffer class UnifiedMirroredBuffer : public IMirroredBuffer { public: - void allocate(size_t size) + void allocate(size_t size) override { mSize = size; mBuffer.allocate(size); } - void* getDeviceBuffer() const + void* getDeviceBuffer() const override { return mBuffer.get(); } - void* getHostBuffer() const + void* getHostBuffer() const override { return mBuffer.get(); } - void hostToDevice(TrtCudaStream& /*stream*/) + void hostToDevice(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } - void deviceToHost(TrtCudaStream& /*stream*/) + void deviceToHost(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } - size_t getSize() const + size_t getSize() const override { return mSize; } @@ -468,26 +476,70 @@ class UnifiedMirroredBuffer : public IMirroredBuffer TrtManagedBuffer mBuffer; }; // class UnifiedMirroredBuffer -inline void setCudaDevice(int device, std::ostream& os) +//! +//! Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is +//! not possible. +//! +class OutputAllocator : public nvinfer1::IOutputAllocator { - cudaCheck(cudaSetDevice(device)); - - cudaDeviceProp properties; - cudaCheck(cudaGetDeviceProperties(&properties, device)); - -// clang-format off - os << "=== Device Information ===" << std::endl; - os << "Selected Device: " << properties.name << std::endl; - os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl; - os << "SMs: " << properties.multiProcessorCount << std::endl; - os << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; - os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; - os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; - os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" - << " (ECC " << (properties.ECCEnabled != 0 ? 
"enabled" : "disabled") << ")" << std::endl; - os << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; - // clang-format on -} +public: + OutputAllocator(IMirroredBuffer* buffer) + : mBuffer(buffer) + { + } + + void* reallocateOutput( + char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override + { + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. + size = std::max(size, static_cast(1)); + if (size > mSize) + { + mBuffer->allocate(roundUp(size, alignment)); + mSize = size; + } + return mBuffer->getDeviceBuffer(); + } + + //! IMirroredBuffer does not implement Async allocation, hence this is just a wrap around + void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, + cudaStream_t /*stream*/) noexcept override + { + return reallocateOutput(tensorName, currentMemory, size, alignment); + } + + void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override + { + mFinalDims = dims; + } + + IMirroredBuffer* getBuffer() + { + return mBuffer.get(); + } + + nvinfer1::Dims getFinalDims() + { + return mFinalDims; + } + + ~OutputAllocator() override {} + +private: + std::unique_ptr mBuffer; + uint64_t mSize{}; + nvinfer1::Dims mFinalDims; +}; + +//! Set the GPU to run the inference on. +void setCudaDevice(int32_t device, std::ostream& os); + +//! Get the CUDA version of the current CUDA driver. +int32_t getCudaDriverVersion(); + +//! Get the CUDA version of the current CUDA runtime. +int32_t getCudaRuntimeVersion(); } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ new file mode 100644 index 00000000..8ada0526 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ @@ -0,0 +1,1688 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvOnnxParser.h" + +#include "ErrorRecorder.h" +#include "common.h" +#include "half.h" +#include "logger.h" +#include "sampleDevice.h" +#include "sampleEngines.h" +#include "sampleOptions.h" +#include "sampleUtils.h" + +using namespace nvinfer1; + +namespace sample +{ + +namespace +{ + +std::map readScalesFromCalibrationCache(std::string const& calibrationFile) +{ + std::map tensorScales; + std::ifstream cache{calibrationFile}; + if (!cache.is_open()) + { + sample::gLogError << "[TRT] Can not open provided calibration cache file" << std::endl; + return tensorScales; + } + std::string line; + while (std::getline(cache, line)) + { + auto colonPos = line.find_last_of(':'); + if (colonPos != std::string::npos) + { + // Scales should be stored in calibration cache as 32-bit floating numbers encoded as 32-bit integers + int32_t scalesAsInt = std::stoi(line.substr(colonPos + 2, 8), nullptr, 16); + auto const tensorName = line.substr(0, colonPos); + tensorScales[tensorName] = *reinterpret_cast(&scalesAsInt); + } + } + cache.close(); + return tensorScales; +} +} // namespace + +nvinfer1::ICudaEngine* LazilyDeserializedEngine::get() +{ + SMP_RETVAL_IF_FALSE( + !mIsSafe, "Safe mode is enabled, but trying to get standard engine!", nullptr, sample::gLogError); + + if (mEngine == nullptr) + { + SMP_RETVAL_IF_FALSE(getFileReader().isOpen() || !getBlob().empty(), "Engine is empty. Nothing to deserialize!", + nullptr, sample::gLogError); + + using time_point = std::chrono::time_point; + using duration = std::chrono::duration; + time_point const deserializeStartTime{std::chrono::high_resolution_clock::now()}; + + if (mLeanDLLPath.empty()) + { + mRuntime.reset(createRuntime()); + } + else + { + mParentRuntime.reset(createRuntime()); + ASSERT(mParentRuntime.get() != nullptr); + + mRuntime.reset(mParentRuntime->loadRuntime(mLeanDLLPath.c_str())); + } + ASSERT(mRuntime.get() != nullptr); + + if (mVersionCompatible) + { + // Application needs to opt into allowing deserialization of engines with embedded lean runtime. + mRuntime->setEngineHostCodeAllowed(true); + } + + if (!mTempdir.empty()) + { + mRuntime->setTemporaryDirectory(mTempdir.c_str()); + } + + mRuntime->setTempfileControlFlags(mTempfileControls); + + SMP_RETVAL_IF_FALSE(mRuntime != nullptr, "runtime creation failed", nullptr, sample::gLogError); + if (mDLACore != -1) + { + mRuntime->setDLACore(mDLACore); + } + mRuntime->setErrorRecorder(&gRecorder); +#if !TRT_WINML + for (auto const& pluginPath : mDynamicPlugins) + { + mRuntime->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } +#endif + + if (getFileReader().isOpen()) + { + mEngine.reset(mRuntime->deserializeCudaEngine(getFileReader())); + } + else + { + auto const& engineBlob = getBlob(); + mEngine.reset(mRuntime->deserializeCudaEngine(engineBlob.data, engineBlob.size)); + } + SMP_RETVAL_IF_FALSE(mEngine != nullptr, "Engine deserialization failed", nullptr, sample::gLogError); + + time_point const deserializeEndTime{std::chrono::high_resolution_clock::now()}; + sample::gLogInfo << "Engine deserialized in " << duration(deserializeEndTime - deserializeStartTime).count() + << " sec." 
+                         << std::endl;
+    }
+
+    return mEngine.get();
+}
+
+nvinfer1::ICudaEngine* LazilyDeserializedEngine::release()
+{
+    return mEngine.release();
+}
+
+void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector<IOFormat> const& inputFormats,
+    std::vector<IOFormat> const& outputFormats, std::string const& calibrationFile)
+{
+    auto const tensorScales = readScalesFromCalibrationCache(calibrationFile);
+    bool const broadcastInputFormats = broadcastIOFormats(inputFormats, network.getNbInputs());
+    for (int32_t i = 0, n = network.getNbInputs(); i < n; ++i)
+    {
+        int32_t formatIdx = broadcastInputFormats ? 0 : i;
+        if (!inputFormats.empty() && inputFormats[formatIdx].first == DataType::kINT8)
+        {
+            auto* input = network.getInput(i);
+            auto const calibScale = tensorScales.at(input->getName());
+            input->setDynamicRange(-127 * calibScale, 127 * calibScale);
+        }
+    }
+    bool const broadcastOutputFormats = broadcastIOFormats(outputFormats, network.getNbOutputs());
+    for (int32_t i = 0, n = network.getNbOutputs(); i < n; ++i)
+    {
+        int32_t formatIdx = broadcastOutputFormats ? 0 : i;
+        if (!outputFormats.empty() && outputFormats[formatIdx].first == DataType::kINT8)
+        {
+            auto* output = network.getOutput(i);
+            auto const calibScale = tensorScales.at(output->getName());
+            output->setDynamicRange(-127 * calibScale, 127 * calibScale);
+        }
+    }
+}
+
+//!
+//! \brief Generate a network definition for a given model
+//!
+//! \param[in] model Model options for this network
+//! \param[in,out] network Network storing the parsed results
+//! \param[in,out] err Error stream
+//! \param[out] vcPluginLibrariesUsed If not nullptr, will be populated with paths to VC plugin libraries required by
+//! the parsed network.
+//!
+//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid
+//! parser (the returned parser converts to false if tested)
+//!
+//! Constant input dimensions in the model must not be changed in the corresponding
+//! network definition, because its correctness may rely on the constants.
+//!
+//! \see Parser::operator bool()
+//!
+Parser modelToNetwork(ModelOptions const& model, BuildOptions const& build, nvinfer1::INetworkDefinition& network,
+    std::ostream& err, std::vector<std::string>* vcPluginLibrariesUsed)
+{
+    sample::gLogInfo << "Start parsing network model." << std::endl;
+    auto const tBegin = std::chrono::high_resolution_clock::now();
+
+    Parser parser;
+    switch (model.baseModel.format)
+    {
+    case ModelFormat::kONNX:
+    {
+        using namespace nvonnxparser;
+        parser.onnxParser.reset(createONNXParser(network));
+        ASSERT(parser.onnxParser != nullptr);
+#if !TRT_WINML
+        // kNATIVE_INSTANCENORM is ON by default in the parser and must be cleared to use the plugin implementation.
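+        // (build.pluginInstanceNorm is driven by the --pluginInstanceNorm command-line option.)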
+ if (build.pluginInstanceNorm) + { + parser.onnxParser->clearFlag(OnnxParserFlag::kNATIVE_INSTANCENORM); + } +#endif + if (!parser.onnxParser->parseFromFile( + model.baseModel.model.c_str(), static_cast(sample::gLogger.getReportableSeverity()))) + { + err << "Failed to parse onnx file" << std::endl; + parser.onnxParser.reset(); + } +#if !TRT_WINML + if (vcPluginLibrariesUsed && parser.onnxParser.get()) + { + int64_t nbPluginLibs; + char const* const* pluginLibArray = parser.onnxParser->getUsedVCPluginLibraries(nbPluginLibs); + if (nbPluginLibs >= 0) + { + vcPluginLibrariesUsed->reserve(nbPluginLibs); + for (int64_t i = 0; i < nbPluginLibs; ++i) + { + sample::gLogInfo << "Using VC plugin library " << pluginLibArray[i] << std::endl; + vcPluginLibrariesUsed->emplace_back(std::string{pluginLibArray[i]}); + } + } + else + { + sample::gLogWarning << "Failure to query VC plugin libraries required by parsed ONNX network" + << std::endl; + } + } +#endif + break; + } + case ModelFormat::kANY: break; + } + + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const parseTime = std::chrono::duration(tEnd - tBegin).count(); + + sample::gLogInfo << "Finished parsing network model. Parse time: " << parseTime << std::endl; + return parser; +} + +namespace +{ + +class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 +{ +public: + RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + nvinfer1::INetworkDefinition const& network, std::ostream& err); + + ~RndInt8Calibrator() override + { + for (auto& elem : mInputDeviceBuffers) + { + cudaCheck(cudaFree(elem.second), mErr); + } + } + + bool getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept override; + + int32_t getBatchSize() const noexcept override + { + return 1; + } + + const void* readCalibrationCache(size_t& length) noexcept override; + + void writeCalibrationCache(void const*, size_t) noexcept override {} + +private: + int32_t mBatches{}; + int32_t mCurrentBatch{}; + std::string mCacheFile; + std::map mInputDeviceBuffers; + std::vector mCalibrationCache; + std::ostream& mErr; +}; + +RndInt8Calibrator::RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + INetworkDefinition const& network, std::ostream& err) + : mBatches(batches) + , mCurrentBatch(0) + , mCacheFile(cacheFile) + , mErr(err) +{ + std::ifstream tryCache(cacheFile, std::ios::binary); + if (tryCache.good()) + { + return; + } + + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0F, 1.0F); + auto gen = [&generator, &distribution]() { return distribution(generator); }; + + for (int32_t i = 0; i < network.getNbInputs(); i++) + { + auto* input = network.getInput(i); + std::vector rnd_data(elemCount[i]); + std::generate_n(rnd_data.begin(), elemCount[i], gen); + + void* data; + cudaCheck(cudaMalloc(&data, elemCount[i] * sizeof(float)), mErr); + cudaCheck(cudaMemcpy(data, rnd_data.data(), elemCount[i] * sizeof(float), cudaMemcpyHostToDevice), mErr); + + mInputDeviceBuffers.insert(std::make_pair(input->getName(), data)); + } +} + +bool RndInt8Calibrator::getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept +{ + if (mCurrentBatch >= mBatches) + { + return false; + } + + for (int32_t i = 0; i < nbBindings; ++i) + { + bindings[i] = mInputDeviceBuffers[names[i]]; + } + + ++mCurrentBatch; + + return true; +} + +const void* RndInt8Calibrator::readCalibrationCache(size_t& length) noexcept +{ + mCalibrationCache.clear(); 
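+    // The cache file is re-read from disk on every call: TensorRT may invoke readCalibrationCache() more
+    // than once during a build, and the previous contents were discarded above.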
+ std::ifstream input(mCacheFile, std::ios::binary); + input >> std::noskipws; + if (input.good()) + { + std::copy( + std::istream_iterator(input), std::istream_iterator(), std::back_inserter(mCalibrationCache)); + } + + length = mCalibrationCache.size(); + return !mCalibrationCache.empty() ? mCalibrationCache.data() : nullptr; +} + +bool setTensorDynamicRange(INetworkDefinition const& network, float inRange = 2.0F, float outRange = 4.0F) +{ + // Ensure that all layer inputs have a dynamic range. + for (int32_t l = 0; l < network.getNbLayers(); l++) + { + auto* layer = network.getLayer(l); + for (int32_t i = 0; i < layer->getNbInputs(); i++) + { + ITensor* input{layer->getInput(i)}; + // Optional inputs are nullptr here and are from RNN layers. + if (input && !input->dynamicRangeIsSet()) + { + // Concat should propagate dynamic range from outputs to inputs to avoid + // Re-quantization during the concatenation + auto dynRange = (layer->getType() == LayerType::kCONCATENATION) ? outRange : inRange; + if (!input->setDynamicRange(-dynRange, dynRange)) + { + return false; + } + } + } + for (int32_t o = 0; o < layer->getNbOutputs(); o++) + { + ITensor* output{layer->getOutput(o)}; + // Optional outputs are nullptr here and are from RNN layers. + if (output && !output->dynamicRangeIsSet()) + { + // Pooling must have the same input and output dynamic range. + if (layer->getType() == LayerType::kPOOLING) + { + if (!output->setDynamicRange(-inRange, inRange)) + { + return false; + } + } + else + { + if (!output->setDynamicRange(-outRange, outRange)) + { + return false; + } + } + } + } + } + return true; +} + +bool isNonActivationType(nvinfer1::DataType const type) +{ + return type == nvinfer1::DataType::kINT32 || type == nvinfer1::DataType::kINT64 || type == nvinfer1::DataType::kBOOL + || type == nvinfer1::DataType::kUINT8; +} + +void setLayerPrecisions(INetworkDefinition& network, LayerPrecisions const& layerPrecisions) +{ + bool hasLayerPrecisionSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto exactMatch = layerPrecisions.find(layerName); + auto plausibleMatch = findPlausible(layerPrecisions, layerName); + if (exactMatch != layerPrecisions.end()) + { + sample::gLogInfo << "Set layer " << layerName << " to precision " << exactMatch->second << std::endl; + layer->setPrecision(exactMatch->second); + } + else if (plausibleMatch != layerPrecisions.end()) + { + if (isNonActivationType(layer->getPrecision())) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because the " + << " default layer precision is of non-activation type." << std::endl; + continue; + } + if (layer->getType() == nvinfer1::LayerType::kCONSTANT + && (isNonActivationType(static_cast(layer)->getWeights().type))) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " + << "constant layer has weights of non-activation type." << std::endl; + continue; + } + if (layer->getNbInputs() >= 1 && layer->getInput(0)->isShapeTensor()) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this layer " + << "operates on a shape tensor." 
<< std::endl; + continue; + } + if (layer->getNbInputs() >= 1 && isNonActivationType(layer->getInput(0)->getType()) + && layer->getNbOutputs() >= 1 && isNonActivationType(layer->getOutput(0)->getType())) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " + << "layer has input and output of non-activation type." << std::endl; + continue; + } + // All heuristics passed. Set the layer precision. + sample::gLogInfo << "Set layer " << layerName << " to precision " << plausibleMatch->second << std::endl; + layer->setPrecision(plausibleMatch->second); + } + } + + if (hasLayerPrecisionSkipped) + { + sample::gLogInfo << "Skipped setting precisions for some layers. Check verbose logs for more details." + << std::endl; + } +} + +void setLayerOutputTypes(INetworkDefinition& network, LayerOutputTypes const& layerOutputTypes) +{ + bool const hasGlobalOutputType{layerOutputTypes.find("*") != layerOutputTypes.end()}; + auto const globalOutputType = hasGlobalOutputType ? layerOutputTypes.at("*").at(0) : nvinfer1::DataType::kFLOAT; + bool hasLayerOutputTypeSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto const nbOutputs = layer->getNbOutputs(); + auto exactMatch = layerOutputTypes.find(layerName); + auto plausibleMatch = findPlausible(layerOutputTypes, layerName); + if (exactMatch != layerOutputTypes.end()) + { + auto const& outputTypes = exactMatch->second; + bool const isBroadcast = (outputTypes.size() == 1); + if (!isBroadcast && static_cast(outputTypes.size()) != nbOutputs) + { + sample::gLogError << "Layer " << layerName << " has " << nbOutputs << " outputs but " + << outputTypes.size() << " output types are given in --layerOutputTypes flag." + << std::endl; + throw std::invalid_argument("Invalid --layerOutputTypes flag."); + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) + { + auto const outputType = outputTypes.at(isBroadcast ? 0 : outputIdx); + sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType + << std::endl; + layer->setOutputType(outputIdx, outputType); + } + } + else if (plausibleMatch != layerOutputTypes.end()) + { + auto const& outputTypes = plausibleMatch->second; + bool const isBroadcast = (outputTypes.size() == 1); + + // We should not set the layer output types if its default precision is INT32 or Bool. + if (layer->getPrecision() == nvinfer1::DataType::kINT32 + || layer->getPrecision() == nvinfer1::DataType::kBOOL) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because the " + << " default layer precision is INT32 or Bool." << std::endl; + continue; + } + // We should not set the constant layer output types if its weights are in INT32. + if (layer->getType() == nvinfer1::LayerType::kCONSTANT + && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because this " + << "constant layer has INT32 weights." << std::endl; + continue; + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) + { + // We should not set the output type if the output is a shape tensor. 
+ if (layer->getOutput(0)->isShapeTensor()) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output type for output " << outputIdx << " of layer " + << layerName << " because it is a shape tensor." << std::endl; + continue; + } + + auto const outputType = outputTypes.at(isBroadcast ? 0 : outputIdx); + sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType + << std::endl; + layer->setOutputType(outputIdx, globalOutputType); + } + } + } + + if (hasLayerOutputTypeSkipped) + { + sample::gLogInfo << "Skipped setting output types for some layers. Check verbose logs for more details." + << std::endl; + } +} + +void setLayerDeviceTypes( + INetworkDefinition const& network, IBuilderConfig& config, LayerDeviceTypes const& layerDeviceTypes) +{ + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto match = findPlausible(layerDeviceTypes, layerName); + if (match != layerDeviceTypes.end()) + { + DeviceType const deviceType = match->second; + sample::gLogInfo << "Set layer " << layerName << " to device type " << deviceType << std::endl; + config.setDeviceType(layer, deviceType); + } + } +} + +void markDebugTensors(INetworkDefinition& network, StringSet const& debugTensors) +{ + for (int64_t inputIndex = 0; inputIndex < network.getNbInputs(); ++inputIndex) + { + auto* t = network.getInput(inputIndex); + auto const tensorName = t->getName(); + if (debugTensors.count(tensorName) > 0) + { + network.markDebug(*t); + } + } + for (int64_t layerIndex = 0; layerIndex < network.getNbLayers(); ++layerIndex) + { + auto* layer = network.getLayer(layerIndex); + for (int64_t outputIndex = 0; outputIndex < layer->getNbOutputs(); ++outputIndex) + { + auto* t = layer->getOutput(outputIndex); + auto const tensorName = t->getName(); + if (debugTensors.count(tensorName) > 0) + { + network.markDebug(*t); + } + } + } +} + +void setMemoryPoolLimits(IBuilderConfig& config, BuildOptions const& build) +{ + auto const roundToBytes = [](double const size, bool fromMB = true) { + return static_cast(size * (fromMB ? 1.0_MiB : 1.0_KiB)); + }; + if (build.workspace >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, roundToBytes(build.workspace)); + } + if (build.dlaSRAM >= 0) + { + size_t const sizeInBytes = roundToBytes(build.dlaSRAM); + size_t sizeInPowerOf2{1}; + // Using 2^30 bytes as a loose upper bound to prevent the possibility of overflows and infinite loops. + while (sizeInPowerOf2 < 31 && (static_cast(1) << sizeInPowerOf2) <= sizeInBytes) + { + ++sizeInPowerOf2; + } + --sizeInPowerOf2; + if (sizeInPowerOf2 == 30) + { + sample::gLogWarning + << "User-specified DLA managed SRAM size is too large and has been clipped to 2^30 bytes. " + << "Please make sure that this is the intended managed SRAM size." 
<< std::endl; + } + config.setMemoryPoolLimit(MemoryPoolType::kDLA_MANAGED_SRAM, static_cast(1) << sizeInPowerOf2); + } + if (build.dlaLocalDRAM >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_LOCAL_DRAM, roundToBytes(build.dlaLocalDRAM)); + } + if (build.dlaGlobalDRAM >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_GLOBAL_DRAM, roundToBytes(build.dlaGlobalDRAM)); + } + if (build.tacticSharedMem >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kTACTIC_SHARED_MEMORY, roundToBytes(build.tacticSharedMem, false)); + } +} + +void setPreviewFeatures(IBuilderConfig& config, BuildOptions const& build) +{ + auto const setFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (build.previewFeatures.find(featVal) != build.previewFeatures.end()) + { + config.setPreviewFeature(feat, build.previewFeatures.at(featVal)); + } + }; + setFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); +} + +} // namespace + +bool setupNetworkAndConfig(BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, + INetworkDefinition& network, IBuilderConfig& config, std::unique_ptr& calibrator, + std::ostream& err, std::vector>& sparseWeights) +{ + std::vector profiles{}; + profiles.resize(build.optProfiles.size()); + for (auto& profile : profiles) + { + profile = builder.createOptimizationProfile(); + } + + bool hasDynamicShapes{false}; + + bool broadcastInputFormats = broadcastIOFormats(build.inputFormats, network.getNbInputs()); + + // Check if the provided input tensor names match the input tensors of the engine. + // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. + for (auto const& shapes : build.optProfiles) + { + for (auto const& shape : shapes) + { + bool tensorNameFound{false}; + for (int32_t i = 0; i < network.getNbInputs(); ++i) + { + if (matchStringWithOneWildcard(shape.first, network.getInput(i)->getName())) + { + tensorNameFound = true; + break; + } + } + if (!tensorNameFound) + { + sample::gLogError << "Cannot find input tensor with name \"" << shape.first << "\" in the network " + << "inputs! Please make sure the input tensor names are correct." << std::endl; + return false; + } + } + } + + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) + { + // Set formats and data types of inputs + auto* input = network.getInput(i); + if (!build.inputFormats.empty()) + { + int32_t inputFormatIndex = broadcastInputFormats ? 0 : i; + input->setType(build.inputFormats[inputFormatIndex].first); + input->setAllowedFormats(build.inputFormats[inputFormatIndex].second); + } + + auto const dims = input->getDimensions(); + auto const isScalar = dims.nbDims == 0; + auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || input->isShapeTensor(); + if (isDynamicInput) + { + hasDynamicShapes = true; + for (size_t i = 0; i < build.optProfiles.size(); i++) + { + auto const& optShapes = build.optProfiles[i]; + auto profile = profiles[i]; + auto const tensorName = input->getName(); + auto shape = findPlausible(optShapes, tensorName); + ShapeRange shapes{}; + + // If no shape is provided, set dynamic dimensions to 1. 
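+                // For example, an input declared as [-1, 3, -1, -1] with no shapes provided is built with
+                // MIN = OPT = MAX = [1, 3, 1, 1].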
+                if (shape == optShapes.end())
+                {
+                    constexpr int32_t kDEFAULT_DIMENSION{1};
+                    std::vector<int32_t> staticDims;
+                    if (input->isShapeTensor())
+                    {
+                        if (isScalar)
+                        {
+                            staticDims.push_back(1);
+                        }
+                        else
+                        {
+                            staticDims.resize(dims.d[0]);
+                            std::fill(staticDims.begin(), staticDims.end(), kDEFAULT_DIMENSION);
+                        }
+                    }
+                    else
+                    {
+                        staticDims.resize(dims.nbDims);
+                        std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(),
+                            [&](int dimension) { return dimension > 0 ? dimension : kDEFAULT_DIMENSION; });
+                    }
+                    sample::gLogWarning << "Dynamic dimensions required for input: " << tensorName
+                                        << ", but no shapes were provided. Automatically overriding shape to: "
+                                        << staticDims << std::endl;
+                    std::fill(shapes.begin(), shapes.end(), staticDims);
+                }
+                else
+                {
+                    shapes = shape->second;
+                }
+
+                std::vector<int32_t> profileDims{};
+                if (input->isShapeTensor())
+                {
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kMIN)];
+                    SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kMIN,
+                                            profileDims.data(), static_cast<int32_t>(profileDims.size())),
+                        "Error in set shape values MIN", false, err);
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kOPT)];
+                    SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kOPT,
+                                            profileDims.data(), static_cast<int32_t>(profileDims.size())),
+                        "Error in set shape values OPT", false, err);
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kMAX)];
+                    SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kMAX,
+                                            profileDims.data(), static_cast<int32_t>(profileDims.size())),
+                        "Error in set shape values MAX", false, err);
+                    sample::gLogInfo << "Set input shape tensor " << tensorName << " for optimization profile " << i
+                                     << " to:"
+                                     << " MIN=" << shapes[static_cast<size_t>(OptProfileSelector::kMIN)]
+                                     << " OPT=" << shapes[static_cast<size_t>(OptProfileSelector::kOPT)]
+                                     << " MAX=" << shapes[static_cast<size_t>(OptProfileSelector::kMAX)] << std::endl;
+                }
+                else
+                {
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kMIN)];
+                    SMP_RETVAL_IF_FALSE(
+                        profile->setDimensions(tensorName, OptProfileSelector::kMIN, toDims(profileDims)),
+                        "Error in set dimensions to profile MIN", false, err);
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kOPT)];
+                    SMP_RETVAL_IF_FALSE(
+                        profile->setDimensions(tensorName, OptProfileSelector::kOPT, toDims(profileDims)),
+                        "Error in set dimensions to profile OPT", false, err);
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kMAX)];
+                    SMP_RETVAL_IF_FALSE(
+                        profile->setDimensions(tensorName, OptProfileSelector::kMAX, toDims(profileDims)),
+                        "Error in set dimensions to profile MAX", false, err);
+                    sample::gLogInfo << "Set shape of input tensor " << tensorName << " for optimization profile " << i
+                                     << " to:"
+                                     << " MIN=" << shapes[static_cast<size_t>(OptProfileSelector::kMIN)]
+                                     << " OPT=" << shapes[static_cast<size_t>(OptProfileSelector::kOPT)]
+                                     << " MAX=" << shapes[static_cast<size_t>(OptProfileSelector::kMAX)] << std::endl;
+                }
+            }
+        }
+    }
+
+    for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++)
+    {
+        auto* output = network.getOutput(i);
+        auto const dims = output->getDimensions();
+        // A shape tensor output with known static dimensions may have dynamic shape values inside it.
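+        // For example, an IShapeLayer output has static dimensions, but the values it carries are only
+        // known at runtime.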
+ auto const isDynamicOutput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || output->isShapeTensor(); + if (isDynamicOutput) + { + hasDynamicShapes = true; + } + } + + if (!hasDynamicShapes && !build.optProfiles[0].empty()) + { + sample::gLogError << "Static model does not take explicit shapes since the shape of inference tensors will be " + "determined by the model itself" + << std::endl; + return false; + } + + if (hasDynamicShapes) + { + for (auto profile : profiles) + { + SMP_RETVAL_IF_FALSE(profile->isValid(), "Required optimization profile is invalid", false, err); + SMP_RETVAL_IF_FALSE( + config.addOptimizationProfile(profile) != -1, "Error in add optimization profile", false, err); + } + } + + bool broadcastOutputFormats = broadcastIOFormats(build.outputFormats, network.getNbOutputs(), false); + + for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) + { + // Set formats and data types of outputs + auto* output = network.getOutput(i); + if (!build.outputFormats.empty()) + { + int32_t outputFormatIndex = broadcastOutputFormats ? 0 : i; + output->setType(build.outputFormats[outputFormatIndex].first); + output->setAllowedFormats(build.outputFormats[outputFormatIndex].second); + } + } + + setMemoryPoolLimits(config, build); + + setPreviewFeatures(config, build); + + if (build.builderOptimizationLevel != defaultBuilderOptimizationLevel) + { + config.setBuilderOptimizationLevel(build.builderOptimizationLevel); + } + + if (build.maxTactics != defaultMaxTactics) + { + config.setMaxNbTactics(build.maxTactics); + } + + if (build.timingCacheMode == TimingCacheMode::kDISABLE) + { + config.setFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + } + + if (build.disableCompilationCache) + { + config.setFlag(BuilderFlag::kDISABLE_COMPILATION_CACHE); + } + + if (build.errorOnTimingCacheMiss) + { + config.setFlag(BuilderFlag::kERROR_ON_TIMING_CACHE_MISS); + } + + if (!build.tf32) + { + config.clearFlag(BuilderFlag::kTF32); + } + + if (build.refittable) + { + config.setFlag(BuilderFlag::kREFIT); + } + + if (build.stripWeights) + { + // The kREFIT_IDENTICAL is enabled by default when kSTRIP_PLAN is on. 
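+        // A stripped plan is smaller because the refittable weights are omitted from the serialized
+        // engine; they must be supplied again through IRefitter, with weights identical to the
+        // build-time ones, before inference.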
+ config.setFlag(BuilderFlag::kSTRIP_PLAN); + } + + if (build.versionCompatible) + { + config.setFlag(BuilderFlag::kVERSION_COMPATIBLE); + } +#if !TRT_WINML + std::vector pluginPaths; + for (auto const& pluginPath : sys.setPluginsToSerialize) + { + sample::gLogVerbose << "Setting plugin to serialize: " << pluginPath << std::endl; + pluginPaths.push_back(pluginPath.c_str()); + } + if (!pluginPaths.empty()) + { + config.setPluginsToSerialize(pluginPaths.data(), pluginPaths.size()); + } +#endif + if (build.excludeLeanRuntime) + { + config.setFlag(BuilderFlag::kEXCLUDE_LEAN_RUNTIME); + } + + if (build.sparsity != SparsityFlag::kDISABLE) + { + config.setFlag(BuilderFlag::kSPARSE_WEIGHTS); + if (build.sparsity == SparsityFlag::kFORCE) + { + sparsify(network, sparseWeights); + } + } + + config.setProfilingVerbosity(build.profilingVerbosity); + config.setAvgTimingIterations(build.avgTiming); + + if (build.fp16) + { + config.setFlag(BuilderFlag::kFP16); + } + if (build.int8) + { + config.setFlag(BuilderFlag::kINT8); + } + if (build.bf16) + { + config.setFlag(BuilderFlag::kBF16); + } + + SMP_RETVAL_IF_FALSE(!(build.int8 && build.fp8), "FP8 and INT8 precisions have been specified", false, err); + + if (build.fp8) + { + config.setFlag(BuilderFlag::kFP8); + } + + if (build.int4) + { + config.setFlag(BuilderFlag::kINT4); + } + + if (build.int8 && !build.fp16) + { + sample::gLogInfo + << "FP32 and INT8 precisions have been specified - more performance might be enabled by additionally " + "specifying --fp16 or --best" + << std::endl; + } + + auto isInt8 = [](const IOFormat& format) { return format.first == DataType::kINT8; }; + auto int8IO = std::count_if(build.inputFormats.begin(), build.inputFormats.end(), isInt8) + + std::count_if(build.outputFormats.begin(), build.outputFormats.end(), isInt8); + + auto hasQDQLayers = [](INetworkDefinition& network) { + // Determine if our network has QDQ layers. + auto const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; i++) + { + auto const& layer = network.getLayer(i); + if (layer->getType() == LayerType::kQUANTIZE || layer->getType() == LayerType::kDEQUANTIZE) + { + return true; + } + } + return false; + }; + + if (!hasQDQLayers(network) && (build.int8 || int8IO) && build.calibration.empty()) + { + // Explicitly set int8 scales if no calibrator is provided and if I/O tensors use int8, + // because auto calibration does not support this case. 
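+        // setTensorDynamicRange() assigns a symmetric range [-r, r] to every tensor, from which TensorRT
+        // derives the int8 scale as roughly r / 127. A minimal sketch of the quantization this implies
+        // (hypothetical values, not part of this sample):
+        //   float const r = 2.0F;                // dynamic range bound
+        //   float const scale = r / 127.0F;      // ~0.0157
+        //   int8_t const q = static_cast<int8_t>(std::round(x / scale));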
+ SMP_RETVAL_IF_FALSE(setTensorDynamicRange(network), "Error in set tensor dynamic range.", false, err); + } + else if (build.int8) + { + if (!hasQDQLayers(network) && int8IO) + { + try + { + // Set dynamic ranges of int8 inputs / outputs to match scales loaded from calibration cache + // TODO http://nvbugs/3262234 Change the network validation so that this workaround can be removed + setTensorScalesFromCalibration(network, build.inputFormats, build.outputFormats, build.calibration); + } + catch (std::exception&) + { + sample::gLogError + << "Int8IO was specified but impossible to read tensor scales from provided calibration cache file" + << std::endl; + return false; + } + } + IOptimizationProfile* profileCalib{nullptr}; + if (!build.shapesCalib.empty()) + { + profileCalib = builder.createOptimizationProfile(); + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) + { + auto* input = network.getInput(i); + Dims profileDims{}; + auto const tensorName = input->getName(); + auto shape = findPlausible(build.shapesCalib, tensorName); + + if (shape == build.shapesCalib.end()) + { + std::ostringstream msg; + msg << "Calibration profile for tensor " << tensorName << " cannot be found!"; + throw std::invalid_argument(msg.str()); + } + + auto shapesCalib = shape->second; + profileDims = toDims(shapesCalib[static_cast(OptProfileSelector::kOPT)]); + // Here we check only kMIN as all profileDims are the same. + SMP_RETVAL_IF_FALSE(profileCalib->setDimensions(tensorName, OptProfileSelector::kMIN, profileDims), + "Error in set dimensions to calibration profile OPT", false, err); + profileCalib->setDimensions(tensorName, OptProfileSelector::kOPT, profileDims); + profileCalib->setDimensions(tensorName, OptProfileSelector::kMAX, profileDims); + sample::gLogInfo << "Set calibration profile for input tensor " << tensorName << " to " << profileDims + << std::endl; + } + SMP_RETVAL_IF_FALSE(profileCalib->isValid(), "Calibration profile is invalid", false, err); + SMP_RETVAL_IF_FALSE( + config.setCalibrationProfile(profileCalib), "Error in set calibration profile", false, err); + } + + std::vector elemCount{}; + for (int i = 0; i < network.getNbInputs(); i++) + { + auto* input = network.getInput(i); + auto const dims = input->getDimensions(); + auto const isDynamicInput + = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + + if (profileCalib) + { + elemCount.push_back(volume(profileCalib->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } + else if (!profiles.empty() && isDynamicInput) + { + elemCount.push_back( + volume(profiles[build.calibProfile]->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } + else + { + elemCount.push_back(volume(input->getDimensions())); + } + } + + calibrator.reset(new RndInt8Calibrator(1, elemCount, build.calibration, network, err)); + config.setInt8Calibrator(calibrator.get()); + } + + if (build.directIO) + { + config.setFlag(BuilderFlag::kDIRECT_IO); + } + + switch (build.precisionConstraints) + { + case PrecisionConstraints::kNONE: + // It's the default for TensorRT. 
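+        // i.e. no precision constraints are applied and the builder chooses precisions freely.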
+        break;
+    case PrecisionConstraints::kOBEY: config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); break;
+    case PrecisionConstraints::kPREFER: config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); break;
+    }
+
+    if (!build.layerPrecisions.empty() && build.precisionConstraints != PrecisionConstraints::kNONE)
+    {
+        setLayerPrecisions(network, build.layerPrecisions);
+    }
+
+    if (!build.layerOutputTypes.empty() && build.precisionConstraints != PrecisionConstraints::kNONE)
+    {
+        setLayerOutputTypes(network, build.layerOutputTypes);
+    }
+
+    if (!build.layerDeviceTypes.empty())
+    {
+        setLayerDeviceTypes(network, config, build.layerDeviceTypes);
+    }
+
+    if (!build.debugTensors.empty())
+    {
+        markDebugTensors(network, build.debugTensors);
+    }
+
+    if (build.safe && sys.DLACore == -1)
+    {
+        config.setEngineCapability(EngineCapability::kSAFETY);
+    }
+
+    if (build.restricted)
+    {
+        config.setFlag(BuilderFlag::kSAFETY_SCOPE);
+    }
+
+    if (sys.DLACore != -1)
+    {
+        if (sys.DLACore < builder.getNbDLACores())
+        {
+            config.setDefaultDeviceType(DeviceType::kDLA);
+            config.setDLACore(sys.DLACore);
+            config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
+            if (build.buildDLAStandalone)
+            {
+                config.setEngineCapability(EngineCapability::kDLA_STANDALONE);
+            }
+            if (build.allowGPUFallback)
+            {
+                config.setFlag(BuilderFlag::kGPU_FALLBACK);
+            }
+            else
+            {
+                // Reformatting runs on GPU, so avoid I/O reformatting.
+                config.setFlag(BuilderFlag::kDIRECT_IO);
+            }
+            if (!build.int8)
+            {
+                config.setFlag(BuilderFlag::kFP16);
+            }
+        }
+        else
+        {
+            err << "Cannot create DLA engine, " << sys.DLACore << " not available" << std::endl;
+            return false;
+        }
+    }
+
+    if (build.enabledTactics || build.disabledTactics)
+    {
+        TacticSources tacticSources = config.getTacticSources();
+        tacticSources |= build.enabledTactics;
+        tacticSources &= ~build.disabledTactics;
+        config.setTacticSources(tacticSources);
+    }
+
+    config.setHardwareCompatibilityLevel(build.hardwareCompatibilityLevel);
+    config.setRuntimePlatform(build.runtimePlatform);
+
+    if (build.maxAuxStreams != defaultMaxAuxStreams)
+    {
+        config.setMaxAuxStreams(build.maxAuxStreams);
+    }
+
+    if (build.allowWeightStreaming)
+    {
+        config.setFlag(BuilderFlag::kWEIGHT_STREAMING);
+    }
+
+    return true;
+}
+
+//!
+//! \brief Create a serialized engine for a network definition
+//!
+//! \return Whether the engine creation succeeds or fails.
+//!
+bool networkToSerializedEngine(
+    BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, BuildEnvironment& env, std::ostream& err)
+{
+    std::unique_ptr<IBuilderConfig> config{builder.createBuilderConfig()};
+    std::unique_ptr<IInt8Calibrator> calibrator;
+    std::vector<std::vector<char>> sparseWeights;
+    SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", false, err);
+    SMP_RETVAL_IF_FALSE(
+        setupNetworkAndConfig(build, sys, builder, *env.network, *config, calibrator, err, sparseWeights),
+        "Network And Config setup failed", false, err);
+
+    std::unique_ptr<nvinfer1::ITimingCache> timingCache{};
+    // Try to load cache from file. Create a fresh cache if the file doesn't exist
+    if (build.timingCacheMode == TimingCacheMode::kGLOBAL)
+    {
+        timingCache
+            = samplesCommon::buildTimingCacheFromFile(gLogger.getTRTLogger(), *config, build.timingCacheFile, err);
+    }
+
+    // CUDA stream used for profiling by the builder.
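+    // A dedicated stream keeps the builder's tactic timing separate from any other work the application
+    // may have queued on the default stream.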
+ auto profileStream = samplesCommon::makeCudaStream(); + SMP_RETVAL_IF_FALSE(profileStream != nullptr, "Cuda stream creation failed", false, err); + config->setProfileStream(*profileStream); + + auto const tBegin = std::chrono::high_resolution_clock::now(); + std::unique_ptr serializedEngine{builder.buildSerializedNetwork(*env.network, *config)}; + SMP_RETVAL_IF_FALSE(serializedEngine != nullptr, "Engine could not be created from network", false, err); + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const buildTime = std::chrono::duration(tEnd - tBegin).count(); + sample::gLogInfo << "Engine built in " << buildTime << " sec." << std::endl; + sample::gLogInfo << "Created engine with size: " << (serializedEngine->size() / 1.0_MiB) << " MiB" << std::endl; + + env.engine.setBlob(serializedEngine); + + if (build.timingCacheMode == TimingCacheMode::kGLOBAL) + { + auto timingCache = config->getTimingCache(); + samplesCommon::updateTimingCacheFile(gLogger.getTRTLogger(), build.timingCacheFile, timingCache, builder); + } + + return true; +} + +//! +//! \brief Parse a given model, create a network and an engine. +//! +bool modelToBuildEnv( + ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err) +{ + env.builder.reset(createBuilder()); + SMP_RETVAL_IF_FALSE(env.builder != nullptr, "Builder creation failed", false, err); + env.builder->setErrorRecorder(&gRecorder); + auto networkFlags = (build.stronglyTyped) + ? 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kSTRONGLY_TYPED) + : 0U; +#if !TRT_WINML + for (auto const& pluginPath : sys.dynamicPlugins) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } +#endif + env.network.reset(env.builder->createNetworkV2(networkFlags)); + + std::vector vcPluginLibrariesUsed; + SMP_RETVAL_IF_FALSE(env.network != nullptr, "Network creation failed", false, err); + env.parser + = modelToNetwork(model, build, *env.network, err, build.versionCompatible ? &vcPluginLibrariesUsed : nullptr); + SMP_RETVAL_IF_FALSE(env.parser.operator bool(), "Parsing model failed", false, err); + +#if !TRT_WINML + if (build.versionCompatible && !sys.ignoreParsedPluginLibs && !vcPluginLibrariesUsed.empty()) + { + sample::gLogInfo << "The following plugin libraries were identified by the parser as required for a " + "version-compatible engine:" + << std::endl; + for (auto const& lib : vcPluginLibrariesUsed) + { + sample::gLogInfo << " " << lib << std::endl; + } + if (!build.excludeLeanRuntime) + { + sample::gLogInfo << "These libraries will be added to --setPluginsToSerialize since --excludeLeanRuntime " + "was not specified." + << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), + std::back_inserter(sys.setPluginsToSerialize)); + } + sample::gLogInfo << "These libraries will be added to --dynamicPlugins for use at inference time." << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), std::back_inserter(sys.dynamicPlugins)); + + // Implicitly-added plugins from ONNX parser should be loaded into plugin registry as well. + for (auto const& pluginPath : vcPluginLibrariesUsed) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } + + sample::gLogInfo << "Use --ignoreParsedPluginLibs to disable this behavior." 
+                         << std::endl;
+    }
+#endif
+
+    SMP_RETVAL_IF_FALSE(
+        networkToSerializedEngine(build, sys, *env.builder, env, err), "Building engine failed", false, err);
+    return true;
+}
+
+namespace
+{
+std::pair<std::vector<std::string>, std::vector<WeightsRole>> getLayerWeightsRolePair(IRefitter& refitter)
+{
+    // Get number of refittable items.
+    auto const nbAll = refitter.getAll(0, nullptr, nullptr);
+    std::vector<char const*> layerNames(nbAll);
+    // Allocate buffers for the items and get them.
+    std::vector<WeightsRole> weightsRoles(nbAll);
+    refitter.getAll(nbAll, layerNames.data(), weightsRoles.data());
+    std::vector<std::string> layerNameStrs(nbAll);
+    std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) {
+        if (name == nullptr)
+        {
+            return std::string{};
+        }
+        return std::string{name};
+    });
+    return {layerNameStrs, weightsRoles};
+}
+
+std::pair<std::vector<std::string>, std::vector<WeightsRole>> getMissingLayerWeightsRolePair(IRefitter& refitter)
+{
+    // Get number of missing refittable items.
+    auto const nbMissing = refitter.getMissing(0, nullptr, nullptr);
+    std::vector<char const*> layerNames(nbMissing);
+    // Allocate buffers for the items and get them.
+    std::vector<WeightsRole> weightsRoles(nbMissing);
+    refitter.getMissing(nbMissing, layerNames.data(), weightsRoles.data());
+    std::vector<std::string> layerNameStrs(nbMissing);
+    std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) {
+        if (name == nullptr)
+        {
+            return std::string{};
+        }
+        return std::string{name};
+    });
+    return {layerNameStrs, weightsRoles};
+}
+} // namespace
+
+bool loadStreamingEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err)
+{
+    auto& reader = env.engine.getFileReader();
+    SMP_RETVAL_IF_FALSE(reader.open(filepath), "", false, err << "Error opening engine file: " << filepath);
+    return true;
+}
+
+bool loadEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err)
+{
+    auto const tBegin = std::chrono::high_resolution_clock::now();
+    std::ifstream engineFile(filepath, std::ios::binary);
+    SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error opening engine file: " << filepath);
+    engineFile.seekg(0, std::ifstream::end);
+    int64_t fsize = engineFile.tellg();
+    engineFile.seekg(0, std::ifstream::beg);
+
+    std::vector<uint8_t> engineBlob(fsize);
+    engineFile.read(reinterpret_cast<char*>(engineBlob.data()), fsize);
+    SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error loading engine file: " << filepath);
+    auto const tEnd = std::chrono::high_resolution_clock::now();
+    float const loadTime = std::chrono::duration<float>(tEnd - tBegin).count();
+    sample::gLogInfo << "Engine loaded in " << loadTime << " sec." << std::endl;
+    sample::gLogInfo << "Loaded engine with size: " << (fsize / 1.0_MiB) << " MiB" << std::endl;
+
+    env.engine.setBlob(std::move(engineBlob));
+
+    return true;
+}
+
+bool printPlanVersion(BuildEnvironment& env, std::ostream& err)
+{
+    constexpr int64_t kPLAN_SIZE{28};
+    std::vector<char> data(kPLAN_SIZE);
+    auto blob = data.data();
+
+    auto& reader = env.engine.getFileReader();
+    if (reader.isOpen())
+    {
+        SMP_RETVAL_IF_FALSE(
+            reader.read(data.data(), kPLAN_SIZE) == kPLAN_SIZE, "Failed to read plan file", false, err);
+    }
+    else
+    {
+        SMP_RETVAL_IF_FALSE(env.engine.getBlob().data != nullptr, "Plan file is empty", false, err);
+        SMP_RETVAL_IF_FALSE(env.engine.getBlob().size >= 28, "Plan file is incorrect", false, err);
+        blob = static_cast<char*>(env.engine.getBlob().data);
+    }
+    auto blob32 = reinterpret_cast<uint32_t const*>(blob);
+
+    //!
+
+bool printPlanVersion(BuildEnvironment& env, std::ostream& err)
+{
+    constexpr int64_t kPLAN_SIZE{28};
+    std::vector<uint8_t> data(kPLAN_SIZE);
+    auto blob = data.data();
+
+    auto& reader = env.engine.getFileReader();
+    if (reader.isOpen())
+    {
+        SMP_RETVAL_IF_FALSE(reader.read(data.data(), kPLAN_SIZE) == kPLAN_SIZE, "Failed to read plan file", false, err);
+    }
+    else
+    {
+        SMP_RETVAL_IF_FALSE(env.engine.getBlob().data != nullptr, "Plan file is empty", false, err);
+        SMP_RETVAL_IF_FALSE(env.engine.getBlob().size >= 28, "Plan file is incorrect", false, err);
+        blob = static_cast<uint8_t*>(env.engine.getBlob().data);
+    }
+    auto blob32 = reinterpret_cast<uint32_t const*>(blob);
+
+    //! A correct TensorRT plan file starts with this tag.
+    constexpr uint32_t kPLAN_FILE_TAG{0x74727466U};
+    SMP_RETVAL_IF_FALSE(blob32[0] == kPLAN_FILE_TAG, "Failed to verify a plan tag.", false, err);
+    switch (blob32[1])
+    {
+    case 0U:
+    {
+        // The blob index that stores the plan version may depend on the serialization version.
+        sample::gLogInfo << "Plan was created with TensorRT version " << static_cast<int32_t>(blob[24])
+                         << "." << static_cast<int32_t>(blob[25]) << "." << static_cast<int32_t>(blob[26])
+                         << "." << static_cast<int32_t>(blob[27]) << std::endl;
+        return true;
+    }
+    }
+    sample::gLogError << "Serialization version is not supported." << std::endl;
+    return false;
+}
+
+void dumpRefittable(nvinfer1::ICudaEngine& engine)
+{
+    std::unique_ptr<IRefitter> refitter{createRefitter(engine)};
+    if (refitter == nullptr)
+    {
+        sample::gLogError << "Failed to create a refitter." << std::endl;
+        return;
+    }
+
+    auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter);
+    auto const& layerNames = layerWeightsRolePair.first;
+    auto const& weightsRoles = layerWeightsRolePair.second;
+    auto const nbAll = layerWeightsRolePair.first.size();
+    for (size_t i = 0; i < nbAll; ++i)
+    {
+        sample::gLogInfo << layerNames[i] << " " << weightsRoles[i] << std::endl;
+    }
+}
+
+ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err)
+{
+    BuildEnvironment env(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults());
+    return loadEngineToBuildEnv(engine, env, err) ? env.engine.release() : nullptr;
+}
+
+bool saveEngine(const ICudaEngine& engine, std::string const& fileName, std::ostream& err)
+{
+    std::ofstream engineFile(fileName, std::ios::binary);
+    if (!engineFile)
+    {
+        err << "Cannot open engine file: " << fileName << std::endl;
+        return false;
+    }
+
+    std::unique_ptr<IHostMemory> serializedEngine{engine.serialize()};
+    if (serializedEngine == nullptr)
+    {
+        err << "Engine serialization failed" << std::endl;
+        return false;
+    }
+
+    engineFile.write(static_cast<char const*>(serializedEngine->data()), serializedEngine->size());
+    return !engineFile.fail();
+}
+
+bool getEngineBuildEnv(
+    const ModelOptions& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err)
+{
+    bool createEngineSuccess{false};
+
+    if (build.load)
+    {
+        if (build.safe)
+        {
+            createEngineSuccess = loadEngineToBuildEnv(build.engine, env, err);
+        }
+        else
+        {
+            createEngineSuccess = loadStreamingEngineToBuildEnv(build.engine, env, err);
+        }
+    }
+    else
+    {
+        createEngineSuccess = modelToBuildEnv(model, build, sys, env, err);
+    }
+
+    SMP_RETVAL_IF_FALSE(createEngineSuccess, "Failed to create engine from model or file.", false, err);
+
+    if (build.getPlanVersionOnly && build.load)
+    {
+        SMP_RETVAL_IF_FALSE(printPlanVersion(env, err), "Failed to get plan file version.", false, err);
+        return true;
+    }
+
+    if (build.save)
+    {
+        std::ofstream engineFile(build.engine, std::ios::binary);
+        auto& engineBlob = env.engine.getBlob();
+        engineFile.write(static_cast<char const*>(engineBlob.data), engineBlob.size);
+        SMP_RETVAL_IF_FALSE(!engineFile.fail(), "Saving engine to file failed.", false, err);
+        engineFile.flush();
+        engineFile.close();
+        if (!build.safe)
+        {
+            env.engine.releaseBlob();
+            SMP_RETVAL_IF_FALSE(loadStreamingEngineToBuildEnv(build.engine, env, err), "Reading engine file failed.", false, err);
+        }
+    }
+
+    return true;
+}
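Taken together, the load/build/save paths above reduce to a small driver. The following sketch shows the intended call order; it is illustrative and not part of this patch, and it mirrors the BuildEnvironment construction used by loadEngine() above rather than assuming extra option fields:

bool buildOrLoad(ModelOptions const& model, BuildOptions const& build, SystemOptions& sys)
{
    BuildEnvironment env(/* isSafe */ false, build.versionCompatible, /* DLACore */ -1,
        /* tempdir */ "", getTempfileControlDefaults());
    if (!getEngineBuildEnv(model, build, sys, env, sample::gLogError))
    {
        return false; // Parsing, building, or loading failed; details were written to the log.
    }
    // get() deserializes lazily: the first call materializes the ICudaEngine from the blob.
    return env.engine.get() != nullptr;
}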
+
+// There is no getWeightsName API, so we need to use WeightsRole.
+std::vector<std::pair<WeightsRole, Weights>> getAllRefitWeightsForLayer(const ILayer& l)
+{
+    switch (l.getType())
+    {
+    case LayerType::kCONSTANT:
+    {
+        auto const& layer = static_cast<IConstantLayer const&>(l);
+        auto const weights = layer.getWeights();
+        switch (weights.type)
+        {
+        case DataType::kFLOAT:
+        case DataType::kHALF:
+        case DataType::kBF16:
+        case DataType::kINT8:
+        case DataType::kINT32:
+        case DataType::kINT64: return {std::make_pair(WeightsRole::kCONSTANT, weights)};
+        case DataType::kBOOL:
+        case DataType::kUINT8:
+        case DataType::kFP8:
+        case DataType::kINT4:
+            // Refit is not supported for these types.
+            break;
+        }
+        break;
+    }
+    case LayerType::kCONVOLUTION:
+    {
+        auto const& layer = static_cast<IConvolutionLayer const&>(l);
+        return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()),
+            std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())};
+    }
+    case LayerType::kDECONVOLUTION:
+    {
+        auto const& layer = static_cast<IDeconvolutionLayer const&>(l);
+        return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()),
+            std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())};
+    }
+    case LayerType::kSCALE:
+    {
+        auto const& layer = static_cast<IScaleLayer const&>(l);
+        return {std::make_pair(WeightsRole::kSCALE, layer.getScale()),
+            std::make_pair(WeightsRole::kSHIFT, layer.getShift())};
+    }
+    case LayerType::kACTIVATION:
+    case LayerType::kASSERTION:
+    case LayerType::kCAST:
+    case LayerType::kCONCATENATION:
+    case LayerType::kCONDITION:
+    case LayerType::kCONDITIONAL_INPUT:
+    case LayerType::kCONDITIONAL_OUTPUT:
+    case LayerType::kDEQUANTIZE:
+    case LayerType::kEINSUM:
+    case LayerType::kELEMENTWISE:
+    case LayerType::kFILL:
+    case LayerType::kGATHER:
+    case LayerType::kGRID_SAMPLE:
+    case LayerType::kIDENTITY:
+    case LayerType::kITERATOR:
+    case LayerType::kLOOP_OUTPUT:
+    case LayerType::kLRN:
+    case LayerType::kMATRIX_MULTIPLY:
+    case LayerType::kNMS:
+    case LayerType::kNON_ZERO:
+    case LayerType::kNORMALIZATION:
+    case LayerType::kONE_HOT:
+    case LayerType::kPADDING:
+    case LayerType::kPARAMETRIC_RELU:
+    case LayerType::kPLUGIN:
+    case LayerType::kPLUGIN_V2:
+    case LayerType::kPLUGIN_V3:
+    case LayerType::kPOOLING:
+    case LayerType::kQUANTIZE:
+    case LayerType::kRAGGED_SOFTMAX:
+    case LayerType::kRECURRENCE:
+    case LayerType::kREDUCE:
+    case LayerType::kRESIZE:
+    case LayerType::kREVERSE_SEQUENCE:
+    case LayerType::kSCATTER:
+    case LayerType::kSELECT:
+    case LayerType::kSHAPE:
+    case LayerType::kSHUFFLE:
+    case LayerType::kSLICE:
+    case LayerType::kSOFTMAX:
+    case LayerType::kTOPK:
+    case LayerType::kTRIP_LIMIT:
+    case LayerType::kUNARY: return {};
+    }
+    return {};
+}
+
+bool timeRefit(INetworkDefinition const& network, nvinfer1::ICudaEngine& engine, bool multiThreading)
+{
+    using time_point = std::chrono::time_point<std::chrono::steady_clock>;
+    using durationMs = std::chrono::duration<float, std::milli>;
+
+    auto const nbLayers = network.getNbLayers();
+    std::unique_ptr<IRefitter> refitter{createRefitter(engine)};
+    // Set the maximum number of threads that can be used by the refitter.
+    if (multiThreading && !refitter->setMaxThreads(10))
+    {
+        sample::gLogError << "Failed to set max threads to refitter." << std::endl;
+        return false;
+    }
+    auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter);
+    // We use std::string instead of char const* since we can have copies of layer names.
+    std::set<std::pair<std::string, WeightsRole>> layerRoleSet;
+
+    auto const& layerNames = layerWeightsRolePair.first;
+    auto const& weightsRoles = layerWeightsRolePair.second;
+
+    std::transform(layerNames.begin(), layerNames.end(), weightsRoles.begin(),
+        std::inserter(layerRoleSet, layerRoleSet.begin()),
+        [](std::string const& layerName, WeightsRole const role) { return std::make_pair(layerName, role); });
+
+    auto const isRefittable = [&layerRoleSet](char const* layerName, WeightsRole const role) {
+        return layerRoleSet.find(std::make_pair(layerName, role)) != layerRoleSet.end();
+    };
+
+    auto const setWeights = [&] {
+        for (int32_t i = 0; i < nbLayers; i++)
+        {
+            auto const layer = network.getLayer(i);
+            auto const roleWeightsVec = getAllRefitWeightsForLayer(*layer);
+            for (auto const& roleWeights : roleWeightsVec)
+            {
+                if (isRefittable(layer->getName(), roleWeights.first))
+                {
+                    bool const success = refitter->setWeights(layer->getName(), roleWeights.first, roleWeights.second);
+                    if (!success)
+                    {
+                        return false;
+                    }
+                }
+            }
+        }
+        return true;
+    };
+
+    auto const reportMissingWeights = [&] {
+        auto const& missingPair = getMissingLayerWeightsRolePair(*refitter);
+        auto const& layerNames = missingPair.first;
+        auto const& weightsRoles = missingPair.second;
+        for (size_t i = 0; i < layerNames.size(); ++i)
+        {
+            sample::gLogError << "Missing (" << layerNames[i] << ", " << weightsRoles[i] << ") for refitting."
+                              << std::endl;
+        }
+        return layerNames.empty();
+    };
+
+    // Skip weights validation since we are confident that the new weights are similar to the weights used to build
+    // the engine.
+    refitter->setWeightsValidation(false);
+
+    // Warm up and report missing weights.
+    // We only need to set the weights once; they can be reused in the later refitting passes.
+    bool const success = setWeights() && reportMissingWeights() && refitter->refitCudaEngine();
+    if (!success)
+    {
+        return false;
+    }
+
+    TrtCudaStream stream;
+    constexpr int32_t kLOOP = 10;
+    time_point const refitStartTime{std::chrono::steady_clock::now()};
+    {
+        for (int32_t l = 0; l < kLOOP; l++)
+        {
+            if (!refitter->refitCudaEngineAsync(stream.get()))
+            {
+                return false;
+            }
+        }
+    }
+    stream.synchronize();
+    time_point const refitEndTime{std::chrono::steady_clock::now()};
+
+    sample::gLogInfo << "Engine refitted"
+                     << " in " << durationMs(refitEndTime - refitStartTime).count() / kLOOP << " ms." << std::endl;
+    return true;
+}
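timeRefit() drives the refit through (layer, WeightsRole) pairs because the network object is available here. When only the engine is at hand, newer TensorRT releases also allow refitting by weight name; a hedged sketch follows, where refitByName() and the weights map are illustrative and not part of this patch:

#include <memory>
#include <string>
#include <unordered_map>
#include "NvInfer.h"

// Refit by weight name via IRefitter::setNamedWeights. The engine must have been
// built as refittable; newWeights maps weight names to host buffers.
bool refitByName(nvinfer1::ICudaEngine& engine,
    std::unordered_map<std::string, nvinfer1::Weights> const& newWeights)
{
    std::unique_ptr<nvinfer1::IRefitter> refitter{
        nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger())};
    if (refitter == nullptr)
    {
        return false;
    }
    for (auto const& kv : newWeights)
    {
        if (!refitter->setNamedWeights(kv.first.c_str(), kv.second))
        {
            return false; // Unknown name or type/count mismatch.
        }
    }
    // Every missing weight must be supplied before the refit can succeed.
    return refitter->refitCudaEngine();
}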
+
+namespace
+{
+void* initSafeRuntime()
+{
+    void* handle{nullptr};
+    // libsafe_executor.so will be renamed to libnvinfer_safe.so when TRTS-9421 completes.
+    // Currently libsafe_executor_debug.so for samplesCommon::isDebug() is not ready.
+#define TRTS_9421_COMPLETED 0
+#if TRTS_9421_COMPLETED
+#if !defined(_WIN32)
+    std::string const dllName{"libsafe_executor.so"};
+#if SANITIZER_BUILD
+    handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE);
+#else
+    // RTLD_GLOBAL is used for symbol resolution of subsequently loaded plugin libraries.
+    handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_GLOBAL);
+#endif
+#endif
+#endif // TRTS_9421_COMPLETED
+    return handle;
+}
+
+#if !defined(_WIN32)
+struct DllDeleter
+{
+    void operator()(void* handle)
+    {
+        if (handle != nullptr)
+        {
+            dlclose(handle);
+        }
+    }
+};
+const std::unique_ptr<void, DllDeleter> safeRuntimeLibrary{initSafeRuntime()};
+#endif
+} // namespace
+
+bool hasSafeRuntime()
+{
+    bool ret{false};
+#if !defined(_WIN32)
+    ret = (safeRuntimeLibrary != nullptr);
+#endif
+    return ret;
+}
+
+} // namespace sample
diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.h b/src/Detector/tensorrt_yolo/common/sampleEngines.h
index 620b51a1..ec02e909 100644
--- a/src/Detector/tensorrt_yolo/common/sampleEngines.h
+++ b/src/Detector/tensorrt_yolo/common/sampleEngines.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,58 +18,227 @@
 #ifndef TRT_SAMPLE_ENGINES_H
 #define TRT_SAMPLE_ENGINES_H
 
-#include <iostream>
-#include <vector>
-
 #include "NvInfer.h"
-
-#if (NV_TENSORRT_MAJOR > 7)
-
-#include "NvInferConsistency.h"
-#include "NvInferSafeRuntime.h"
-
-#endif
-
 #include "NvOnnxParser.h"
 #include "sampleOptions.h"
 #include "sampleUtils.h"
+#include "streamReader.h"
+#include <iostream>
+#include <vector>
 
 namespace sample
 {
 
 struct Parser
 {
-    TrtUniquePtr<nvonnxparser::IParser> onnxParser;
+    std::unique_ptr<nvonnxparser::IParser> onnxParser;
 
     operator bool() const
     {
-        return onnxParser.operator bool();
+        return onnxParser != nullptr;
     }
 };
 
-struct BuildEnvironment
-{
-    TrtUniquePtr<nvinfer1::INetworkDefinition> network;
-    //! Parser that creates the network. Must be declared *after* network, so that when
-    //! ~BuildEnvironment() executes, the parser is destroyed before the network is destroyed.
-    Parser parser;
-    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
-    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
-    std::vector<uint8_t> engineBlob;
-};
+//!
+//! \brief Helper struct to facilitate engine serialization and deserialization. It does not own the underlying memory.
+//!
+struct EngineBlob
+{
+    EngineBlob(void* engineData, size_t engineSize)
+        : data(engineData)
+        , size(engineSize)
+    {
+    }
+    void* data{};
+    size_t size{};
+    bool empty() const
+    {
+        return size == 0;
+    }
+};
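EngineBlob is deliberately non-owning, so the backing storage must outlive any blob taken from it. A short illustration; writeBlobToDisk() and the file name are invented for this example and are not part of this patch:

#include <cstdint>
#include <fstream>
#include <vector>

// Illustrative usage of the non-owning EngineBlob view defined above: the vector
// must stay alive for as long as the blob created from it is used.
void writeBlobToDisk(std::vector<uint8_t> const& storage)
{
    EngineBlob const blob{const_cast<uint8_t*>(storage.data()), storage.size()};
    if (!blob.empty())
    {
        std::ofstream file("model.plan", std::ios::binary);
        file.write(static_cast<char const*>(blob.data), blob.size);
    }
}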
 
 //!
-//! \brief Generate a network definition for a given model
-//!
-//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid
-//! parser (the returned parser converts to false if tested)
-//!
-Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err);
+//! \brief A helper class to hold a serialized engine (std or safe) and only deserialize it when being accessed.
+//!
+class LazilyDeserializedEngine
+{
+public:
+    //!
+    //! \brief Delete the default constructor to make sure isSafe and DLACore are always set.
+    //!
+    LazilyDeserializedEngine() = delete;
+
+    //!
+    //! \brief Constructor of LazilyDeserializedEngine.
+    //!
+    LazilyDeserializedEngine(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir,
+        nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath)
+        : mIsSafe(isSafe)
+        , mVersionCompatible(versionCompatible)
+        , mDLACore(DLACore)
+        , mTempdir(tempdir)
+        , mTempfileControls(tempfileControls)
+        , mLeanDLLPath(leanDLLPath)
+    {
+        mFileReader = std::make_unique<samplesCommon::FileStreamReader>();
+    }
+
+    //!
+    //! \brief Move from another LazilyDeserializedEngine.
+    //!
+    LazilyDeserializedEngine(LazilyDeserializedEngine&& other) = default;
+
+    //!
+    //! \brief Delete the copy constructor.
+    //!
+    LazilyDeserializedEngine(LazilyDeserializedEngine const& other) = delete;
+
+    //!
+    //! \brief Get the pointer to the ICudaEngine. Triggers deserialization if it has not already been done.
+    //!
+    nvinfer1::ICudaEngine* get();
+
+    //!
+    //! \brief Get the pointer to the ICudaEngine and release the ownership.
+    //!
+    nvinfer1::ICudaEngine* release();
+
+    //!
+    //! \brief Get the underlying blob storing the serialized engine.
+    //!
+    EngineBlob const getBlob() const
+    {
+        ASSERT((!mFileReader || !mFileReader->isOpen())
+            && "Attempting to access the blob when there is an open file reader!");
+        if (!mEngineBlob.empty())
+        {
+            return EngineBlob{const_cast<void*>(static_cast<void const*>(mEngineBlob.data())), mEngineBlob.size()};
+        }
+        if (mEngineBlobHostMemory.get() != nullptr && mEngineBlobHostMemory->size() > 0)
+        {
+            return EngineBlob{mEngineBlobHostMemory->data(), mEngineBlobHostMemory->size()};
+        }
+        ASSERT(false && "Attempting to access an empty engine!");
+        return EngineBlob{nullptr, 0};
+    }
+
+    //!
+    //! \brief Set the underlying blob storing the serialized engine without duplicating IHostMemory.
+    //!
+    void setBlob(std::unique_ptr<nvinfer1::IHostMemory>& data)
+    {
+        ASSERT(data.get() && data->size() > 0);
+        mEngineBlobHostMemory = std::move(data);
+        mEngine.reset();
+    }
+
+    //!
+    //! \brief Set the underlying blob storing the serialized engine without duplicating vector memory.
+    //!
+    void setBlob(std::vector<uint8_t>&& engineBlob)
+    {
+        mEngineBlob = std::move(engineBlob);
+        mEngine.reset();
+    }
+
+    //!
+    //! \brief Release the underlying blob without deleting the deserialized engine.
+    //!
+    void releaseBlob()
+    {
+        mEngineBlob.clear();
+        mEngineBlobHostMemory.reset();
+    }
+
+    //!
+    //! \brief Get the file stream reader used for deserialization.
+    //!
+    samplesCommon::FileStreamReader& getFileReader()
+    {
+        ASSERT(mFileReader);
+        return *mFileReader;
+    }
+
+    //!
+    //! \brief Get whether safe mode is enabled.
+    //!
+    bool isSafe()
+    {
+        return mIsSafe;
+    }
+
+    void setDynamicPlugins(std::vector<std::string> const& dynamicPlugins)
+    {
+        mDynamicPlugins = dynamicPlugins;
+    }
+
+private:
+    bool mIsSafe{false};
+    bool mVersionCompatible{false};
+    int32_t mDLACore{-1};
+    std::vector<uint8_t> mEngineBlob;
+    std::unique_ptr<samplesCommon::FileStreamReader> mFileReader;
+
+    // Directly use the host memory of a serialized engine instead of duplicating the engine in CPU memory.
+    std::unique_ptr<nvinfer1::IHostMemory> mEngineBlobHostMemory;
+
+    std::string mTempdir{};
+    nvinfer1::TempfileControlFlags mTempfileControls{getTempfileControlDefaults()};
+    std::string mLeanDLLPath{};
+    std::vector<std::string> mDynamicPlugins;
+
+    //! \name Owned TensorRT objects
+    //! Per TensorRT object lifetime requirements as outlined in the developer guide,
+    //! the runtime must remain live while any engines created by the runtime are live.
+    //! DO NOT ADJUST the declaration order here: runtime -> (engine).
+    //! Destruction occurs in reverse declaration order: (engine) -> runtime.
+    //!@{
+
+    //! The runtime used to track the parent of mRuntime if one exists.
+    //! Needed to load mRuntime if lean.so is supplied through a file system path.
+    std::unique_ptr<nvinfer1::IRuntime> mParentRuntime{};
+
+    //! The runtime that is used to deserialize the engine.
+    std::unique_ptr<nvinfer1::IRuntime> mRuntime{};
+
+    //! If mIsSafe is false, this points to the deserialized std engine.
+    std::unique_ptr<nvinfer1::ICudaEngine> mEngine{};
+
+    //!@}
+};
+
+struct BuildEnvironment
+{
+    BuildEnvironment() = delete;
+    BuildEnvironment(BuildEnvironment const& other) = delete;
+    BuildEnvironment(BuildEnvironment&& other) = delete;
+    BuildEnvironment(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir,
+        nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath = "")
+        : engine(isSafe, versionCompatible, DLACore, tempdir, tempfileControls, leanDLLPath)
+    {
+    }
+
+    //! \name Owned TensorRT objects
+    //! Per TensorRT object lifetime requirements as outlined in the developer guide,
+    //! factory objects must remain live while the objects created by those factories
+    //! are live (with the exception of builder -> engine).
+    //! DO NOT ADJUST the declaration order here: builder -> network -> parser.
+    //! Destruction occurs in reverse declaration order: parser -> network -> builder.
+    //!@{
+
+    //! The builder used to build the engine.
+    std::unique_ptr<nvinfer1::IBuilder> builder;
+
+    //! The network used by the builder.
+    std::unique_ptr<nvinfer1::INetworkDefinition> network;
+
+    //! The parser used to specify the network.
+    Parser parser;
+
+    //! The engine.
+    LazilyDeserializedEngine engine;
+    //!@}
+};
 
 //!
 //! \brief Set up network and config
@@ -89,95 +259,63 @@ void dumpRefittable(nvinfer1::ICudaEngine& engine);
 //!
 //! \return Pointer to the engine loaded or nullptr if the operation failed
 //!
-nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err);
+nvinfer1::ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err);
 
 //!
 //! \brief Save an engine into a file
 //!
 //! \return boolean Return true if the engine was successfully saved
 //!
-bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err);
+bool saveEngine(nvinfer1::ICudaEngine const& engine, std::string const& fileName, std::ostream& err);
 
 //!
 //! \brief Create an engine from model or serialized file, and optionally save engine
 //!
 //! \return Pointer to the engine created or nullptr if the creation failed
 //!
-bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys,
-    BuildEnvironment& env, std::ostream& err);
-
-//!
-//! \brief Create an engine from model or serialized file, and optionally save engine
-//!
-//! \return Pointer to the engine created or nullptr if the creation failed
-//!
-inline TrtUniquePtr<nvinfer1::ICudaEngine> getEngine(
-    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err)
-{
-    BuildEnvironment env;
-    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
-    if (getEngineBuildEnv(model, build, sys, env, err))
-    {
-        engine.swap(env.engine);
-    }
-    return engine;
-}
+bool getEngineBuildEnv(
+    ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err);
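The declaration-order rules spelled out in the comments above are easy to get wrong, so here is the same constraint in miniature; this is an illustrative restatement, not code from this patch:

#include <memory>
#include "NvInfer.h"

// Members are destroyed bottom-up, so the engine is destroyed before the runtime
// that created it, which is exactly the order TensorRT requires.
struct OwnedTrtObjects
{
    std::unique_ptr<nvinfer1::IRuntime> runtime;   // declared first, destroyed last
    std::unique_ptr<nvinfer1::ICudaEngine> engine; // declared last, destroyed first
};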
 
 //!
 //! \brief Create a serialized network
 //!
 //! \return Pointer to a host memory for a serialized network
 //!
-nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
-    nvinfer1::INetworkDefinition& network, std::ostream& err);
+nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys,
+    nvinfer1::IBuilder& builder, nvinfer1::INetworkDefinition& network, std::ostream& err);
 
 //!
 //! \brief Transfer a model to a serialized network
 //!
 //! \return Pointer to a host memory for a serialized network
 //!
-nvinfer1::IHostMemory* modelToSerialized(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
+nvinfer1::IHostMemory* modelToSerialized(
+    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
 
 //!
 //! \brief Serialize a network and save it into a file
 //!
 //! \return boolean Return true if the network was successfully serialized and saved
 //!
-bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
+bool serializeAndSave(
+    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
 
 bool timeRefit(const nvinfer1::INetworkDefinition& network, nvinfer1::ICudaEngine& engine, bool multiThreading);
 
 //!
 //! \brief Set tensor scales from a calibration table
 //!
-void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector<IOFormat>& inputFormats,
-    const std::vector<IOFormat>& outputFormats, const std::string& calibrationFile);
+void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector<IOFormat> const& inputFormats,
+    std::vector<IOFormat> const& outputFormats, std::string const& calibrationFile);
 
 //!
 //! \brief Check if the safe runtime is loaded.
 //!
 bool hasSafeRuntime();
 
-//!
-//! \brief Create a safe runtime object if the dynamic library is loaded.
-//!
-nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept;
-
-//!
-//! \brief Check if consistency checker is loaded.
-//!
-bool hasConsistencyChecker();
+bool loadStreamingEngineToBuildEnv(std::string const& engine, BuildEnvironment& env, std::ostream& err);
 
-//!
-//! \brief Create a consistency checker object if the dynamic library is loaded.
-//!
-nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker(
-    nvinfer1::ILogger& logger, nvinfer1::IHostMemory const* engine) noexcept;
-
-//!
-//! \brief Run consistency check on serialized engine.
-//!
-bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize);
+bool loadEngineToBuildEnv(std::string const& engine, BuildEnvironment& env, std::ostream& err);
 
 } // namespace sample
 
 #endif // TRT_SAMPLE_ENGINES_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h b/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h
new file mode 100644
index 00000000..cc8bf1b9
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h
@@ -0,0 +1,101 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_ENTRYPOINTS_H +#define TRT_SAMPLE_ENTRYPOINTS_H + +//! \file sampleEntrypoints.h +//! +//! Declares and conditionally defines entrypoints needed to create base TensorRT objects, depending +//! on whether the given sample uses TRT at link time or dynamically. Since common code is built once +//! and shared across all samples (both link-time and dynamic TRT), it does not define these entrypoints, +//! so each sample must define them individually. +//! +//! Samples that use TRT at link time can define DEFINE_TRT_ENTRYPOINTS before including this header to +//! pick up the definitions here. + +#include "NvInfer.h" +#include "NvOnnxParser.h" +#include "logger.h" + +extern nvinfer1::IBuilder* createBuilder(); +extern nvinfer1::IRuntime* createRuntime(); +extern nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine); + +extern nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network); + +#if !defined(DEFINE_TRT_ENTRYPOINTS) +#define DEFINE_TRT_ENTRYPOINTS 0 +#endif + +// Allow opting out of individual entrypoints that are unused by the sample +#if !defined(DEFINE_TRT_BUILDER_ENTRYPOINT) +#define DEFINE_TRT_BUILDER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_RUNTIME_ENTRYPOINT) +#define DEFINE_TRT_RUNTIME_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_REFITTER_ENTRYPOINT) +#define DEFINE_TRT_REFITTER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_ONNX_PARSER_ENTRYPOINT) +#define DEFINE_TRT_ONNX_PARSER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT) +#define DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT 1 +#endif + +#if DEFINE_TRT_ENTRYPOINTS +nvinfer1::IBuilder* createBuilder() +{ +#if DEFINE_TRT_BUILDER_ENTRYPOINT + return nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRuntime* createRuntime() +{ +#if DEFINE_TRT_RUNTIME_ENTRYPOINT + return nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine) +{ +#if DEFINE_TRT_REFITTER_ENTRYPOINT + return nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network) +{ +#if DEFINE_TRT_ONNX_PARSER_ENTRYPOINT + return nvonnxparser::createParser(network, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +#endif // DEFINE_TRT_ENTRYPOINTS + +#endif // TRT_SAMPLE_ENTRYPOINTS_H diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ b/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ new file mode 100644 index 00000000..ca0098d4 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ @@ -0,0 +1,1622 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <cuda_profiler_api.h>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <numeric>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#if defined(__QNX__)
+#include <sys/neutrino.h>
+#include <sys/syspage.h>
+#endif
+
+#include "NvInfer.h"
+
+#include "ErrorRecorder.h"
+#include "bfloat16.h"
+#include "logger.h"
+#include "sampleDevice.h"
+#include "sampleEngines.h"
+#include "sampleInference.h"
+#include "sampleOptions.h"
+#include "sampleReporting.h"
+#include "sampleUtils.h"
+using namespace nvinfer1;
+namespace sample
+{
+
+template <typename TMapType, typename TEngineType>
+bool validateTensorNames(TMapType const& map, TEngineType const* engine, int32_t const endBindingIndex)
+{
+    // Check if the provided input tensor names match the input tensors of the engine.
+    // Throw an error if the provided input tensor names cannot be found because it implies a potential typo.
+    for (auto const& item : map)
+    {
+        bool tensorNameFound{false};
+        for (int32_t b = 0; b < endBindingIndex; ++b)
+        {
+            auto const tensorName = engine->getIOTensorName(b);
+            auto const tensorIOMode = engine->getTensorIOMode(tensorName);
+            if (tensorIOMode == nvinfer1::TensorIOMode::kINPUT && matchStringWithOneWildcard(item.first, tensorName))
+            {
+                tensorNameFound = true;
+                break;
+            }
+        }
+        if (!tensorNameFound)
+        {
+            sample::gLogError << "Cannot find input tensor with name \"" << item.first << "\" in the engine bindings! "
+                              << "Please make sure the input tensor names are correct." << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+template <typename TEngineType>
+class FillBindingClosure
+{
+private:
+    using InputsMap = std::unordered_map<std::string, std::string>;
+    using BindingsVector = std::vector<std::unique_ptr<Bindings>>;
+
+    TEngineType const* mEngine;
+    nvinfer1::IExecutionContext const* mContext;
+    InputsMap const& inputs;
+    BindingsVector& bindings;
+    int32_t batch;
+    int32_t endBindingIndex;
+    int32_t profileIndex;
+
+    void fillOneBinding(TensorInfo const& tensorInfo)
+    {
+        auto const name = tensorInfo.name;
+        auto const* bindingInOutStr = tensorInfo.isInput ? "Input" : "Output";
+        for (auto& binding : bindings)
+        {
+            auto const input = findPlausible(inputs, name);
+            if (tensorInfo.isInput && input != inputs.end())
+            {
+                sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl;
+                binding->addBinding(tensorInfo, input->second);
+            }
+            else
+            {
+                if (tensorInfo.isInput)
+                {
+                    sample::gLogInfo << "Using random values for input " << name << std::endl;
+                }
+                binding->addBinding(tensorInfo);
+            }
+            if (tensorInfo.isDynamic)
+            {
+                sample::gLogInfo << bindingInOutStr << " binding for " << name
+                                 << " is dynamic and will be created during execution using OutputAllocator."
+                                 << std::endl;
+            }
+            else
+            {
+                sample::gLogInfo << bindingInOutStr << " binding for " << name << " with dimensions " << tensorInfo.dims
+                                 << " is created." << std::endl;
+            }
+        }
+    }
+
+    bool fillAllBindings(int32_t batch, int32_t endBindingIndex)
+    {
+        if (!validateTensorNames(inputs, mEngine, endBindingIndex))
+        {
+            sample::gLogError << "Invalid tensor names found in --loadInputs flag." << std::endl;
+            return false;
+        }
+        for (int32_t b = 0; b < endBindingIndex; b++)
+        {
+            TensorInfo tensorInfo;
+            tensorInfo.bindingIndex = b;
+            getTensorInfo(tensorInfo);
+            tensorInfo.updateVolume(batch);
+            fillOneBinding(tensorInfo);
+        }
+        return true;
+    }
+
+    void getTensorInfo(TensorInfo& tensorInfo);
+
+public:
+    FillBindingClosure(TEngineType const* _engine, nvinfer1::IExecutionContext const* _context,
+        InputsMap const& _inputs, BindingsVector& _bindings, int32_t _batch, int32_t _endBindingIndex,
+        int32_t _profileIndex)
+        : mEngine(_engine)
+        , mContext(_context)
+        , inputs(_inputs)
+        , bindings(_bindings)
+        , batch(_batch)
+        , endBindingIndex(_endBindingIndex)
+        , profileIndex(_profileIndex)
+    {
+    }
+
+    bool operator()()
+    {
+        return fillAllBindings(batch, endBindingIndex);
+    }
+};
+
+template <>
+void FillBindingClosure<nvinfer1::ICudaEngine>::getTensorInfo(TensorInfo& tensorInfo)
+{
+    auto const b = tensorInfo.bindingIndex;
+    auto const name = mEngine->getIOTensorName(b);
+    tensorInfo.name = name;
+    tensorInfo.dims = mContext->getTensorShape(name);
+    tensorInfo.isDynamic = std::any_of(
+        tensorInfo.dims.d, tensorInfo.dims.d + tensorInfo.dims.nbDims, [](int32_t dim) { return dim == -1; });
+    tensorInfo.comps = mEngine->getTensorComponentsPerElement(name, profileIndex);
+    tensorInfo.strides = mContext->getTensorStrides(name);
+    tensorInfo.vectorDimIndex = mEngine->getTensorVectorizedDim(name, profileIndex);
+    tensorInfo.isInput = mEngine->getTensorIOMode(name) == TensorIOMode::kINPUT;
+    tensorInfo.dataType = mEngine->getTensorDataType(name);
+}
+
+namespace
+{
+bool allocateContextMemory(InferenceEnvironment& iEnv, InferenceOptions const& inference)
+{
+    auto* engine = iEnv.engine.get();
+    iEnv.deviceMemory.resize(inference.infStreams);
+    // Delay context memory allocation until input shapes are specified because runtime allocation would require actual
+    // input shapes.
+    for (int32_t i = 0; i < inference.infStreams; ++i)
+    {
+        auto const& ec = iEnv.contexts.at(i);
+        if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kSTATIC)
+        {
+            sample::gLogInfo << "Created execution context with device memory size: "
+                             << (engine->getDeviceMemorySize() / 1.0_MiB) << " MiB" << std::endl;
+        }
+        else
+        {
+            size_t sizeToAlloc{0};
+            const char* allocReason{nullptr};
+            if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kPROFILE)
+            {
+                auto const p = inference.optProfileIndex;
+                sizeToAlloc = engine->getDeviceMemorySizeForProfile(p);
+                allocReason = "current profile";
+            }
+            else if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kRUNTIME)
+            {
+                sizeToAlloc = ec->updateDeviceMemorySizeForShapes();
+                allocReason = "current input shapes";
+            }
+            else
+            {
+                sample::gLogError << "Unrecognizable memory allocation strategy."
<< std::endl; + return false; + } + iEnv.deviceMemory.at(i) = TrtDeviceBuffer(sizeToAlloc); + ec->setDeviceMemoryV2(iEnv.deviceMemory.at(i).get(), iEnv.deviceMemory.at(i).getSize()); + sample::gLogInfo << "Maximum device memory size across all profiles: " + << (engine->getDeviceMemorySizeV2() / 1.0_MiB) << " MiB" << std::endl; + sample::gLogInfo << "Only allocated device memory enough for " << allocReason << ": " + << (sizeToAlloc / 1.0_MiB) << " MiB" << std::endl; + } + } + return true; +} +} // namespace + +bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system) +{ +#if TRT_WINML + int32_t const isIntegrated{}; +#else + int32_t device{}; + cudaCheck(cudaGetDevice(&device)); + + cudaDeviceProp properties; + cudaCheck(cudaGetDeviceProperties(&properties, device)); + int32_t const isIntegrated{properties.integrated}; +#endif + // Use managed memory on integrated devices when transfers are skipped + // and when it is explicitly requested on the commandline. + bool useManagedMemory{(inference.skipTransfers && isIntegrated) || inference.useManaged}; + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + + using FillStdBindings = FillBindingClosure; + + auto* engine = iEnv.engine.get(); + SMP_RETVAL_IF_FALSE(engine != nullptr, "Got invalid engine!", false, sample::gLogError); + + // Release serialized blob to save memory space. + iEnv.engine.releaseBlob(); + + // Setup weight streaming if enabled + if (engine->getStreamableWeightsSize() > 0) + { + auto const& budget = inference.weightStreamingBudget; + int64_t wsBudget = budget.bytes; + if (budget.percent != 100.0) + { + double const percent = budget.percent; + ASSERT(percent < 100.0); + auto const max = engine->getStreamableWeightsSize(); + wsBudget = (max >= 0) ? (percent / 100) * (max) : WeightStreamingBudget::kDISABLE; + } + + if (wsBudget == WeightStreamingBudget::kDISABLE) + { + wsBudget = engine->getStreamableWeightsSize(); + } + else if (wsBudget == WeightStreamingBudget::kAUTOMATIC) + { + wsBudget = engine->getWeightStreamingAutomaticBudget(); + } + ASSERT(wsBudget >= 0); + bool success = engine->setWeightStreamingBudgetV2(wsBudget); + SMP_RETVAL_IF_FALSE(success, "Failed to set weight streaming limit!", false, sample::gLogError); + switch (wsBudget) + { + case WeightStreamingBudget::kDISABLE: + { + sample::gLogInfo << "Weight streaming has been disabled at runtime." << std::endl; + break; + } + + case WeightStreamingBudget::kAUTOMATIC: + { + sample::gLogInfo << "The weight streaming budget will automatically be chosen by TensorRT." << std::endl; + break; + } + default: + { + sample::gLogInfo << "Weight streaming is enabled with a device memory limit of " << wsBudget << " bytes." + << std::endl; + break; + } + } + } + + int32_t const nbOptProfiles = engine->getNbOptimizationProfiles(); + + if (inference.optProfileIndex >= nbOptProfiles) + { + sample::gLogError << "Selected profile index " << inference.optProfileIndex + << " exceeds the number of profiles that the engine holds. " << std::endl; + return false; + } + + if (nbOptProfiles > 1 && !inference.setOptProfile) + { + sample::gLogWarning << nbOptProfiles + << " profiles detected but not set. Running with profile 0. Please use " + "--dumpOptimizationProfile to see all available profiles." 
+ << std::endl; + } + + cudaStream_t setOptProfileStream; + CHECK(cudaStreamCreate(&setOptProfileStream)); + + for (int32_t s = 0; s < inference.infStreams; ++s) + { + IExecutionContext* ec{nullptr}; + if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kSTATIC) + { + // Let TRT pre-allocate and manage the memory. + ec = engine->createExecutionContext(); + } + else + { + // Allocate based on the current profile or runtime shapes. + ec = engine->createExecutionContext(ExecutionContextAllocationStrategy::kUSER_MANAGED); + } + if (ec == nullptr) + { + sample::gLogError << "Unable to create execution context for stream " << s << "." << std::endl; + return false; + } + ec->setNvtxVerbosity(inference.nvtxVerbosity); + +#if !TRT_WINML + int32_t const persistentCacheLimit + = samplesCommon::getMaxPersistentCacheSize() * inference.persistentCacheRatio; + sample::gLogInfo << "Setting persistentCacheLimit to " << persistentCacheLimit << " bytes." << std::endl; + ec->setPersistentCacheLimit(persistentCacheLimit); +#endif + + auto setProfile = ec->setOptimizationProfileAsync(inference.optProfileIndex, setOptProfileStream); + CHECK(cudaStreamSynchronize(setOptProfileStream)); + + if (!setProfile) + { + sample::gLogError << "Set optimization profile failed. " << std::endl; + if (inference.infStreams > 1) + { + sample::gLogError + << "Please ensure that the engine is built with preview feature profileSharing0806 enabled. " + << std::endl; + } + return false; + } + + iEnv.contexts.emplace_back(ec); + iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); + } + + CHECK(cudaStreamDestroy(setOptProfileStream)); + + if (iEnv.profiler) + { + iEnv.contexts.front()->setProfiler(iEnv.profiler.get()); + // Always run reportToProfiler() after enqueue launch + iEnv.contexts.front()->setEnqueueEmitsProfile(false); + } + + int32_t const endBindingIndex = engine->getNbIOTensors(); + + // Make sure that the tensor names provided in command-line args actually exist in any of the engine bindings + // to avoid silent typos. + if (!validateTensorNames(inference.shapes, engine, endBindingIndex)) + { + sample::gLogError << "Invalid tensor names found in --shapes flag." << std::endl; + return false; + } + + for (int32_t b = 0; b < endBindingIndex; ++b) + { + auto const& name = engine->getIOTensorName(b); + auto const& mode = engine->getTensorIOMode(name); + if (mode == TensorIOMode::kINPUT) + { + Dims const dims = iEnv.contexts.front()->getTensorShape(name); + bool isShapeInferenceIO{false}; + isShapeInferenceIO = engine->isShapeInferenceIO(name); + bool const hasRuntimeDim = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + auto const shape = findPlausible(inference.shapes, name); + if (hasRuntimeDim || isShapeInferenceIO) + { + // Set shapeData to either dimensions of the input (if it has a dynamic shape) + // or set to values of the input (if it is an input shape tensor). + std::vector shapeData; + + if (shape == inference.shapes.end()) + { + // No information provided. Use default value for missing data. + constexpr int32_t kDEFAULT_VALUE = 1; + if (isShapeInferenceIO) + { + // Set shape tensor to all ones. + shapeData.assign(volume(dims, 0, dims.nbDims), kDEFAULT_VALUE); + sample::gLogWarning << "Values missing for input shape tensor: " << name + << "Automatically setting values to: " << shapeData << std::endl; + } + else + { + // Use default value for unspecified runtime dimensions. 
+                    shapeData.resize(dims.nbDims);
+                    std::transform(dims.d, dims.d + dims.nbDims, shapeData.begin(),
+                        [&](int32_t dimension) { return dimension >= 0 ? dimension : kDEFAULT_VALUE; });
+                    sample::gLogWarning << "Shape missing for input with dynamic shape: " << name
+                                        << ". Automatically setting shape to: " << shapeData << std::endl;
+                }
+            }
+            else if (inference.inputs.count(shape->first) && isShapeInferenceIO)
+            {
+                // Load shape tensor from file.
+                int64_t const size = volume(dims, 0, dims.nbDims);
+                shapeData.resize(size);
+                auto const& filename = inference.inputs.at(shape->first);
+                auto dst = reinterpret_cast<char*>(shapeData.data());
+                loadFromFile(filename, dst, size * sizeof(decltype(shapeData)::value_type));
+            }
+            else
+            {
+                shapeData = shape->second;
+            }
+
+            int32_t* shapeTensorData{nullptr};
+            if (isShapeInferenceIO)
+            {
+                // Save the data in iEnv, in a way that its address does not change
+                // before enqueueV3 is called.
+                iEnv.inputShapeTensorValues.emplace_back(shapeData);
+                shapeTensorData = iEnv.inputShapeTensorValues.back().data();
+            }
+
+            for (auto& c : iEnv.contexts)
+            {
+                if (isShapeInferenceIO)
+                {
+                    sample::gLogInfo << "Set input shape tensor " << name << " to: " << shapeData << std::endl;
+                    if (!c->setTensorAddress(name, shapeTensorData))
+                    {
+                        return false;
+                    }
+                }
+                else
+                {
+                    sample::gLogInfo << "Set shape of input tensor " << name << " to: " << shapeData
+                                     << std::endl;
+                    if (!c->setInputShape(name, toDims(shapeData)))
+                    {
+                        return false;
+                    }
+                }
+            }
+        }
+        else if (nbOptProfiles && shape != inference.shapes.end())
+        {
+            // Check if the provided shape matches the static dimensions in the engine.
+            for (auto& c : iEnv.contexts)
+            {
+                if (!c->setInputShape(name, toDims(shape->second)))
+                {
+                    sample::gLogError << "The engine was built with static shapes for input tensor " << name
+                                      << " but the provided shapes do not match the static shapes!" << std::endl;
+                    return false;
+                }
+            }
+        }
+    }
+}
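The loop above distinguishes ordinary dynamic-shape inputs from shape-tensor inputs. A condensed sketch of the two calls involved; the tensor names and dimensions are invented for illustration, and `context` is assumed to be a valid nvinfer1::IExecutionContext*:

// Dynamic shapes are set as dimensions; shape tensors are bound as values.
nvinfer1::Dims4 const runtimeDims{1, 3, 640, 640};
if (!context->setInputShape("images", runtimeDims))
{
    // The dimensions fall outside the optimization profile.
}

// The pointed-to values must stay valid until enqueueV3() has been called.
static std::vector<int32_t> sizeValues{640, 640};
if (!context->setTensorAddress("sizes", sizeValues.data()))
{
    // The tensor name is unknown or the address is invalid.
}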
+
+    // Create the debug listener and turn on debug states if the client requested dumping debug tensors.
+    if (!inference.debugTensorFileNames.empty())
+    {
+        iEnv.listener.reset(new DebugTensorWriter(inference.debugTensorFileNames));
+        iEnv.contexts.front()->setDebugListener(iEnv.listener.get());
+        for (auto const& s : inference.debugTensorFileNames)
+        {
+            iEnv.contexts.front()->setTensorDebugState(s.first.c_str(), true);
+        }
+    }
+
+    if (!allocateContextMemory(iEnv, inference))
+    {
+        return false;
+    }
+
+    auto const* context = iEnv.contexts.front().get();
+    return FillStdBindings(
+        engine, context, inference.inputs, iEnv.bindings, 1, endBindingIndex, inference.optProfileIndex)();
+}
+
+TaskInferenceEnvironment::TaskInferenceEnvironment(
+    std::string engineFile, InferenceOptions inference, int32_t deviceId, int32_t DLACore, int32_t bs)
+    : iOptions(inference)
+    , device(deviceId)
+    , batch(bs)
+{
+    BuildEnvironment bEnv(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults());
+    loadEngineToBuildEnv(engineFile, bEnv, sample::gLogError);
+    std::unique_ptr<InferenceEnvironment> tmp(new InferenceEnvironment(bEnv));
+    iEnv = std::move(tmp);
+
+    cudaCheck(cudaSetDevice(device));
+    SystemOptions system{};
+    system.device = device;
+    system.DLACore = DLACore;
+    if (!setUpInference(*iEnv, iOptions, system))
+    {
+        sample::gLogError << "Inference set up failed" << std::endl;
+    }
+}
+namespace
+{
+
+#if defined(__QNX__)
+using TimePoint = double;
+#else
+using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
+#endif
+
+TimePoint getCurrentTime()
+{
+#if defined(__QNX__)
+    uint64_t const currentCycles = ClockCycles();
+    uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec;
+    // Return the current timestamp in ms.
+    return static_cast<double>(currentCycles) * 1000. / cyclesPerSecond;
+#else
+    return std::chrono::high_resolution_clock::now();
+#endif
+}
+
+//!
+//! \struct SyncStruct
+//! \brief Threads synchronization structure
+//!
+struct SyncStruct
+{
+    std::mutex mutex;
+    TrtCudaStream mainStream;
+    TrtCudaEvent gpuStart{cudaEventBlockingSync};
+    TimePoint cpuStart{};
+    float sleep{};
+};
+
+struct Enqueue
+{
+    explicit Enqueue(nvinfer1::IExecutionContext& context)
+        : mContext(context)
+    {
+    }
+
+    nvinfer1::IExecutionContext& mContext;
+};
+
+//!
+//! \class EnqueueExplicit
+//! \brief Functor to enqueue inference with explicit batch
+//!
+class EnqueueExplicit : private Enqueue
+{
+
+public:
+    explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, Bindings const& bindings)
+        : Enqueue(context)
+        , mBindings(bindings)
+    {
+        ASSERT(mBindings.setTensorAddresses(mContext));
+    }
+
+    bool operator()(TrtCudaStream& stream) const
+    {
+        try
+        {
+            bool const result = mContext.enqueueV3(stream.get());
+            // Collect layer timing info from the current profile index of the execution context, except under
+            // capturing mode.
+            if (!isStreamCapturing(stream) && mContext.getProfiler() && !mContext.getEnqueueEmitsProfile()
+                && !mContext.reportToProfiler())
+            {
+                gLogWarning << "Failed to collect layer timing info from previous enqueueV3()" << std::endl;
+            }
+            return result;
+        }
+        catch (const std::exception&)
+        {
+            return false;
+        }
+        return false;
+    }
+
+private:
+    // Helper function to check if a stream is in capturing mode.
+    bool isStreamCapturing(TrtCudaStream& stream) const
+    {
+        cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone};
+        cudaCheck(cudaStreamIsCapturing(stream.get(), &status));
+        return status != cudaStreamCaptureStatusNone;
+    }
+
+    Bindings const& mBindings;
+};
+
+//!
+//! \class EnqueueGraph
+//! \brief Functor to enqueue inference from a CUDA graph
+//!
+class EnqueueGraph
+{
+
+public:
+    explicit EnqueueGraph(nvinfer1::IExecutionContext& context, TrtCudaGraph& graph)
+        : mGraph(graph)
+        , mContext(context)
+    {
+    }
+
+    bool operator()(TrtCudaStream& stream) const
+    {
+        if (mGraph.launch(stream))
+        {
+            // Collecting layer timing info from current profile index of execution context
+            if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler())
+            {
+                gLogWarning << "Failed to collect layer timing info from previous CUDA graph launch" << std::endl;
+            }
+            return true;
+        }
+        return false;
+    }
+
+    TrtCudaGraph& mGraph;
+    nvinfer1::IExecutionContext& mContext;
+};
+
+//!
+//! \class EnqueueGraphSafe
+//! \brief Functor to enqueue inference from a CUDA graph
+//!
+class EnqueueGraphSafe
+{
+
+public:
+    explicit EnqueueGraphSafe(TrtCudaGraph& graph)
+        : mGraph(graph)
+    {
+    }
+
+    bool operator()(TrtCudaStream& stream) const
+    {
+        return mGraph.launch(stream);
+    }
+
+    TrtCudaGraph& mGraph;
+};
+
+using EnqueueFunction = std::function<bool(TrtCudaStream&)>;
+
+enum class StreamType : int32_t
+{
+    kINPUT = 0,
+    kCOMPUTE = 1,
+    kOUTPUT = 2,
+    kNUM = 3
+};
+
+enum class EventType : int32_t
+{
+    kINPUT_S = 0,
+    kINPUT_E = 1,
+    kCOMPUTE_S = 2,
+    kCOMPUTE_E = 3,
+    kOUTPUT_S = 4,
+    kOUTPUT_E = 5,
+    kNUM = 6
+};
+
+using MultiStream = std::array<TrtCudaStream, static_cast<int32_t>(StreamType::kNUM)>;
+
+using MultiEvent = std::array<std::unique_ptr<TrtCudaEvent>, static_cast<int32_t>(EventType::kNUM)>;
+
+using EnqueueTimes = std::array<TimePoint, 2>;
+
+//!
+//! \class Iteration
+//! \brief Inference iteration and streams management
+//!
+class Iteration
+{
+
+public:
+    Iteration(int32_t id, InferenceOptions const& inference, nvinfer1::IExecutionContext& context, Bindings& bindings)
+        : mBindings(bindings)
+        , mStreamId(id)
+        , mDepth(1 + inference.overlap)
+        , mActive(mDepth)
+        , mEvents(mDepth)
+        , mEnqueueTimes(mDepth)
+        , mContext(&context)
+    {
+        for (int32_t d = 0; d < mDepth; ++d)
+        {
+            for (int32_t e = 0; e < static_cast<int32_t>(EventType::kNUM); ++e)
+            {
+                mEvents[d][e].reset(new TrtCudaEvent(!inference.spin));
+            }
+        }
+        createEnqueueFunction(inference, context, bindings);
+    }
+
+    bool query(bool skipTransfers)
+    {
+        if (mActive[mNext])
+        {
+            return true;
+        }
+
+        if (!skipTransfers)
+        {
+            record(EventType::kINPUT_S, StreamType::kINPUT);
+            setInputData(false);
+            record(EventType::kINPUT_E, StreamType::kINPUT);
+            wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute
+        }
+
+        record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE);
+        recordEnqueueTime();
+        if (!mEnqueue(getStream(StreamType::kCOMPUTE)))
+        {
+            return false;
+        }
+        recordEnqueueTime();
+        record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE);
+
+        if (!skipTransfers)
+        {
+            wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA
+            record(EventType::kOUTPUT_S, StreamType::kOUTPUT);
+            fetchOutputData(false);
+            record(EventType::kOUTPUT_E, StreamType::kOUTPUT);
+        }
+
+        mActive[mNext] = true;
+        moveNext();
+        return true;
+    }
+
+    float sync(
+        TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector<InferenceTrace>& trace, bool skipTransfers)
+    {
+        if (mActive[mNext])
+        {
+            if (skipTransfers)
+            {
+                getEvent(EventType::kCOMPUTE_E).synchronize();
+            }
+            else
+            {
+                getEvent(EventType::kOUTPUT_E).synchronize();
+            }
+            trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers));
+            mActive[mNext] = false;
+            return getEvent(EventType::kCOMPUTE_S) - gpuStart;
+        }
+        return 0;
+    }
+
+    void syncAll(
+        TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector<InferenceTrace>& trace, bool skipTransfers)
+    {
+        for (int32_t d = 0; d < mDepth;
++d) + { + sync(cpuStart, gpuStart, trace, skipTransfers); + moveNext(); + } + } + + void wait(TrtCudaEvent& gpuStart) + { + getStream(StreamType::kINPUT).wait(gpuStart); + } + + void setInputData(bool sync) + { + mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kINPUT).synchronize(); + } + } + + void fetchOutputData(bool sync) + { + mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kOUTPUT).synchronize(); + } + } + +private: + void moveNext() + { + mNext = mDepth - 1 - mNext; + } + + TrtCudaStream& getStream(StreamType t) + { + return mStream[static_cast(t)]; + } + + TrtCudaEvent& getEvent(EventType t) + { + return *mEvents[mNext][static_cast(t)]; + } + + void record(EventType e, StreamType s) + { + getEvent(e).record(getStream(s)); + } + + void recordEnqueueTime() + { + mEnqueueTimes[mNext][enqueueStart] = getCurrentTime(); + enqueueStart = 1 - enqueueStart; + } + + TimePoint getEnqueueTime(bool start) + { + return mEnqueueTimes[mNext][start ? 0 : 1]; + } + + void wait(EventType e, StreamType s) + { + getStream(s).wait(getEvent(e)); + } + + InferenceTrace getTrace(TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, bool skipTransfers) + { + float is + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_S) - gpuStart; + float ie + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_E) - gpuStart; + float os + = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_S) - gpuStart; + float oe + = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_E) - gpuStart; + + return InferenceTrace(mStreamId, + std::chrono::duration(getEnqueueTime(true) - cpuStart).count(), + std::chrono::duration(getEnqueueTime(false) - cpuStart).count(), is, ie, + getEvent(EventType::kCOMPUTE_S) - gpuStart, getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe); + } + + void createEnqueueFunction( + InferenceOptions const& inference, nvinfer1::IExecutionContext& context, Bindings& bindings) + { + mEnqueue = EnqueueFunction(EnqueueExplicit(context, mBindings)); + if (inference.graph) + { + sample::gLogInfo << "Capturing CUDA graph for the current execution context" << std::endl; + + TrtCudaStream& stream = getStream(StreamType::kCOMPUTE); + // Avoid capturing initialization calls by executing the enqueue function at least + // once before starting CUDA graph capture. + auto const ret = mEnqueue(stream); + if (!ret) + { + throw std::runtime_error("Inference enqueue failed."); + } + stream.synchronize(); + + mGraph.beginCapture(stream); + // The built TRT engine may contain operations that are not permitted under CUDA graph capture mode. + // When the stream is capturing, the enqueue call may return false if the current CUDA graph capture fails. + if (mEnqueue(stream)) + { + mGraph.endCapture(stream); + mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph)); + sample::gLogInfo << "Successfully captured CUDA graph for the current execution context" << std::endl; + } + else + { + mGraph.endCaptureOnError(stream); + // Ensure any CUDA error has been cleaned up. + cudaCheck(cudaGetLastError()); + sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under " + "CUDA graph capture mode." 
+ << std::endl; + sample::gLogWarning << "The specified --useCudaGraph flag has been ignored. The inference will be " + "launched without using CUDA graph launch." + << std::endl; + } + } + } + + Bindings& mBindings; + + TrtCudaGraph mGraph; + EnqueueFunction mEnqueue; + + int32_t mStreamId{0}; + int32_t mNext{0}; + int32_t mDepth{2}; // default to double buffer to hide DMA transfers + + std::vector mActive; + MultiStream mStream; + std::vector mEvents; + + int32_t enqueueStart{0}; + std::vector mEnqueueTimes; + nvinfer1::IExecutionContext* mContext{nullptr}; +}; + +bool inferenceLoop(std::vector>& iStreams, TimePoint const& cpuStart, + TrtCudaEvent const& gpuStart, int iterations, float maxDurationMs, float warmupMs, + std::vector& trace, bool skipTransfers, float idleMs) +{ + float durationMs = 0; + int32_t skip = 0; + + if (maxDurationMs == -1.F) + { + sample::gLogWarning << "--duration=-1 is specified, inference will run in an endless loop until" + << " aborted with CTRL-C (SIGINT)" << std::endl; + while (true) + { + for (auto& s : iStreams) + { + if (!s->query(skipTransfers)) + { + return false; + } + } + for (auto& s : iStreams) + { + s->sync(cpuStart, gpuStart, trace, skipTransfers); + } + } + } + + for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs; ++i) + { + for (auto& s : iStreams) + { + if (!s->query(skipTransfers)) + { + return false; + } + } + for (auto& s : iStreams) + { + durationMs = std::max(durationMs, s->sync(cpuStart, gpuStart, trace, skipTransfers)); + } + if (durationMs < warmupMs) // Warming up + { + if (durationMs) // Skip complete iterations + { + ++skip; + } + continue; + } + if (idleMs != 0.F) + { + std::this_thread::sleep_for(std::chrono::duration(idleMs)); + } + } + for (auto& s : iStreams) + { + s->syncAll(cpuStart, gpuStart, trace, skipTransfers); + } + return true; +} + +void inferenceExecution(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t const threadIdx, int32_t const streamsPerThread, int32_t device, + std::vector& trace) noexcept +{ + try + { + float warmupMs = inference.warmup; + float durationMs = -1.F; + if (inference.duration != -1.F) + { + durationMs = inference.duration * 1000.F + warmupMs; + } + + cudaCheck(cudaSetDevice(device)); + + std::vector> iStreams; + + for (int32_t s = 0; s < streamsPerThread; ++s) + { + int32_t const streamId{threadIdx * streamsPerThread + s}; + auto* iteration = new Iteration(streamId, inference, *iEnv.getContext(streamId), *iEnv.bindings[streamId]); + if (inference.skipTransfers) + { + iteration->setInputData(true); + } + iStreams.emplace_back(iteration); + } + + for (auto& s : iStreams) + { + s->wait(sync.gpuStart); + } + + std::vector localTrace; + if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs, + localTrace, inference.skipTransfers, inference.idle)) + { + sync.mutex.lock(); + iEnv.error = true; + sync.mutex.unlock(); + } + + if (inference.skipTransfers) + { + for (auto& s : iStreams) + { + s->fetchOutputData(true); + } + } + + sync.mutex.lock(); + trace.insert(trace.end(), localTrace.begin(), localTrace.end()); + sync.mutex.unlock(); + } + catch (...) 
+ { + sync.mutex.lock(); + iEnv.error = true; + sync.mutex.unlock(); + } +} + +inline std::thread makeThread(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t threadIdx, int32_t streamsPerThread, int32_t device, std::vector& trace) +{ + return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), std::ref(sync), threadIdx, + streamsPerThread, device, std::ref(trace)); +} + +} // namespace + +bool runInference( + InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace) +{ + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + cudaCheck(cudaProfilerStart()); + + trace.resize(0); + + SyncStruct sync; + sync.sleep = inference.sleep; + sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + // When multiple streams are used, trtexec can run inference in two modes: + // (1) if inference.threads is true, then run each stream on each thread. + // (2) if inference.threads is false, then run all streams on the same thread. + int32_t const numThreads = inference.threads ? inference.infStreams : 1; + int32_t const streamsPerThread = inference.threads ? 1 : inference.infStreams; + + std::vector threads; + for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx) + { + threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx, streamsPerThread, device, trace)); + } + for (auto& th : threads) + { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; + std::sort(trace.begin(), trace.end(), cmpTrace); + + return !iEnv.error; +} + +bool runMultiTasksInference(std::vector>& tEnvList) +{ + cudaCheck(cudaProfilerStart()); + cudaSetDeviceFlags(cudaDeviceScheduleSpin); + + SyncStruct sync; + sync.sleep = 0; + sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + std::vector threads; + for (size_t i = 0; i < tEnvList.size(); ++i) + { + auto& tEnv = tEnvList[i]; + threads.emplace_back(makeThread( + tEnv->iOptions, *(tEnv->iEnv), sync, /*threadIdx*/ 0, /*streamsPerThread*/ 1, tEnv->device, tEnv->trace)); + } + for (auto& th : threads) + { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; + for (auto& tEnv : tEnvList) + { + std::sort(tEnv->trace.begin(), tEnv->trace.end(), cmpTrace); + } + + return std::none_of(tEnvList.begin(), tEnvList.end(), + [](std::unique_ptr& tEnv) { return tEnv->iEnv->error; }); +} + +namespace +{ +size_t reportGpuMemory() +{ + static size_t prevFree{0}; + size_t free{0}; + size_t total{0}; + size_t newlyAllocated{0}; + cudaCheck(cudaMemGetInfo(&free, &total)); + sample::gLogInfo << "Free GPU memory = " << free / 1024.0_MiB << " GiB"; + if (prevFree != 0) + { + newlyAllocated = (prevFree - free); + sample::gLogInfo << ", newly allocated GPU memory = " << newlyAllocated / 1024.0_MiB << " GiB"; + } + sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB" << std::endl; + prevFree = free; + return newlyAllocated; +} +} // namespace + +//! Returns true if deserialization is slower than expected or fails. 
+//! Returns true if deserialization is slower than expected or fails.
+bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys)
+{
+    constexpr int32_t kNB_ITERS{20};
+    std::unique_ptr<nvinfer1::IRuntime> rt{createRuntime()};
+    std::unique_ptr<nvinfer1::ICudaEngine> engine;
+
+    SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError);
+
+    auto timeDeserializeFn = [&]() -> float {
+        bool deserializeOK{false};
+        engine.reset(nullptr);
+        auto startClock = std::chrono::high_resolution_clock::now();
+        SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError);
+
+        auto& reader = iEnv.engine.getFileReader();
+        reader.reset();
+        ASSERT(reader.isOpen());
+#if !TRT_WINML
+        for (auto const& pluginPath : sys.dynamicPlugins)
+        {
+            rt->getPluginRegistry().loadLibrary(pluginPath.c_str());
+        }
+#endif
+        engine.reset(rt->deserializeCudaEngine(reader));
+        deserializeOK = (engine != nullptr);
+        auto endClock = std::chrono::high_resolution_clock::now();
+        // return NAN if deserialization failed.
+        return deserializeOK ? std::chrono::duration<float, std::milli>(endClock - startClock).count() : NAN;
+    };
+
+    // Warmup the caches to make sure that cache thrashing isn't throwing off the results
+    {
+        sample::gLogInfo << "Begin deserialization warmup..." << std::endl;
+        for (int32_t i = 0, e = 2; i < e; ++i)
+        {
+            timeDeserializeFn();
+        }
+    }
+    sample::gLogInfo << "Begin deserialization engine timing..." << std::endl;
+    float const first = timeDeserializeFn();
+
+    // Check if first deserialization succeeded.
+    if (std::isnan(first))
+    {
+        sample::gLogError << "Engine deserialization failed." << std::endl;
+        return true;
+    }
+
+    sample::gLogInfo << "First deserialization time = " << first << " milliseconds" << std::endl;
+
+    // Record initial gpu memory state.
+    reportGpuMemory();
+
+    float totalTime{0.F};
+    for (int32_t i = 0; i < kNB_ITERS; ++i)
+    {
+        totalTime += timeDeserializeFn();
+    }
+    auto const averageTime = totalTime / kNB_ITERS;
+    // reportGpuMemory sometimes reports zero after a single deserialization of a small engine,
+    // so use the size of memory for all the iterations.
+    auto const totalEngineSizeGpu = reportGpuMemory();
+    sample::gLogInfo << "Total deserialization time = " << totalTime << " milliseconds in " << kNB_ITERS
+                     << " iterations, average time = " << averageTime << " milliseconds, first time = " << first
+                     << " milliseconds." << std::endl;
+    sample::gLogInfo << "Deserialization Bandwidth = " << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s" << std::endl;
+
+    // If the first deserialization is more than tolerance slower than
+    // the average deserialization, return true, which means an error occurred.
+    // The tolerance is set to 2x since the deserialization time is quick and susceptible
+    // to caching issues causing problems in the first timing.
+    auto const tolerance = 2.0F;
+    bool const isSlowerThanExpected = first > averageTime * tolerance;
+    if (isSlowerThanExpected)
+    {
+        sample::gLogInfo << "First deserialization time divided by average time is " << (first / averageTime)
+                         << ". Exceeds tolerance of " << tolerance << "x." << std::endl;
+    }
+    return isSlowerThanExpected;
+}
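The slow-first-deserialization heuristic above boils down to a single ratio test. A compilable distillation with hypothetical numbers (the 2x tolerance mirrors the code; nothing here is part of the patch):

    // Illustration of the first-vs-average tolerance check in timeDeserialize.
    #include <cstdio>

    bool firstRunSuspicious(float firstMs, float totalMs, int iters, float tolerance = 2.0F)
    {
        float const averageMs = totalMs / iters;
        return firstMs > averageMs * tolerance;
    }

    int main()
    {
        // 20 steady-state runs totalling 400 ms -> 20 ms average; 120 ms first run exceeds 2x.
        std::printf("suspicious: %d\n", firstRunSuspicious(120.F, 400.F, 20));
        return 0;
    }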
+
+std::string getLayerInformation(
+    nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format)
+{
+    auto runtime = std::unique_ptr<nvinfer1::IRuntime>{createRuntime()};
+    auto inspector = std::unique_ptr<nvinfer1::IEngineInspector>(engine->createEngineInspector());
+    if (context != nullptr)
+    {
+        inspector->setExecutionContext(context);
+    }
+    std::string result = inspector->getEngineInformation(format);
+    return result;
+}
+
+void Binding::fill(std::string const& fileName)
+{
+    loadFromFile(fileName, static_cast<char*>(buffer->getHostBuffer()), buffer->getSize());
+}
+
+void Binding::fill()
+{
+    switch (dataType)
+    {
+    case nvinfer1::DataType::kBOOL:
+    {
+        fillBuffer<bool>(buffer->getHostBuffer(), volume, 0, 1);
+        break;
+    }
+    case nvinfer1::DataType::kINT32:
+    {
+        fillBuffer<int32_t>(buffer->getHostBuffer(), volume, -128, 127);
+        break;
+    }
+    case nvinfer1::DataType::kINT64:
+    {
+        fillBuffer<int64_t>(buffer->getHostBuffer(), volume, -128, 127);
+        break;
+    }
+    case nvinfer1::DataType::kINT8:
+    {
+        fillBuffer<int8_t>(buffer->getHostBuffer(), volume, -128, 127);
+        break;
+    }
+    case nvinfer1::DataType::kFLOAT:
+    {
+        fillBuffer<float>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+        break;
+    }
+    case nvinfer1::DataType::kHALF:
+    {
+        fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+        break;
+    }
+    case nvinfer1::DataType::kBF16:
+    {
+        fillBuffer<BFloat16>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+        break;
+    }
+    case nvinfer1::DataType::kUINT8:
+    {
+        fillBuffer<uint8_t>(buffer->getHostBuffer(), volume, 0, 255);
+        break;
+    }
+    case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported");
+    case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported");
+    }
+}
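The `fillBuffer` template used above lives in sampleUtils.h and is not shown in this patch. The sketch below is only an illustration of how such a type-dispatched random fill can be written under that assumption, not the sample's actual implementation:

    // Minimal sketch of a typed random fill, assuming a host buffer of
    // `volume` elements of type T (names are illustrative only).
    #include <cstdint>
    #include <random>
    #include <type_traits>
    #include <vector>

    template <typename T>
    void fillRandom(void* buffer, int64_t volume, T min, T max)
    {
        auto* typed = static_cast<T*>(buffer);
        std::default_random_engine engine(42); // fixed seed for reproducible inputs
        if constexpr (std::is_integral_v<T>)
        {
            std::uniform_int_distribution<int64_t> dist(min, max);
            for (int64_t i = 0; i < volume; ++i) typed[i] = static_cast<T>(dist(engine));
        }
        else
        {
            std::uniform_real_distribution<float> dist(min, max);
            for (int64_t i = 0; i < volume; ++i) typed[i] = static_cast<T>(dist(engine));
        }
    }

    int main()
    {
        std::vector<float> host(16);
        fillRandom<float>(host.data(), host.size(), -1.0F, 1.0F);
        return 0;
    }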
+
+void Binding::dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, int32_t spv,
+    std::string const separator /*= " "*/) const
+{
+    void* outputBuffer{};
+    if (outputAllocator != nullptr)
+    {
+        outputBuffer = outputAllocator->getBuffer()->getHostBuffer();
+        // Overwrite dimensions with those reported by the output allocator.
+        dims = outputAllocator->getFinalDims();
+        os << "Final shape is " << dims << " reported by the output allocator." << std::endl;
+    }
+    else
+    {
+        outputBuffer = buffer->getHostBuffer();
+    }
+    switch (dataType)
+    {
+    case nvinfer1::DataType::kBOOL:
+    {
+        dumpBuffer<bool>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kINT32:
+    {
+        dumpBuffer<int32_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kINT8:
+    {
+        dumpBuffer<int8_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kFLOAT:
+    {
+        dumpBuffer<float>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kHALF:
+    {
+        dumpBuffer<__half>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kBF16:
+    {
+        dumpBuffer<BFloat16>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kUINT8:
+    {
+        dumpBuffer<uint8_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kINT64:
+    {
+        dumpBuffer<int64_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported");
+    case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported");
+    }
+}
+
+void Bindings::addBinding(TensorInfo const& tensorInfo, std::string const& fileName /*= ""*/)
+{
+    auto const b = tensorInfo.bindingIndex;
+    while (mBindings.size() <= static_cast<size_t>(b))
+    {
+        mBindings.emplace_back();
+        mDevicePointers.emplace_back();
+    }
+    mNames[tensorInfo.name] = b;
+    mBindings[b].isInput = tensorInfo.isInput;
+    mBindings[b].volume = tensorInfo.vol;
+    mBindings[b].dataType = tensorInfo.dataType;
+    if (tensorInfo.isDynamic)
+    {
+        ASSERT(!tensorInfo.isInput); // Only output shape can be possibly unknown because of DDS.
+        if (mBindings[b].outputAllocator == nullptr)
+        {
+            if (mUseManaged)
+            {
+                mBindings[b].outputAllocator.reset(new OutputAllocator(new UnifiedMirroredBuffer));
+            }
+            else
+            {
+                mBindings[b].outputAllocator.reset(new OutputAllocator(new DiscreteMirroredBuffer));
+            }
+        }
+    }
+    else
+    {
+        if (mBindings[b].buffer == nullptr)
+        {
+            if (mUseManaged)
+            {
+                mBindings[b].buffer.reset(new UnifiedMirroredBuffer);
+            }
+            else
+            {
+                mBindings[b].buffer.reset(new DiscreteMirroredBuffer);
+            }
+        }
+        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
+        // even for empty tensors, so allocate a dummy byte.
+        if (tensorInfo.vol == 0)
+        {
+            mBindings[b].buffer->allocate(1);
+        }
+        else
+        {
+            mBindings[b].buffer->allocate(
+                static_cast<size_t>(tensorInfo.vol) * static_cast<size_t>(dataTypeSize(tensorInfo.dataType)));
+        }
+        mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer();
+    }
+    if (tensorInfo.isInput)
+    {
+        if (fileName.empty())
+        {
+            fill(b);
+        }
+        else
+        {
+            fill(b, fileName);
+        }
+    }
+}
+
+void** Bindings::getDeviceBuffers()
+{
+    return mDevicePointers.data();
+}
+
+void Bindings::transferInputToDevice(TrtCudaStream& stream)
+{
+    for (auto& b : mNames)
+    {
+        if (mBindings[b.second].isInput)
+        {
+            mBindings[b.second].buffer->hostToDevice(stream);
+        }
+    }
+}
+
+void Bindings::transferOutputToHost(TrtCudaStream& stream)
+{
+    for (auto& b : mNames)
+    {
+        if (!mBindings[b.second].isInput)
+        {
+            if (mBindings[b.second].outputAllocator != nullptr)
+            {
+                mBindings[b.second].outputAllocator->getBuffer()->deviceToHost(stream);
+            }
+            else
+            {
+                mBindings[b.second].buffer->deviceToHost(stream);
+            }
+        }
+    }
+}
+
+void Bindings::dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os,
+    std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const
+{
+    auto const tensorName = context.getEngine().getIOTensorName(binding);
+    Dims dims = context.getTensorShape(tensorName);
+    Dims strides = context.getTensorStrides(tensorName);
+    int32_t vectorDim = context.getEngine().getTensorVectorizedDim(tensorName);
+    int32_t const spv = context.getEngine().getTensorComponentsPerElement(tensorName);
+
+    mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator);
+}
+
+namespace
+{
+
+std::string genFilenameSafeString(std::string const& s)
+{
+    std::string res = s;
+    static std::string const allowedSpecialChars{"._-,"};
+    for (auto& c : res)
+    {
+        if (!isalnum(c) && allowedSpecialChars.find(c) == std::string::npos)
+        {
+            c = '_';
+        }
+    }
+    return res;
+}
+
+Dims getBindingDimensions(nvinfer1::IExecutionContext const& context, std::string const& name)
+{
+    return context.getTensorShape(name.c_str());
+}
+} // namespace
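The `IMirroredBuffer` implementations (`DiscreteMirroredBuffer`, `UnifiedMirroredBuffer`) used by addBinding and the transfer helpers above are defined elsewhere in the patch (common/buffers.h and sampleUtils). As a rough illustration of the pattern only, with hypothetical names and error handling elided:

    // Sketch of the host/device mirrored-buffer pattern: one host copy, one
    // device copy, with async copies enqueued on the caller's stream.
    #include <cuda_runtime_api.h>
    #include <cstdlib>

    struct MirroredBufferSketch
    {
        void* host{nullptr};
        void* device{nullptr};
        size_t bytes{0};

        bool allocate(size_t n)
        {
            bytes = n;
            host = std::malloc(n); // real code would prefer pinned memory for true async copies
            return host != nullptr && cudaMalloc(&device, n) == cudaSuccess;
        }
        void hostToDevice(cudaStream_t stream)
        {
            cudaMemcpyAsync(device, host, bytes, cudaMemcpyHostToDevice, stream);
        }
        void deviceToHost(cudaStream_t stream)
        {
            cudaMemcpyAsync(host, device, bytes, cudaMemcpyDeviceToHost, stream);
        }
    };

    int main()
    {
        MirroredBufferSketch buf;
        if (!buf.allocate(4096)) return 1;
        buf.hostToDevice(nullptr); // default stream
        buf.deviceToHost(nullptr);
        cudaStreamSynchronize(nullptr); // wait for both copies
        cudaFree(buf.device);
        std::free(buf.host);
        return 0;
    }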
+
+void Bindings::dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const
+{
+    os << "Dumping I/O Bindings to RAW Files:" << std::endl;
+    for (auto const& n : mNames)
+    {
+        auto name = n.first;
+        auto bIndex = n.second;
+        auto const& binding = mBindings[bIndex];
+        void* outputBuffer{};
+        if (binding.outputAllocator != nullptr)
+        {
+            outputBuffer = binding.outputAllocator->getBuffer()->getHostBuffer();
+        }
+        else
+        {
+            outputBuffer = binding.buffer->getHostBuffer();
+        }
+
+        Dims dims = getBindingDimensions(context, name);
+        std::string dimsStr;
+        std::string dotStr;
+
+        for (int32_t i = 0; i < dims.nbDims; i++)
+        {
+            dimsStr += dotStr + std::to_string(dims.d[i]);
+            dotStr = ".";
+        }
+
+        std::string const bindingTypeStr = (binding.isInput ? "input" : "output");
+
+        std::stringstream fileName;
+        fileName << genFilenameSafeString(name) << "." << bindingTypeStr << "." << dimsStr << "." << binding.dataType
+                 << ".raw";
+
+        os << "Writing file for " << bindingTypeStr << " binding " << name << " (with datatype " << binding.dataType
+           << " and dimensions " << dimsStr << ") to " << fileName.str() << std::endl;
+
+        std::ofstream f(fileName.str(), std::ios::out | std::ios::binary);
+        ASSERT(f && "Cannot open file for write");
+        f.write(static_cast<char const*>(outputBuffer), binding.volume * samplesCommon::elementSize(binding.dataType));
+        f.close();
+    }
+}
+
+void Bindings::dumpBindingDimensions(
+    std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const
+{
+    auto const dims = context.getTensorShape(name.c_str());
+    // Do not add a newline terminator, because the caller may be outputting a JSON string.
+    os << dims;
+}
+
+std::unordered_map<std::string, int32_t> Bindings::getBindings(std::function<bool(Binding const&)> predicate) const
+{
+    std::unordered_map<std::string, int32_t> bindings;
+    for (auto const& n : mNames)
+    {
+        auto const binding = n.second;
+        if (predicate(mBindings[binding]))
+        {
+            bindings.insert(n);
+        }
+    }
+    return bindings;
+}
+
+bool Bindings::setTensorAddresses(nvinfer1::IExecutionContext& context) const
+{
+    for (auto const& b : mNames)
+    {
+        auto const name = b.first.c_str();
+        auto const location = context.getEngine().getTensorLocation(name);
+        if (location == TensorLocation::kDEVICE)
+        {
+            if (mBindings[b.second].outputAllocator != nullptr)
+            {
+                if (!context.setOutputAllocator(name, mBindings[b.second].outputAllocator.get()))
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                if (!context.setTensorAddress(name, mDevicePointers[b.second]))
+                {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+bool DebugTensorWriter::processDebugTensor(void const* addr, nvinfer1::TensorLocation location, nvinfer1::DataType type,
+    nvinfer1::Dims const& shape, char const* name, cudaStream_t stream)
+{
+    CHECK(cudaStreamSynchronize(stream));
+    // Store data from callback.
+    int64_t size = std::accumulate(shape.d, shape.d + shape.nbDims, 1LL, std::multiplies<int64_t>{})
+        * samplesCommon::elementSize(type);
+    std::vector<char> hostDataOut(size, 0);
+    CHECK(cudaMemcpy(hostDataOut.data(), addr, size, cudaMemcpyDeviceToHost));
+
+    auto it = mDebugTensorFileNames.find(name);
+    ASSERT(it != mDebugTensorFileNames.end());
+    std::string fileName = it->second;
+
+    std::ofstream f(fileName, std::ios::out | std::ios::binary);
+    ASSERT(f && "Cannot open file for write");
+    sample::gLogInfo << "Writing to file " << fileName << " for debug tensor " << name << std::endl;
+    f.write(hostDataOut.data(), size);
+    f.close();
+
+    CHECK(cudaStreamSynchronize(stream));
+    return true;
+}
+
+} // namespace sample
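Hedged usage sketch, not part of the patch: how a DebugTensorWriter like the one above would be attached to an execution context. This assumes a TensorRT 10 engine built with debug tensors marked; the method names (setDebugListener, setTensorDebugState) follow the TensorRT 10 debug-tensor API as I understand it, so verify them against NvInferRuntime.h before relying on this:

    // Assumed wiring for the debug-tensor callback (names to be verified).
    #include <string>
    #include <unordered_map>

    void attachDebugWriter(nvinfer1::IExecutionContext& context)
    {
        std::unordered_map<std::string, std::string> files{{"conv1_out", "conv1_out.raw"}};
        static sample::DebugTensorWriter writer(files); // must outlive the context
        context.setDebugListener(&writer);              // invoked once per marked tensor
        context.setTensorDebugState("conv1_out", true); // enable this debug tensor
    }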
diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.h b/src/Detector/tensorrt_yolo/common/sampleInference.h
index 1c21f592..d9ebed92 100644
--- a/src/Detector/tensorrt_yolo/common/sampleInference.h
+++ b/src/Detector/tensorrt_yolo/common/sampleInference.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,76 +18,243 @@
 #ifndef TRT_SAMPLE_INFERENCE_H
 #define TRT_SAMPLE_INFERENCE_H
 
+#include "sampleDevice.h"
+#include "sampleEngines.h"
 #include "sampleReporting.h"
 #include "sampleUtils.h"
 
+#include <functional>
 #include <iostream>
+#include <list>
 #include <memory>
 #include <string>
 #include <vector>
 
-#include "NvInfer.h"
+namespace sample
+{
 
-#if (NV_TENSORRT_MAJOR > 7)
+// IDebugListener class for writing debug tensors to output file.
+class DebugTensorWriter : public nvinfer1::IDebugListener
+{
+public:
+    DebugTensorWriter(std::unordered_map<std::string, std::string> fileNames)
+        : mDebugTensorFileNames(fileNames)
+    {
+    }
 
-#include "NvInferSafeRuntime.h"
+    bool processDebugTensor(void const* addr, nvinfer1::TensorLocation location, nvinfer1::DataType type,
+        nvinfer1::Dims const& shape, char const* name, cudaStream_t stream) override;
 
-namespace sample
-{
+private:
+    std::unordered_map<std::string, std::string> mDebugTensorFileNames;
+};
 
 struct InferenceEnvironment
 {
-    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    InferenceEnvironment() = delete;
+    InferenceEnvironment(InferenceEnvironment const& other) = delete;
+    InferenceEnvironment(InferenceEnvironment&& other) = delete;
+    InferenceEnvironment(BuildEnvironment& bEnv) : engine(std::move(bEnv.engine)), safe(bEnv.engine.isSafe())
+    {
+    }
+
+    LazilyDeserializedEngine engine;
     std::unique_ptr<Profiler> profiler;
-    std::vector<TrtUniquePtr<nvinfer1::IExecutionContext>> context;
+    std::vector<std::unique_ptr<nvinfer1::IExecutionContext>> contexts;
+    std::vector<TrtDeviceBuffer>
+        deviceMemory; //< Device memory used for inference when the allocation strategy is not static.
     std::vector<std::unique_ptr<Bindings>> bindings;
+    std::unique_ptr<DebugTensorWriter> listener;
     bool error{false};
 
-    std::vector<uint8_t> engineBlob;
-
     bool safe{false};
-    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
-    std::vector<std::unique_ptr<nvinfer1::safe::IExecutionContext>> safeContext;
 
-    template <class ContextType>
-    inline ContextType* getContext(int32_t streamIdx);
+    inline nvinfer1::IExecutionContext* getContext(int32_t streamIdx);
+
+    //! Storage for input shape tensors.
+    //!
+    //! It's important that the addresses of the data do not change between the calls to
+    //! setTensorAddress/setInputShape (which tells TensorRT where the input shape tensor is)
+    //! and enqueueV3 (when TensorRT might use the input shape tensor).
+    //!
+    //! The input shape tensors could alternatively be handled via member bindings,
+    //! but it simplifies control-flow to store the data here since it's shared across
+    //! the bindings.
+    std::list<std::vector<int64_t>> inputShapeTensorValues;
 };
 
-template <>
 inline nvinfer1::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
 {
-    return context[streamIdx].get();
-}
-
-template <>
-inline nvinfer1::safe::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
-{
-    return safeContext[streamIdx].get();
+    return contexts[streamIdx].get();
 }
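As the comment above notes, inputShapeTensorValues is deliberately a std::list rather than a std::vector: appending to a vector can reallocate and invalidate references to its existing elements, while list elements never move. A minimal standalone demonstration of that guarantee (illustration only):

    // Why std::list is safe to hand pointers out of while it keeps growing.
    #include <cstdint>
    #include <list>
    #include <vector>

    int main()
    {
        std::list<std::vector<int64_t>> shapes;
        shapes.push_back({1, 3, 224, 224});
        std::vector<int64_t> const& first = shapes.front(); // address handed to TensorRT
        for (int i = 0; i < 1000; ++i)
        {
            shapes.push_back({1}); // never moves or copies the existing elements
        }
        // Always 0 for std::list; a growing outer std::vector gives no such
        // guarantee for references to its elements.
        return &first == &shapes.front() ? 0 : 1;
    }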
 
 //!
 //! \brief Set up contexts and bindings for inference
 //!
-bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference);
+bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system);
 
 //!
 //! \brief Deserialize the engine and time how long it takes.
 //!
-bool timeDeserialize(InferenceEnvironment& iEnv);
+bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys);
 
 //!
 //! \brief Run inference and collect timing, return false if any error hit during inference
 //!
 bool runInference(
-    const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector<InferenceTrace>& trace);
+    InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector<InferenceTrace>& trace);
 
 //!
 //! \brief Get layer information of the engine.
 //!
-std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format);
+std::string getLayerInformation(
+    nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format);
 
-} // namespace sample
+struct Binding
+{
+    bool isInput{false};
+    std::unique_ptr<IMirroredBuffer> buffer;
+    std::unique_ptr<OutputAllocator> outputAllocator;
+    int64_t volume{0};
+    nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT};
+
+    void fill(std::string const& fileName);
+
+    void fill();
+
+    void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv,
+        std::string const separator = " ") const;
+};
+
+struct TensorInfo
+{
+    int32_t bindingIndex{-1};
+    char const* name{nullptr};
+    nvinfer1::Dims dims{};
+    bool isDynamic{};
+    int32_t comps{-1};
+    nvinfer1::Dims strides{};
+    int32_t vectorDimIndex{-1};
+    bool isInput{};
+    nvinfer1::DataType dataType{};
+    int64_t vol{-1};
+
+    void updateVolume(int32_t batch)
+    {
+        vol = volume(dims, strides, vectorDimIndex, comps, batch);
+    }
+};
+
+class Bindings
+{
+public:
+    Bindings() = delete;
+    explicit Bindings(bool useManaged)
+        : mUseManaged(useManaged)
+    {
+    }
+
+    void addBinding(TensorInfo const& tensorInfo, std::string const& fileName = "");
 
-#endif
+    void** getDeviceBuffers();
+
+    void transferInputToDevice(TrtCudaStream& stream);
+
+    void transferOutputToHost(TrtCudaStream& stream);
+
+    void fill(int binding, std::string const& fileName)
+    {
+        mBindings[binding].fill(fileName);
+    }
+
+    void fill(int binding)
+    {
+        mBindings[binding].fill();
+    }
+
+    void dumpBindingDimensions(
+        std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const;
+
+    void dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os,
+        std::string const& separator = " ", int32_t batch = 1) const;
+
+    void dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const;
+
+    void dumpInputs(nvinfer1::IExecutionContext const& context, std::ostream& os) const
+    {
+        auto isInput = [](Binding const& b) { return b.isInput; };
+        dumpBindings(context, isInput, os);
+    }
+
+    void dumpOutputs(nvinfer1::IExecutionContext const& context, std::ostream& os) const;
+
+    void dumpBindings(nvinfer1::IExecutionContext const& context, std::ostream& os) const
+    {
+        auto all = [](Binding const& b) { return true; };
+        dumpBindings(context, all, os);
+    }
+
+    void dumpBindings(nvinfer1::IExecutionContext const& context, std::function<bool(Binding const&)> predicate,
+        std::ostream& os) const
+    {
+        for (auto const& n : mNames)
+        {
+            auto const name = n.first;
+            auto const binding = n.second;
+            if (predicate(mBindings[binding]))
+            {
+                os << n.first << ": (";
+                dumpBindingDimensions(name, context, os);
+                os << ")" << std::endl;
+
+                dumpBindingValues(context, binding, os);
+                os << std::endl;
+            }
+        }
+    }
+
+    std::unordered_map<std::string, int32_t> getInputBindings() const
+    {
+        auto isInput = [](Binding const& b) { return b.isInput; };
+        return getBindings(isInput);
+    }
+
+    std::unordered_map<std::string, int32_t> getOutputBindings() const
+    {
+        auto isOutput = [](Binding const& b) { return !b.isInput; };
+        return getBindings(isOutput);
+    }
+    std::unordered_map<std::string, int32_t> getBindings() const
+    {
+        auto all = [](Binding const& b) { return true; };
+        return getBindings(all);
+    }
+
+    std::unordered_map<std::string, int32_t> getBindings(std::function<bool(Binding const&)> predicate) const;
+
+    bool setTensorAddresses(nvinfer1::IExecutionContext& context) const;
+
+private:
+    std::unordered_map<std::string, int32_t> mNames;
+    std::vector<Binding> mBindings;
+    std::vector<void*> mDevicePointers;
+    bool mUseManaged{false};
+};
+
+struct TaskInferenceEnvironment
+{
+    TaskInferenceEnvironment(std::string engineFile, InferenceOptions inference, int32_t deviceId = 0,
+        int32_t DLACore = -1, int32_t bs = batchNotProvided);
+    InferenceOptions iOptions{};
+    int32_t device{defaultDevice};
+    int32_t batch{batchNotProvided};
+    std::unique_ptr<InferenceEnvironment> iEnv;
+    std::vector<InferenceTrace> trace;
+};
+
+bool runMultiTasksInference(std::vector<std::unique_ptr<TaskInferenceEnvironment>>& tEnvList);
+
+} // namespace sample
 
 #endif // TRT_SAMPLE_INFERENCE_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleOptions.cpp b/src/Detector/tensorrt_yolo/common/sampleOptions.cpp
index 0afd163f..bdb1b21c 100644
--- a/src/Detector/tensorrt_yolo/common/sampleOptions.cpp
+++ b/src/Detector/tensorrt_yolo/common/sampleOptions.cpp
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
@@ -19,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -27,29 +29,64 @@
 
 #include "logger.h"
 #include "sampleOptions.h"
-
+#include "sampleUtils.h"
+using namespace nvinfer1;
 namespace sample
 {
 
 namespace
 {
 
-std::vector<std::string> splitToStringVec(const std::string& option, char separator)
+static std::map<char, std::pair<int64_t, std::string>> const kUNIT_MULTIPLIERS{
+    {'B', {1, "Bytes"}},
+    {'K', {1 << 10, "Kibibytes"}},
+    {'M', {1 << 20, "Mebibytes"}},
+    {'G', {1 << 30, "Gibibytes"}},
+};
+
+std::string addDefaultUnitSuffixIfNotSpecified(std::string const& option, char defaultUnit)
 {
-    std::vector<std::string> options;
+    char lastChar = option.at(option.size() - 1);
+    return std::isdigit(lastChar) ? option + defaultUnit : option;
+}
 
-    for (size_t start = 0; start < option.length();)
+// Returns "B (Bytes), K (Kibibytes), ..."
+std::string getAvailableUnitSuffixes()
+{
+    std::ostringstream ss;
+    for (auto it = kUNIT_MULTIPLIERS.begin(); it != kUNIT_MULTIPLIERS.end(); ++it)
     {
-        size_t separatorIndex = option.find(separator, start);
-        if (separatorIndex == std::string::npos)
+        if (it != kUNIT_MULTIPLIERS.begin())
         {
-            separatorIndex = option.length();
+            ss << ", ";
         }
-        options.emplace_back(option.substr(start, separatorIndex - start));
-        start = separatorIndex + 1;
+        ss << it->first << " (" << it->second.second << ")";
     }
-
-    return options;
+    return ss.str();
+}
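The unit-suffix convention implemented by kUNIT_MULTIPLIERS and the getUnitMultiplier helper just below is base-2, polygraphy-style: "20M" means 20 * 2^20 bytes, and a bare number means bytes. A standalone sketch of the same rule (illustration only, minimal error handling):

    // Base-2 unit-suffix parsing, distilled from the helpers in this file.
    #include <cctype>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    int64_t parseWithSuffix(std::string const& s)
    {
        char const last = static_cast<char>(std::toupper(static_cast<unsigned char>(s.back())));
        int64_t mult = 1;
        switch (last)
        {
        case 'K': mult = 1LL << 10; break;
        case 'M': mult = 1LL << 20; break;
        case 'G': mult = 1LL << 30; break;
        default: break; // 'B' or a trailing digit -> plain bytes
        }
        return std::stoll(s) * mult; // std::stoll stops parsing at the suffix
    }

    int main()
    {
        std::printf("%lld\n", static_cast<long long>(parseWithSuffix("20M"))); // 20971520
        return 0;
    }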
+// Numeric trtexec arguments can have unit specifiers in a similar way to polygraphy.
+// E.g. --weightStreamingBudget=20M would be 20 Mebibytes (base 2).
+int64_t getUnitMultiplier(std::string const& option)
+{
+    char lastChar = option.at(option.size() - 1);
+    if (!std::isdigit(lastChar))
+    {
+        char unit = std::toupper(lastChar);
+        auto found = kUNIT_MULTIPLIERS.find(unit);
+        if (found == kUNIT_MULTIPLIERS.end())
+        {
+            std::ostringstream ss;
+            ss << "Error parsing \"" << option << "\": invalid unit specifier '" << unit
+               << "'. Valid base-2 unit suffixes include: ";
+            ss << getAvailableUnitSuffixes() << ".";
+            throw std::invalid_argument(ss.str());
+        }
+        return found->second.first;
+    }
+
+    // Return bytes by default
+    return kUNIT_MULTIPLIERS.at('B').first;
+}
 
 template <typename T>
@@ -64,6 +101,12 @@ int32_t stringToValue<int32_t>(const std::string& option)
     return std::stoi(option);
 }
 
+template <>
+size_t stringToValue<size_t>(const std::string& option)
+{
+    return std::stoi(option) * getUnitMultiplier(option);
+}
+
 template <>
 float stringToValue<float>(const std::string& option)
 {
@@ -73,7 +116,7 @@ float stringToValue<float>(const std::string& option)
 template <>
 double stringToValue<double>(const std::string& option)
 {
-    return std::stod(option);
+    return std::stod(option) * getUnitMultiplier(option);
 }
 
 template <>
@@ -86,6 +129,10 @@ std::vector<int32_t> stringToValue<std::vector<int32_t>>(const std::string& option)
 {
     std::vector<int32_t> shape;
+    if (option == "scalar")
+    {
+        return shape;
+    }
     std::vector<std::string> dimsStrings = splitToStringVec(option, 'x');
     for (const auto& d : dimsStrings)
     {
@@ -98,8 +145,9 @@ template <>
 nvinfer1::DataType stringToValue<nvinfer1::DataType>(const std::string& option)
 {
     const std::unordered_map<std::string, nvinfer1::DataType> strToDT{{"fp32", nvinfer1::DataType::kFLOAT},
-        {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8},
-        {"int32", nvinfer1::DataType::kINT32}};
+        {"fp16", nvinfer1::DataType::kHALF}, {"bf16", nvinfer1::DataType::kBF16}, {"int8", nvinfer1::DataType::kINT8},
+        {"fp8", nvinfer1::DataType::kFP8}, {"int32", nvinfer1::DataType::kINT32}, {"int64", nvinfer1::DataType::kINT64},
+        {"bool", nvinfer1::DataType::kBOOL}, {"uint8", nvinfer1::DataType::kUINT8}, {"int4", nvinfer1::DataType::kINT4}};
     const auto& dt = strToDT.find(option);
     if (dt == strToDT.end())
     {
@@ -108,6 +156,21 @@
     return dt->second;
 }
 
+template <>
+nvinfer1::DeviceType stringToValue<nvinfer1::DeviceType>(std::string const& option)
+{
+    std::unordered_map<std::string, nvinfer1::DeviceType> const strToDevice = {
+        {"GPU", nvinfer1::DeviceType::kGPU},
+        {"DLA", nvinfer1::DeviceType::kDLA},
+    };
+    auto const& device = strToDevice.find(option);
+    if (device == strToDevice.end())
+    {
+        throw std::invalid_argument("Invalid Device Type " + option);
+    }
+    return device->second;
+}
+
 template <>
 nvinfer1::TensorFormats stringToValue<nvinfer1::TensorFormats>(const std::string& option)
 {
@@ -116,7 +179,8 @@
         {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4},
         {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16},
         {"chw32", nvinfer1::TensorFormat::kCHW32}, {"dhwc8", nvinfer1::TensorFormat::kDHWC8},
-        {"hwc", nvinfer1::TensorFormat::kHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR},
+        {"cdhw32", nvinfer1::TensorFormat::kCDHW32}, {"hwc", nvinfer1::TensorFormat::kHWC},
+        {"dhwc", nvinfer1::TensorFormat::kDHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR},
         {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}};
     nvinfer1::TensorFormats formats{};
     for (auto f : optionStrings)
@@ -149,11 +213,82 @@ IOFormat stringToValue<IOFormat>(const std::string& option)
     return ioFormat;
 }
+template <>
+SparsityFlag stringToValue<SparsityFlag>(std::string const& option)
+{
+    std::unordered_map<std::string, SparsityFlag> const table{
+        {"disable", SparsityFlag::kDISABLE}, {"enable", SparsityFlag::kENABLE}, {"force", SparsityFlag::kFORCE}};
+    auto search = table.find(option);
+    if (search == table.end())
+    {
+        throw std::invalid_argument(std::string("Unknown sparsity mode: ") + option);
+    }
+    if (search->second == SparsityFlag::kFORCE)
+    {
+        sample::gLogWarning << "--sparsity=force has been deprecated. "
+                            << "Please use <polygraphy surgeon prune> to rewrite the weights to a sparsity pattern "
+                            << "and then run with --sparsity=enable" << std::endl;
+    }
+
+    return search->second;
+}
+
+template <>
+WeightStreamingBudget stringToValue<WeightStreamingBudget>(std::string const& option)
+{
+    WeightStreamingBudget budget;
+    if (option.find('%') != std::string::npos)
+    {
+        double percent = std::stod(option);
+        if (!(percent >= 0 && percent <= 100.0))
+        {
+            std::ostringstream err;
+            err << "The weight streaming percent must be between 0 and 100.";
+            throw std::invalid_argument(err.str());
+        }
+        budget.percent = percent;
+    }
+    else
+    {
+        double bytes = stringToValue<double>(option);
+        if (!(bytes == WeightStreamingBudget::kAUTOMATIC || bytes == WeightStreamingBudget::kDISABLE || bytes >= 0))
+        {
+            std::ostringstream err;
+            err << "The weight streaming budget must be " << WeightStreamingBudget::kDISABLE << ", "
+                << WeightStreamingBudget::kAUTOMATIC << ", or at least 0.";
+            throw std::invalid_argument(err.str());
+        }
+        budget.bytes = static_cast<int64_t>(bytes);
+    }
+    return budget;
+}
+
 template <typename T>
 std::pair<std::string, T> splitNameAndValue(const std::string& s)
 {
     std::string tensorName;
     std::string valueString;
+
+    // Support 'inputName':Path format for --loadInputs flag when dealing with Windows paths.
+    // i.e. 'inputName':c:\inputData
+    std::vector<std::string> quoteNameRange{ splitToStringVec(s, '\'') };
+    // splitToStringVec returns the entire string when the delimiter is not found, so its size is always at least 1
+    if (quoteNameRange.size() != 1)
+    {
+        if (quoteNameRange.size() != 3)
+        {
+            std::string errorMsg = std::string("Found invalid number of \'s when parsing ") + s
+                + std::string(". Expected: 2, received: ") + std::to_string(quoteNameRange.size() - 1)
+                + ". Please ensure that a singular comma is used within each comma-separated key-value pair for "
+                  "options like --inputIOFormats, --optShapes, --optShapesCalib, --layerPrecisions, etc.";
+            throw std::invalid_argument(errorMsg);
+        }
+        // Everything before the second "'" is the name.
+        tensorName = quoteNameRange[0] + quoteNameRange[1];
+        // Path is the last string - ignoring leading ":" so slice it with [1:]
+        valueString = quoteNameRange[2].substr(1);
+        return std::pair<std::string, T>(tensorName, stringToValue<T>(valueString));
+    }
+
     // Split on the last :
     std::vector<std::string> nameRange{splitToStringVec(s, ':')};
     // Everything before the last : is the name
@@ -181,16 +316,71 @@ const char* boolToEnabled(bool enable)
     return enable ? "Enabled" : "Disabled";
 }
 
+//! A helper function similar to sep.join(list) in Python.
+template <typename T>
+std::string joinValuesToString(std::vector<T> const& list, std::string const& sep)
+{
+    std::ostringstream os;
+    for (int32_t i = 0, n = list.size(); i < n; ++i)
+    {
+        os << list[i];
+        if (i != n - 1)
+        {
+            os << sep;
+        }
+    }
+    return os.str();
+}
+
+template <typename T, size_t N>
+std::string joinValuesToString(std::array<T, N> const& list, std::string const& sep)
+{
+    return joinValuesToString(std::vector<T>(list.begin(), list.end()), sep);
+}
+
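splitNameAndValue's quoted-name branch above exists so that Windows paths containing drive colons survive options like --loadInputs. A distilled sketch of the splitting rule (illustration only; assumes well-formed input and omits the error paths):

    // 'name':value splits on the quotes; unquoted input splits on the last ':'.
    #include <cstdio>
    #include <string>
    #include <utility>

    std::pair<std::string, std::string> splitQuoted(std::string const& s)
    {
        auto const first = s.find('\'');
        auto const second = s.find('\'', first + 1);
        if (first == std::string::npos || second == std::string::npos)
        {
            auto const colon = s.rfind(':'); // unquoted form
            return {s.substr(0, colon), s.substr(colon + 1)};
        }
        // Skip the closing quote and the ':' that follows it.
        return {s.substr(first + 1, second - first - 1), s.substr(second + 2)};
    }

    int main()
    {
        auto const p = splitQuoted("'input':c:\\data\\input.bin");
        std::printf("%s -> %s\n", p.first.c_str(), p.second.c_str()); // input -> c:\data\input.bin
        return 0;
    }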
 //! Check if input option exists in input arguments.
-//! If it does: return its value, erase the argument and return true.
+//! If it does: set its value, and return true.
 //! If it does not: return false.
 template <typename T>
-bool getAndDelOption(Arguments& arguments, const std::string& option, T& value)
+bool getOption(Arguments& arguments, const std::string& option, T& value)
 {
-    const auto match = arguments.find(option);
+    auto const match = arguments.find(option);
     if (match != arguments.end())
     {
-        value = stringToValue<T>(match->second);
+        value = stringToValue<T>(match->second.first);
+        return true;
+    }
+
+    return false;
+}
+
+//! Check if input option exists in input arguments.
+//! If it does: set its value, erase the argument and return true.
+//! If it does not: return false.
+template <typename T_>
+bool getAndDelOption(Arguments& arguments, const std::string& option, T_& value)
+{
+    bool found = getOption(arguments, option, value);
+    if (found)
+    {
+        const auto match = arguments.find(option);
+        arguments.erase(match);
+    }
+
+    return found;
+}
+
+//! Check if input option exists in input arguments.
+//! If it does: set its value and position, erase the argument and return true.
+//! If it does not: return false.
+template <typename T_>
+bool getAndDelOptionWithPosition(Arguments& arguments, std::string const& option, T_& value, int32_t& pos)
+{
+    auto const match = arguments.find(option);
+    if (match != arguments.end())
+    {
+        value = stringToValue<T_>(match->second.first);
+        pos = match->second.second;
         arguments.erase(match);
         return true;
     }
@@ -198,8 +388,31 @@
     return false;
 }
 
+//! Check if input option exists in input arguments behind the position specified by pos.
+//! If it does: set its value, erase the argument and return true.
+//! If it does not: return false.
+template <typename T_>
+bool getAndDelOptionBehind(Arguments& arguments, std::string const& option, int32_t pos, T_& value)
+{
+    auto const match = arguments.equal_range(option);
+    if (match.first == match.second)
+    {
+        return false;
+    }
+    for (auto i = match.first; i != match.second; ++i)
+    {
+        if (i->second.second - pos == 1)
+        {
+            value = stringToValue<T_>(i->second.first);
+            arguments.erase(i);
+            return true;
+        }
+    }
+    return false;
+}
+
 //! Check if input option exists in input arguments.
-//! If it does: return false in value, erase the argument and return true.
+//! If it does: set false in value, erase the argument and return true.
 //! If it does not: return false.
bool getAndDelNegOption(Arguments& arguments, const std::string& option, bool& value) { @@ -224,34 +437,37 @@ bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, st return false; } - auto addToValues = [&values](Arguments::value_type& argValue) {values.emplace_back(stringToValue(argValue.second));}; + auto addToValues + = [&values](Arguments::value_type& argValue) { values.emplace_back(stringToValue(argValue.second.first)); }; std::for_each(match.first, match.second, addToValues); arguments.erase(match.first, match.second); return true; } -void insertShapesBuild(std::unordered_map& shapes, nvinfer1::OptProfileSelector selector, const std::string& name, const std::vector& dims) +void insertShapesBuild(BuildOptions::ShapeProfile& shapes, nvinfer1::OptProfileSelector selector, + const std::string& name, const std::vector& dims) { shapes[name][static_cast(selector)] = dims; } -void insertShapesInference(std::unordered_map>& shapes, const std::string& name, const std::vector& dims) +void insertShapesInference( + InferenceOptions::ShapeProfile& shapes, std::string const& name, std::vector const& dims) { shapes[name] = dims; } std::string removeSingleQuotationMarks(std::string& str) { - std::vector strList{splitToStringVec(str, '\'')}; - // Remove all the escaped single quotation marks - std::string retVal = ""; - // Do not really care about unterminated sequences - for (size_t i = 0; i < strList.size(); i++) - { - retVal += strList[i]; - } - return retVal; + std::vector strList{splitToStringVec(str, '\'')}; + // Remove all the escaped single quotation marks + std::string retVal; + // Do not really care about unterminated sequences + for (size_t i = 0; i < strList.size(); i++) + { + retVal += strList[i]; + } + return retVal; } void getLayerPrecisions(Arguments& arguments, char const* argument, LayerPrecisions& layerPrecisions) @@ -293,7 +509,41 @@ void getLayerOutputTypes(Arguments& arguments, char const* argument, LayerOutput } } -bool getShapesBuild(Arguments& arguments, std::unordered_map& shapes, char const* argument, +void getLayerDeviceTypes(Arguments& arguments, char const* argument, LayerDeviceTypes& layerDeviceTypes) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerDeviceTypes flag contains comma-separated layerName:deviceType pairs. + std::vector deviceList{splitToStringVec(list, ',')}; + for (auto const& s : deviceList) + { + auto nameDevicePair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(nameDevicePair.first); + layerDeviceTypes[layerName] = stringToValue(nameDevicePair.second); + } +} + +void getStringsSet(Arguments& arguments, char const* argument, StringSet& stringSet) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerPrecisions flag contains comma-separated layerName:precision pairs. 
+ std::vector strings{splitToStringVec(list, ',')}; + for (auto const& s : strings) + { + stringSet.insert(s); + } +} + +bool getShapesBuild(Arguments& arguments, BuildOptions::ShapeProfile& shapes, char const* argument, nvinfer1::OptProfileSelector selector) { std::string list; @@ -309,7 +559,7 @@ bool getShapesBuild(Arguments& arguments, std::unordered_map>& shapes, const char* argument) +bool getShapesInference(Arguments& arguments, InferenceOptions::ShapeProfile& shapes, const char* argument) { std::string list; bool retVal = getAndDelOption(arguments, argument, list); @@ -324,67 +574,195 @@ bool getShapesInference(Arguments& arguments, std::unordered_map& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +void fillShapes(BuildOptions::ShapeProfile& shapes, std::string const& name, ShapeRange const& sourceShapeRange, + nvinfer1::OptProfileSelector minDimsSource, nvinfer1::OptProfileSelector optDimsSource, + nvinfer1::OptProfileSelector maxDimsSource) { - // Only accept optShapes only or all three of minShapes, optShapes, maxShapes - if ( ((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMIN, name, sourceShapeRange[static_cast(minDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kOPT, name, sourceShapeRange[static_cast(optDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMAX, name, sourceShapeRange[static_cast(maxDimsSource)]); +} + +void processShapes(BuildOptions::ShapeProfile& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +{ + // Only accept optShapes only or all three of minShapes, optShapes, maxShapes when calib is set + if (((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes || (minShapes && !maxShapes && optShapes) // both minShapes and optShapes || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes { if (calib) { - throw std::invalid_argument("Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); - } - else - { - throw std::invalid_argument("Must specify only --optShapes or all of --minShapes, --optShapes, --maxShapes"); + throw std::invalid_argument( + "Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); } } - // If optShapes only, expand optShapes to minShapes and maxShapes - if (optShapes && !minShapes && !maxShapes) + if (!minShapes && !optShapes && !maxShapes) { - std::unordered_map newShapes; - for (auto& s : shapes) + return; + } + + BuildOptions::ShapeProfile newShapes; + for (auto& s : shapes) + { + nvinfer1::OptProfileSelector minDimsSource, optDimsSource, maxDimsSource; + minDimsSource = nvinfer1::OptProfileSelector::kMIN; + optDimsSource = nvinfer1::OptProfileSelector::kOPT; + maxDimsSource = nvinfer1::OptProfileSelector::kMAX; + + // Populate missing minShapes + if (!minShapes) + { + if (optShapes) + { + minDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + else + { + minDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing optShapes + if (!optShapes) { - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - 
insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + if (maxShapes) + { + optDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + else + { + optDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing maxShapes + if (!maxShapes) + { + if (optShapes) + { + maxDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to maxShapes for tensor " << s.first + << std::endl; + } + else + { + maxDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to maxShapes for tensor " << s.first + << std::endl; + } } - shapes = newShapes; + + fillShapes(newShapes, s.first, s.second, minDimsSource, optDimsSource, maxDimsSource); } + shapes = newShapes; } -template -void printShapes(std::ostream& os, const char* phase, const T& shapes) +bool getOptimizationProfiles( + Arguments& arguments, std::vector& optProfiles, char const* argument) { - if (shapes.empty()) + bool retValue{false}; + int32_t pos{}; + size_t profileIndex{}; + + auto getShapes + = [](BuildOptions::ShapeProfile& shapes, std::string const& list, nvinfer1::OptProfileSelector selector) { + std::vector shapeList{splitToStringVec(list, ',')}; + for (auto const& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesBuild(shapes, selector, tensorName, dims); + } + }; + + while (getAndDelOptionWithPosition(arguments, argument, profileIndex, pos)) { - os << "Input " << phase << " shapes: model" << std::endl; + BuildOptions::ShapeProfile optProfile{}; + bool minShapes{false}, maxShapes{false}, optShapes{false}; + for (int32_t i = 0; i < nvinfer1::EnumMax(); i++, pos++) + { + std::string value; + + if (!minShapes && getAndDelOptionBehind(arguments, "--minShapes", pos, value)) + { + minShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kMIN); + } + else if (!maxShapes && getAndDelOptionBehind(arguments, "--maxShapes", pos, value)) + { + maxShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kMAX); + } + else if (!optShapes && getAndDelOptionBehind(arguments, "--optShapes", pos, value)) + { + optShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kOPT); + } + else + { + break; + } + } + processShapes(optProfile, minShapes, optShapes, maxShapes, false); + if (profileIndex >= optProfiles.size()) + { + optProfiles.resize(profileIndex + 1); + } + if (!optProfiles[profileIndex].empty()) + { + throw std::invalid_argument("Optimization profile index cannot be the same."); + } + optProfiles[profileIndex] = optProfile; + retValue = true; } - else + + profileIndex = 0; + for (auto const& optProfile : optProfiles) { - for (const auto& s : shapes) + if (optProfile.empty()) { - os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl; + throw std::invalid_argument(std::string("Found invalid or missing shape spec at profile index ") + + std::to_string(profileIndex) + std::string(". 
")); } + ++profileIndex; } + return retValue; } -std::ostream& printBatch(std::ostream& os, int32_t maxBatch) +template +void printShapes(std::ostream& os, char const* phase, T const& shapes, int32_t profileIndex) { - if (maxBatch != maxBatchNotProvided) + if (shapes.empty()) { - os << maxBatch; + os << "Input " << phase << " shapes: model" << std::endl; } else { - os << "explicit batch"; + std::string profileString = (profileIndex != -1 && strcmp(phase, "build") == 0) + ? "(profile " + std::to_string(profileIndex) + ")" + : ""; + for (auto const& s : shapes) + { + os << "Input " << phase << " shape " << profileString << ": " << s.first << "=" << s.second << std::endl; + } } - return os; } -std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources) +std::ostream& printTacticSources( + std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources) { if (!enabledSources && !disabledSources) { @@ -405,24 +783,41 @@ std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabl addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS), "cublas"); addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT), "cublasLt"); -#if (NV_TENSORRT_MAJOR > 7) addSource(1U << static_cast(nvinfer1::TacticSource::kCUDNN), "cudnn"); -#endif + addSource(1U << static_cast(nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS), "edge mask convolutions"); + addSource(1U << static_cast(nvinfer1::TacticSource::kJIT_CONVOLUTIONS), "JIT convolutions"); } return os; } std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) { + if (options.stronglyTyped) + { + os << "Strongly Typed"; + return os; + } os << "FP32"; if (options.fp16) { os << "+FP16"; } + if (options.bf16) + { + os << "+BF16"; + } if (options.int8) { os << "+INT8"; } + if (options.fp8) + { + os << "+FP8"; + } + if (options.int4) + { + os << "+INT4"; + } if (options.precisionConstraints == PrecisionConstraints::kOBEY) { os << " (obey precision constraints)"; @@ -434,13 +829,27 @@ std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) return os; } -std::ostream& printTimingCache(std::ostream& os, BuildOptions const& options) +std::ostream& printTempfileControls(std::ostream& os, TempfileControlFlags const tempfileControls) +{ + auto getFlag = [&](TempfileControlFlag f) -> char const* { + bool allowed = !!(tempfileControls & (1U << static_cast(f))); + return allowed ? 
"allow" : "deny"; + }; + auto const inMemory = getFlag(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + auto const temporary = getFlag(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + + os << "{ in_memory: " << inMemory << ", temporary: " << temporary << " }"; + + return os; +} + +std::ostream& printTimingCache(std::ostream& os, TimingCacheMode const& timingCacheMode) { - switch (options.timingCacheMode) + switch (timingCacheMode) { - case TimingCacheMode::kGLOBAL: os << "global"; break; - case TimingCacheMode::kLOCAL: os << "local"; break; - case TimingCacheMode::kDISABLE: os << "disable"; break; + case TimingCacheMode::kGLOBAL: os << "global"; break; + case TimingCacheMode::kLOCAL: os << "local"; break; + case TimingCacheMode::kDISABLE: os << "disable"; break; } return os; } @@ -459,20 +868,67 @@ std::ostream& printSparsity(std::ostream& os, BuildOptions const& options) std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options) { - auto const printValueOrDefault = [&os](double const val) { + auto const printValueOrDefault = [&os](double const val, char const* unit = "MiB") { if (val >= 0) { - os << val << " MiB"; + os << val << " " << unit; } else { os << "default"; } }; - os << "workspace: "; printValueOrDefault(options.workspace); os << ", "; - os << "dlaSRAM: "; printValueOrDefault(options.dlaSRAM); os << ", "; - os << "dlaLocalDRAM: "; printValueOrDefault(options.dlaLocalDRAM); os << ", "; - os << "dlaGlobalDRAM: "; printValueOrDefault(options.dlaGlobalDRAM); + os << "workspace: "; + printValueOrDefault(options.workspace); + os << ", "; + os << "dlaSRAM: "; + printValueOrDefault(options.dlaSRAM); + os << ", "; + os << "dlaLocalDRAM: "; + printValueOrDefault(options.dlaLocalDRAM); + os << ", "; + os << "dlaGlobalDRAM: "; + printValueOrDefault(options.dlaGlobalDRAM); + os << ", "; + os << "tacticSharedMem: "; + printValueOrDefault(options.tacticSharedMem, "KiB"); + return os; +} + +std::string previewFeatureToString(PreviewFeature feature) +{ + // clang-format off + switch (feature) + { + case PreviewFeature::kPROFILE_SHARING_0806: + { + gLogWarning << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." << std::endl; + break; + } + case PreviewFeature::kALIASED_PLUGIN_IO_10_03: return "kALIASED_PLUGIN_IO_10_03"; + } + return "Invalid Preview Feature"; + // clang-format on +} + +std::ostream& printPreviewFlags(std::ostream& os, BuildOptions const& options) +{ + if (options.previewFeatures.empty()) + { + os << "Use default preview flags."; + return os; + } + + auto const addFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (options.previewFeatures.find(featVal) != options.previewFeatures.end()) + { + os << previewFeatureToString(feat) << (options.previewFeatures.at(featVal) ? 
" [ON], " : " [OFF], "); + } + }; + + addFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); + return os; } @@ -487,51 +943,41 @@ Arguments argsToArgumentsMap(int32_t argc, char* argv[]) if (valuePtr) { std::string value{valuePtr + 1}; - arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); + arguments.emplace(std::string(argv[i], valuePtr - argv[i]), std::make_pair(value, i)); } else { - arguments.emplace(argv[i], ""); + arguments.emplace(argv[i], std::make_pair(std::string(""), i)); } } return arguments; } -void BaseModelOptions::parse(Arguments& arguments) +namespace { - if (getAndDelOption(arguments, "--onnx", model)) - { - format = ModelFormat::kONNX; - } - else if (getAndDelOption(arguments, "--uff", model)) - { - format = ModelFormat::kUFF; - } - else if (getAndDelOption(arguments, "--model", model)) +std::string resolveHomeDirectoryOnLinux(std::string const& model) +{ + std::string filePath{model}; +#ifndef _WIN32 + if (filePath[0] == '~') { - format = ModelFormat::kCAFFE; + char const* home = std::getenv("HOME"); + if (home) + { + filePath.replace(0, 1, home); + } } +#endif + return filePath; } +} // namespace -void UffInput::parse(Arguments& arguments) +void BaseModelOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--uffNHWC", NHWC); - std::vector args; - if (getAndDelRepeatedOption(arguments, "--uffInput", args)) + if (getAndDelOption(arguments, "--onnx", model)) { - for (const auto& i : args) - { - std::vector values{splitToStringVec(i, ',')}; - if (values.size() == 4) - { - nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])}; - inputs.emplace_back(values[0], dims); - } - else - { - throw std::invalid_argument(std::string("Invalid uffInput ") + i); - } - } + format = ModelFormat::kONNX; + model = resolveHomeDirectoryOnLinux(model); } } @@ -541,56 +987,66 @@ void ModelOptions::parse(Arguments& arguments) switch (baseModel.format) { - case ModelFormat::kCAFFE: + case ModelFormat::kONNX: + case ModelFormat::kANY: { - getAndDelOption(arguments, "--deploy", prototxt); break; } - case ModelFormat::kUFF: - { - uffInputs.parse(arguments); - if (uffInputs.inputs.empty()) - { - throw std::invalid_argument("Uff models require at least one input"); - } - break; } - case ModelFormat::kONNX: - break; - case ModelFormat::kANY: + + if (baseModel.format == ModelFormat::kONNX) { - if (getAndDelOption(arguments, "--deploy", prototxt)) + if (!outputs.empty()) { - baseModel.format = ModelFormat::kCAFFE; + throw std::invalid_argument("The --output flag should not be used with ONNX models."); } - break; } +} + +void getTempfileControls(Arguments& arguments, char const* argument, TempfileControlFlags& tempfileControls) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; } - // The --output flag should only be used with Caffe and UFF. It has no effect on ONNX. 
- std::vector outArgs; - if (getAndDelRepeatedOption(arguments, "--output", outArgs)) + std::vector controlList{splitToStringVec(list, ',')}; + for (auto const& s : controlList) { - for (const auto& o : outArgs) + auto controlAllowPair = splitNameAndValue(s); + bool allowed{false}; + int32_t offset{-1}; + + if (controlAllowPair.second.compare("allow") == 0) { - for (auto& v : splitToStringVec(o, ',')) - { - outputs.emplace_back(std::move(v)); - } + allowed = true; } - } - if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF) - { - if (outputs.empty()) + else if (controlAllowPair.second.compare("deny") != 0) { - throw std::invalid_argument("Caffe and Uff models require at least one output"); + throw std::invalid_argument("--tempfileControls value should be `deny` or `allow`"); } - } - else if (baseModel.format == ModelFormat::kONNX) - { - if (!outputs.empty()) + + if (controlAllowPair.first.compare("in_memory") == 0) { - throw std::invalid_argument("The --output flag should not be used with ONNX models."); + offset = static_cast(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + } + else if (controlAllowPair.first.compare("temporary") == 0) + { + offset = static_cast(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + } + else + { + throw std::invalid_argument(std::string{"Unknown --tempfileControls key "} + controlAllowPair.first); + } + + if (allowed) + { + tempfileControls |= (1U << offset); + } + else + { + tempfileControls &= ~(1U << offset); } } } @@ -610,38 +1066,59 @@ void BuildOptions::parse(Arguments& arguments) getFormats(inputFormats, "--inputIOFormats"); getFormats(outputFormats, "--outputIOFormats"); - bool addedExplicitBatchFlag{false}; - getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag); - if (addedExplicitBatchFlag) - { - sample::gLogWarning << "--explicitBatch flag has been deprecated and has no effect!" << std::endl; - sample::gLogWarning << "Explicit batch dim is automatically enabled if input model is ONNX or if dynamic " - << "shapes are provided when the engine is built." 
<< std::endl; - } - - bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); - bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); - bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); - processShapes(shapes, minShapes, optShapes, maxShapes, false); - bool minShapesCalib - = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); - bool optShapesCalib - = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); - bool maxShapesCalib - = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); - processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); + bool getCalibProfile = getAndDelOption(arguments, "--calibProfile", calibProfile); + if (!getOptimizationProfiles(arguments, optProfiles, "--profile")) + { + ShapeProfile shapes; + bool minShapes{false}, optShapes{false}, maxShapes{false}; + try + { + minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string(" conversion failure: failed to parse minShapes/optShapes/maxShapes. Please double check " + "your input string.")); + } - bool addedExplicitPrecisionFlag{false}; - getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag); - if (addedExplicitPrecisionFlag) + processShapes(shapes, minShapes, optShapes, maxShapes, false); + optProfiles.emplace_back(shapes); + } + + if (calibProfile >= optProfiles.size()) + { + throw std::invalid_argument( + std::string("--calibProfile shouldn't greater than the size of optimization profile.")); + } + + BuildOptions::ShapeProfile dummyShapes; + + bool remainingMinShapes = getShapesBuild(arguments, dummyShapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + bool remainingOptShapes = getShapesBuild(arguments, dummyShapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + bool remainingMaxShapes = getShapesBuild(arguments, dummyShapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + if (remainingMinShapes || remainingOptShapes || remainingMaxShapes) { - sample::gLogWarning << "--explicitPrecision flag has been deprecated and has no effect!" << std::endl; + throw std::invalid_argument("Multiple --minShapes/--optShapes/--maxShapes without --profile are not allowed. "); } - if (getAndDelOption(arguments, "--workspace", workspace)) + bool minShapesCalib{false}, optShapesCalib{false}, maxShapesCalib{false}; + try { - sample::gLogWarning << "--workspace flag has been deprecated by --memPoolSize flag." << std::endl; + minShapesCalib = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); + optShapesCalib = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); + maxShapesCalib = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string(" conversion failure: failed to parse minShapesCalib/optShapesCalib/maxShapesCalib. 
Please " "double check your input string.")); + } + + processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); std::string memPoolSizes; getAndDelOption(arguments, "--memPoolSize", memPoolSizes); @@ -650,26 +1127,47 @@ void BuildOptions::parse(Arguments& arguments) { std::string memPoolName; double memPoolSize; - std::tie(memPoolName, memPoolSize) = splitNameAndValue(memPoolSpec); + try + { + std::string strPoolSize; + std::tie(memPoolName, strPoolSize) = splitNameAndValue(memPoolSpec); + memPoolSize = stringToValue<double>(addDefaultUnitSuffixIfNotSpecified(strPoolSize, 'M')); + } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string( + " conversion failure: failed to parse --memPoolSize. Please double check your input string.")); + } + if (memPoolSize < 0) { throw std::invalid_argument(std::string("Negative memory pool size: ") + std::to_string(memPoolSize)); } if (memPoolName == "workspace") { - workspace = memPoolSize; + // use unit in MB. + workspace = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaSRAM") { - dlaSRAM = memPoolSize; + // use unit in MB. + dlaSRAM = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaLocalDRAM") { - dlaLocalDRAM = memPoolSize; + // use unit in MB. + dlaLocalDRAM = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaGlobalDRAM") { - dlaGlobalDRAM = memPoolSize; + // use unit in MB. + dlaGlobalDRAM = memPoolSize / 1.0_MiB; + } + else if (memPoolName == "tacticSharedMem") + { + // use unit in KB. + tacticSharedMem = memPoolSize / 1.0_KiB; } else if (!memPoolName.empty()) { @@ -677,8 +1175,6 @@ void BuildOptions::parse(Arguments& arguments) } } - getAndDelOption(arguments, "--maxBatch", maxBatch); - getAndDelOption(arguments, "--minTiming", minTiming); getAndDelOption(arguments, "--avgTiming", avgTiming); bool best{false}; @@ -687,16 +1183,79 @@ void BuildOptions::parse(Arguments& arguments) { int8 = true; fp16 = true; + + // BF16 only supported on Ampere+ + if (samplesCommon::getSMVersion() >= 0x0800) + { + bf16 = true; + } } getAndDelOption(arguments, "--refit", refittable); + + getAndDelOption(arguments, "--weightless", stripWeights); + getAndDelOption(arguments, "--stripWeights", stripWeights); + + bool stripAllWeights{}; + getAndDelOption(arguments, "--stripAllWeights", stripAllWeights); + if (stripAllWeights) + { + refittable = true; + stripWeights = true; + } + + // --vc and --versionCompatible are synonyms + getAndDelOption(arguments, "--vc", versionCompatible); + if (!versionCompatible) + { + getAndDelOption(arguments, "--versionCompatible", versionCompatible); + } + +#if !TRT_WINML + // --pi and --pluginInstanceNorm are synonyms + getAndDelOption(arguments, "--pi", pluginInstanceNorm); + if (!pluginInstanceNorm) + { + getAndDelOption(arguments, "--pluginInstanceNorm", pluginInstanceNorm); + } +#endif + + getAndDelOption(arguments, "--excludeLeanRuntime", excludeLeanRuntime); + getAndDelOption(arguments, "--noCompilationCache", disableCompilationCache); getAndDelNegOption(arguments, "--noTF32", tf32); getAndDelOption(arguments, "--fp16", fp16); + getAndDelOption(arguments, "--bf16", bf16); getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--fp8", fp8); + getAndDelOption(arguments, "--int4", int4); + getAndDelOption(arguments, "--stronglyTyped", stronglyTyped); + if (stronglyTyped) + { + auto disableAndLog = [](bool& flag, std::string mode, std::string type) { + if (flag) + { + flag = false; + sample::gLogWarning << "Invalid usage, setting " << mode
+ << " mode is not allowed if graph is strongly typed. Disabling BuilderFlag::" + << type << "." << std::endl; + } + }; + disableAndLog(fp16, "fp16", "kFP16"); + disableAndLog(int8, "int8", "kINT8"); + disableAndLog(bf16, "bf16", "kBF16"); + disableAndLog(fp8, "fp8", "kFP8"); + disableAndLog(int4, "int4", "kINT4"); + } + + if (fp8 && int8) + { + throw std::invalid_argument("Invalid usage, fp8 and int8 aren't allowed to be enabled together."); + } getAndDelOption(arguments, "--safe", safe); - getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--buildDLAStandalone", buildDLAStandalone); + getAndDelOption(arguments, "--allowGPUFallback", allowGPUFallback); getAndDelOption(arguments, "--restricted", restricted); - + getAndDelOption(arguments, "--skipInference", skipInference); getAndDelOption(arguments, "--directIO", directIO); std::string precisionConstraintsString; @@ -720,10 +1279,11 @@ void BuildOptions::parse(Arguments& arguments) getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions); getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes); + getLayerDeviceTypes(arguments, "--layerDeviceTypes", layerDeviceTypes); if (layerPrecisions.empty() && layerOutputTypes.empty() && precisionConstraints != PrecisionConstraints::kNONE) { - sample::gLogWarning << "When --precisionConstraints flag is set to \"obey\" or \"prefer\", please add " + sample::gLogWarning << R"(When --precisionConstraints flag is set to "obey" or "prefer", please add )" << "--layerPrecision/--layerOutputTypes flags to set layer-wise precisions and output " << "types." << std::endl; } @@ -731,79 +1291,52 @@ void BuildOptions::parse(Arguments& arguments) && precisionConstraints == PrecisionConstraints::kNONE) { sample::gLogWarning << "--layerPrecision/--layerOutputTypes flags have no effect when --precisionConstraints " - << "flag is set to \"none\"." << std::endl; + << R"(flag is set to "none".)" << std::endl; } - std::string sparsityString; - getAndDelOption(arguments, "--sparsity", sparsityString); - if (sparsityString == "disable") - { - sparsity = SparsityFlag::kDISABLE; - } - else if (sparsityString == "enable") - { - sparsity = SparsityFlag::kENABLE; - } - else if (sparsityString == "force") - { - sparsity = SparsityFlag::kFORCE; - } - else if (!sparsityString.empty()) - { - throw std::invalid_argument(std::string("Unknown sparsity mode: ") + sparsityString); - } + getStringsSet(arguments, "--markDebug", debugTensors); + + getAndDelOption(arguments, "--sparsity", sparsity); bool calibCheck = getAndDelOption(arguments, "--calib", calibration); - if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty()) + if (int8 && calibCheck && !optProfiles[calibProfile].empty() && shapesCalib.empty()) { - shapesCalib = shapes; + shapesCalib = optProfiles[calibProfile]; } - - std::string profilingVerbosityString; - if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString)) + else if (!shapesCalib.empty() && getCalibProfile) { - sample::gLogWarning << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." << std::endl; + sample::gLogWarning + << "--calibProfile has no effect when --minShapesCalib/--optShapesCalib/--maxShapesCalib is set."
+ << std::endl; } + std::string profilingVerbosityString; + getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString); if (profilingVerbosityString == "layer_names_only") { -#if (NV_TENSORRT_MAJOR > 7) profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (profilingVerbosityString == "none") { profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE; } -#if (NV_TENSORRT_MAJOR > 7) else if (profilingVerbosityString == "detailed") { profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; } -#endif else if (profilingVerbosityString == "default") { -#if (NV_TENSORRT_MAJOR > 7) sample::gLogWarning << "--profilingVerbosity=default has been deprecated by " "--profilingVerbosity=layer_names_only." << std::endl; profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (profilingVerbosityString == "verbose") { -#if (NV_TENSORRT_MAJOR > 7) sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated by --profilingVerbosity=detailed." << std::endl; profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (!profilingVerbosityString.empty()) { @@ -814,6 +1347,8 @@ void BuildOptions::parse(Arguments& arguments) { load = true; } + getAndDelOption(arguments, "--getPlanVersionOnly", getPlanVersionOnly); + if (getAndDelOption(arguments, "--saveEngine", engine)) { save = true; @@ -858,12 +1393,18 @@ void BuildOptions::parse(Arguments& arguments) { source = nvinfer1::TacticSource::kCUBLAS_LT; } -#if (NV_TENSORRT_MAJOR > 7) else if (t == "CUDNN") { source = nvinfer1::TacticSource::kCUDNN; } -#endif + else if (t == "EDGE_MASK_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS; + } + else if (t == "JIT_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kJIT_CONVOLUTIONS; + } else { throw std::invalid_argument(std::string("Unknown tactic source: ") + t); @@ -887,38 +1428,179 @@ void BuildOptions::parse(Arguments& arguments) } } - bool noBuilderCache{false}; - getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); - getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); - if (noBuilderCache) + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--errorOnTimingCacheMiss", errorOnTimingCacheMiss); + getAndDelOption(arguments, "--builderOptimizationLevel", builderOptimizationLevel); + getAndDelOption(arguments, "--maxTactics", maxTactics); + + std::string runtimePlatformArgs; + getAndDelOption(arguments, "--runtimePlatform", runtimePlatformArgs); + if (runtimePlatformArgs == "SameAsBuild" || runtimePlatformArgs.empty()) + { + runtimePlatform = RuntimePlatform::kSAME_AS_BUILD; + } + else if (runtimePlatformArgs == "WindowsAMD64") + { + runtimePlatform = RuntimePlatform::kWINDOWS_AMD64; + } + else + { + throw std::invalid_argument(std::string("Unknown runtime platform: ") + runtimePlatformArgs + + ". 
Valid options: SameAsBuild, WindowsAMD64."); + } + + std::string hardwareCompatibleArgs; + getAndDelOption(arguments, "--hardwareCompatibilityLevel", hardwareCompatibleArgs); + if (hardwareCompatibleArgs == "none" || hardwareCompatibleArgs.empty()) + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kNONE; + } + else if (samplesCommon::toLower(hardwareCompatibleArgs) == "ampere+") + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kAMPERE_PLUS; + } + else + { + throw std::invalid_argument(std::string("Unknown hardwareCompatibilityLevel: ") + hardwareCompatibleArgs + + ". Valid options: none, ampere+."); + } + + if (pluginInstanceNorm && (versionCompatible || hardwareCompatibilityLevel == HardwareCompatibilityLevel::kAMPERE_PLUS)) + { + throw std::invalid_argument("Plugin InstanceNorm cannot be used with version compatible or hardware compatible engines!"); + } + + getAndDelOption(arguments, "--maxAuxStreams", maxAuxStreams); + + std::string previewFeaturesBuf; + getAndDelOption(arguments, "--preview", previewFeaturesBuf); + std::vector<std::string> previewFeaturesVec{splitToStringVec(previewFeaturesBuf, ',')}; + for (auto featureName : previewFeaturesVec) + { + bool enable{false}; + if (featureName.front() == '+') + { + enable = true; + } + else if (featureName.front() != '-') + { + throw std::invalid_argument( + "Preview features must be prefixed with + or -, indicating whether it should be enabled or disabled " + "respectively."); + } + featureName.erase(0, 1); + + PreviewFeature feat{}; + if (featureName == "profileSharing0806") + { + sample::gLogWarning + << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." + << std::endl; + } + else if (featureName == "aliasedPluginIO1003") + { + feat = PreviewFeature::kALIASED_PLUGIN_IO_10_03; + } + else + { + throw std::invalid_argument(std::string("Unknown preview feature: ") + featureName); + } + previewFeatures[static_cast<int32_t>(feat)] = enable; + } + + getAndDelOption(arguments, "--tempdir", tempdir); + getTempfileControls(arguments, "--tempfileControls", tempfileControls); + + std::string runtimeMode; + getAndDelOption(arguments, "--useRuntime", runtimeMode); + if (runtimeMode == "full") { - timingCacheMode = TimingCacheMode::kDISABLE; + useRuntime = RuntimeMode::kFULL; } - else if (!timingCacheFile.empty()) + else if (runtimeMode == "dispatch") { - timingCacheMode = TimingCacheMode::kGLOBAL; + useRuntime = RuntimeMode::kDISPATCH; } - else + else if (runtimeMode == "lean") { - timingCacheMode = TimingCacheMode::kLOCAL; + useRuntime = RuntimeMode::kLEAN; + } + else if (!runtimeMode.empty()) + { + throw std::invalid_argument(std::string("Unknown useRuntime: ") + runtimeMode); } + + if ((useRuntime == RuntimeMode::kDISPATCH || useRuntime == RuntimeMode::kLEAN) && !versionCompatible) + { + versionCompatible = true; + sample::gLogWarning << "Implicitly enabling --versionCompatible since --useRuntime=" << runtimeMode + << " is set."
<< std::endl; + } + + if (useRuntime != RuntimeMode::kFULL && !load) + { + throw std::invalid_argument(std::string("Building a TensorRT engine requires --useRuntime=full.")); + } + + getAndDelOption(arguments, "--leanDLLPath", leanDLLPath); + + // Don't delete the option because the inference option parser requires it + getOption(arguments, "--allowWeightStreaming", allowWeightStreaming); } void SystemOptions::parse(Arguments& arguments) { getAndDelOption(arguments, "--device", device); getAndDelOption(arguments, "--useDLACore", DLACore); - getAndDelOption(arguments, "--allowGPUFallback", fallback); +#if !TRT_WINML std::string pluginName; while (getAndDelOption(arguments, "--plugins", pluginName)) { + sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead." << std::endl; plugins.emplace_back(pluginName); } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--setPluginsToSerialize", pluginName)) + { + setPluginsToSerialize.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--dynamicPlugins", pluginName)) + { + dynamicPlugins.emplace_back(pluginName); + } + getAndDelOption(arguments, "--ignoreParsedPluginLibs", ignoreParsedPluginLibs); +#endif } +constexpr int64_t WeightStreamingBudget::kDISABLE; +constexpr int64_t WeightStreamingBudget::kAUTOMATIC; + void InferenceOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--streams", streams); + + if (getAndDelOption(arguments, "--streams", infStreams)) + { + sample::gLogWarning << "--streams flag has been deprecated, use --infStreams flag instead." << std::endl; + } + getAndDelOption(arguments, "--infStreams", infStreams); + getAndDelOption(arguments, "--iterations", iterations); getAndDelOption(arguments, "--duration", duration); getAndDelOption(arguments, "--warmUp", warmup); @@ -935,9 +1617,9 @@ void InferenceOptions::parse(Arguments& arguments) getAndDelOption(arguments, "--threads", threads); getAndDelOption(arguments, "--useCudaGraph", graph); getAndDelOption(arguments, "--separateProfileRun", rerun); - getAndDelOption(arguments, "--buildOnly", skip); getAndDelOption(arguments, "--timeDeserialize", timeDeserialize); getAndDelOption(arguments, "--timeRefit", timeRefit); + getAndDelOption(arguments, "--persistentCacheRatio", persistentCacheRatio); std::string list; getAndDelOption(arguments, "--loadInputs", list); @@ -945,25 +1627,81 @@ void InferenceOptions::parse(Arguments& arguments) splitInsertKeyValue(inputsList, inputs); getShapesInference(arguments, shapes, "--shapes"); - getAndDelOption(arguments, "--batch", batch); + setOptProfile = getAndDelOption(arguments, "--useProfile", optProfileIndex); + + std::string allocationStrategyString; + getAndDelOption(arguments, "--allocationStrategy", allocationStrategyString); + if (allocationStrategyString == "static") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kSTATIC; + } + else if (allocationStrategyString == "profile") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kPROFILE; + } + else if (allocationStrategyString == "runtime") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kRUNTIME; + } + else if (!allocationStrategyString.empty()) + { + throw std::invalid_argument(std::string("Unknown allocationStrategy: ") + allocationStrategyString); + } + + bool allowWs{false}; + getAndDelOption(arguments, "--allowWeightStreaming", allowWs); + bool wsBudgetFound = getAndDelOption(arguments, 
"--weightStreamingBudget", weightStreamingBudget); + if (wsBudgetFound && !allowWs) + { + throw std::invalid_argument( + "The weight streaming budget can only be set with --allowWeightStreaming specified."); + } + if (allowWs && weightStreamingBudget.isDisabled()) + { + sample::gLogWarning << "The engine can stream its weights but it will not at runtime because " + "--weightStreamingBudget unset or set to " + << WeightStreamingBudget::kDISABLE << "." << std::endl; + } + + std::string debugTensorList; + getAndDelOption(arguments, "--saveDebugTensors", debugTensorList); + std::vector fileNames{splitToStringVec(debugTensorList, ',')}; + splitInsertKeyValue(fileNames, debugTensorFileNames); } void ReportingOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--percentile", percentile); getAndDelOption(arguments, "--avgRuns", avgs); getAndDelOption(arguments, "--verbose", verbose); getAndDelOption(arguments, "--dumpRefit", refit); getAndDelOption(arguments, "--dumpOutput", output); + getAndDelOption(arguments, "--dumpRawBindingsToFile", dumpRawBindings); getAndDelOption(arguments, "--dumpProfile", profile); getAndDelOption(arguments, "--dumpLayerInfo", layerInfo); + getAndDelOption(arguments, "--dumpOptimizationProfile", optProfileInfo); getAndDelOption(arguments, "--exportTimes", exportTimes); getAndDelOption(arguments, "--exportOutput", exportOutput); getAndDelOption(arguments, "--exportProfile", exportProfile); getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo); - if (percentile < 0 || percentile > 100) + + std::string percentileString; + getAndDelOption(arguments, "--percentile", percentileString); + std::vector percentileStrings = splitToStringVec(percentileString, ','); + if (!percentileStrings.empty()) + { + percentiles.clear(); + } + for (const auto& p : percentileStrings) { - throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + percentiles.push_back(stringToValue(p)); + } + + for (auto percentile : percentiles) + { + if (percentile < 0.F || percentile > 100.F) + { + throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + } } } @@ -983,61 +1721,40 @@ void AllOptions::parse(Arguments& arguments) system.parse(arguments); inference.parse(arguments); - // Use explicitBatch when input model is ONNX or when dynamic shapes are used. - const bool isOnnx{model.baseModel.format == ModelFormat::kONNX}; - const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()}; - const bool detectedExplicitBatch = isOnnx || hasDynamicShapes; - - // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim. - const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided}; - const bool batchWasSet{inference.batch != batchNotProvided}; - if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet)) + if (build.useRuntime != RuntimeMode::kFULL && inference.timeRefit) { - throw std::invalid_argument( - "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes " - "are provided. Please use --optShapes and --shapes to set input shapes instead."); + throw std::invalid_argument("--timeRefit requires --useRuntime=full."); } - // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values. 
- if (!detectedExplicitBatch) + if (inference.optProfileIndex < static_cast<int32_t>(build.optProfiles.size())) { - // If batch is not set, set it to default value. - if (!batchWasSet) - { - inference.batch = defaultBatch; - } - // If maxBatch is not set, set it to be equal to batch. - if (!maxBatchWasSet) + // Propagate shape profile between builder and inference + for (auto const& s : build.optProfiles[inference.optProfileIndex]) { - build.maxBatch = inference.batch; + if (inference.shapes.find(s.first) == inference.shapes.end()) + { + insertShapesInference( + inference.shapes, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]); + } } - // MaxBatch should not be less than batch. - if (build.maxBatch < inference.batch) + for (auto const& s : inference.shapes) { - throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch) - + " is less than inference batch " + std::to_string(inference.batch)); + if (build.optProfiles[inference.optProfileIndex].find(s.first) + == build.optProfiles[inference.optProfileIndex].end()) + { + // assume min/opt/max all the same + insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kMIN, + s.first, s.second); + insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kOPT, + s.first, s.second); + insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kMAX, + s.first, s.second); + } } } - if (build.shapes.empty() && !inference.shapes.empty()) - { - // If --shapes are provided but --optShapes are not, assume that optShapes is the same as shapes. - for (auto& s : inference.shapes) - { - insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second); - insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second); - insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second); - } - } - else if (!build.shapes.empty() && inference.shapes.empty()) - { - // If --optShapes are provided but --shapes are not, assume that shapes is the same as optShapes. - for (auto& s : build.shapes) - { - insertShapesInference( - inference.shapes, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]); - } - } + // Set nvtxVerbosity to be the same as build-time profilingVerbosity. + inference.nvtxVerbosity = build.profilingVerbosity; reporting.parse(arguments); helps = parseHelp(arguments); @@ -1050,31 +1767,56 @@ void AllOptions::parse(Arguments& arguments) } if (build.safe && system.DLACore >= 0) { - auto checkSafeDLAFormats = [](std::vector<IOFormat> const& fmt) { - return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [](IOFormat const& pair) { + build.buildDLAStandalone = true; + } + if (build.runtimePlatform != nvinfer1::RuntimePlatform::kSAME_AS_BUILD) + { + build.skipInference = true; + } + if (build.buildDLAStandalone) + { + build.skipInference = true; + auto checkSafeDLAFormats = [](std::vector<IOFormat> const& fmt, bool isInput) { + return fmt.empty() ?
false : std::all_of(fmt.begin(), fmt.end(), [&](IOFormat const& pair) { bool supported{false}; - bool const isLINEAR{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kLINEAR)}; - bool const isCHW4{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW4)}; + bool const isDLA_LINEAR{ + pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kDLA_LINEAR)}; + bool const isHWC4{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW4) + || pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kDLA_HWC4)}; bool const isCHW32{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW32)}; bool const isCHW16{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW16)}; - supported |= pair.first == nvinfer1::DataType::kINT8 && (isLINEAR || isCHW4 || isCHW32); - supported |= pair.first == nvinfer1::DataType::kHALF && (isLINEAR || isCHW4 || isCHW16); + supported |= pair.first == nvinfer1::DataType::kINT8 + && (isDLA_LINEAR || (isInput ? isHWC4 : false) || isCHW32); + supported |= pair.first == nvinfer1::DataType::kHALF + && (isDLA_LINEAR || (isInput ? isHWC4 : false) || isCHW16); return supported; }); }; - if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.outputFormats)) + if (!checkSafeDLAFormats(build.inputFormats, true) || !checkSafeDLAFormats(build.outputFormats, false)) { throw std::invalid_argument( - "I/O formats for safe DLA capability are restricted to fp16/int8:linear, fp16:chw16 or int8:chw32"); + "I/O formats for safe DLA capability are restricted to fp16/int8:dla_linear, fp16/int8:hwc4, " + "fp16:chw16 or " + "int8:chw32"); } - if (system.fallback) + if (build.allowGPUFallback) { - throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability"); + throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for DLA standalone mode"); } } } } +void TaskInferenceOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "engine", engine); + getAndDelOption(arguments, "device", device); + getAndDelOption(arguments, "batch", batch); + getAndDelOption(arguments, "DLACore", DLACore); + getAndDelOption(arguments, "graph", graph); + getAndDelOption(arguments, "persistentCacheRatio", persistentCacheRatio); +} + void SafeBuilderOptions::parse(Arguments& arguments) { auto getFormats = [&arguments](std::vector<IOFormat>& formatsVector, const char* argument) { @@ -1097,13 +1839,36 @@ void SafeBuilderOptions::parse(Arguments& arguments) getFormats(outputFormats, "--outputIOFormats"); getAndDelOption(arguments, "--int8", int8); getAndDelOption(arguments, "--calib", calibFile); - getAndDelOption(arguments, "--consistency", consistency); getAndDelOption(arguments, "--std", standard); +#if !TRT_WINML std::string pluginName; while (getAndDelOption(arguments, "--plugins", pluginName)) { + sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead."
<< std::endl; plugins.emplace_back(pluginName); } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +#endif + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + getAndDelOption(arguments, "--avgTiming", avgTiming); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--sparsity", sparsity); } std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) @@ -1113,59 +1878,25 @@ std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) os << "Format: "; switch (options.format) { - case ModelFormat::kCAFFE: - { - os << "Caffe"; - break; - } case ModelFormat::kONNX: { os << "ONNX"; break; } - case ModelFormat::kUFF: - { - os << "UFF"; - break; - } - case ModelFormat::kANY: - os << "*"; - break; + case ModelFormat::kANY: os << "*"; break; } os << std::endl << "Model: " << options.model << std::endl; return os; } -std::ostream& operator<<(std::ostream& os, const UffInput& input) -{ - os << "Uff Inputs Layout: " << (input.NHWC ? "NHWC" : "NCHW") << std::endl; - for (const auto& i : input.inputs) - { - os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; - } - - return os; -} - std::ostream& operator<<(std::ostream& os, const ModelOptions& options) { os << options.baseModel; switch (options.baseModel.format) { - case ModelFormat::kCAFFE: - { - os << "Prototxt: " << options.prototxt << std::endl; - break; - } - case ModelFormat::kUFF: - { - os << options.uffInputs; - break; - } case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case - case ModelFormat::kANY: - break; + case ModelFormat::kANY: break; } os << "Output:"; @@ -1192,6 +1923,11 @@ std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) os << "fp16"; break; } + case nvinfer1::DataType::kBF16: + { + os << "bf16"; + break; + } case nvinfer1::DataType::kINT8: { os << "int8"; @@ -1207,6 +1943,26 @@ std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) os << "bool"; break; } + case nvinfer1::DataType::kUINT8: + { + os << "uint8"; + break; + } + case nvinfer1::DataType::kFP8: + { + os << "fp8"; + break; + } + case nvinfer1::DataType::kINT64: + { + os << "int64"; + break; + } + case nvinfer1::DataType::kINT4: + { + os << "int4"; + break; + } } return os; } @@ -1240,13 +1996,11 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) os << "hwc8"; break; } -#if (NV_TENSORRT_MAJOR > 7) case nvinfer1::TensorFormat::kHWC16: { os << "hwc16"; break; } -#endif case nvinfer1::TensorFormat::kCHW4: { os << "chw4"; @@ -1277,6 +2031,11 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) os << "hwc"; break; } + case nvinfer1::TensorFormat::kDHWC: + { + os << "dhwc"; + break; + } case nvinfer1::TensorFormat::kDLA_LINEAR: { os << "dla_linear"; @@ -1293,6 +2052,42 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) return os; } +std::ostream& operator<<(std::ostream& os, nvinfer1::DeviceType devType) +{ + switch (devType) + { + case nvinfer1::DeviceType::kGPU: + { + os << "GPU"; + break; + } + case nvinfer1::DeviceType::kDLA: + { + os << "DLA"; + break; + } + } + return os; +} + 
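The stream inserters added in this region (`nvinfer1::DeviceType` above and `nvinfer1::RuntimePlatform` just below) all use the same exhaustive-switch idiom, and returning the stream is what lets them chain inside the larger `operator<<` overloads for the option structs. A self-contained sketch of the idiom with a hypothetical enum, not a type from this patch:

```cpp
#include <iostream>

// Hypothetical enum; stands in for nvinfer1::DeviceType and similar types.
enum class Backend
{
    kGPU,
    kDLA,
};

// Exhaustive switch over the enumerators; no default case, so a compiler
// warning flags any enumerator added later but not handled here.
std::ostream& operator<<(std::ostream& os, Backend b)
{
    switch (b)
    {
    case Backend::kGPU: os << "GPU"; break;
    case Backend::kDLA: os << "DLA"; break;
    }
    return os; // returning the stream enables chaining: os << a << b
}

int main()
{
    std::cout << "Device type: " << Backend::kDLA << std::endl; // prints "Device type: DLA"
}
```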
+std::ostream& operator<<(std::ostream& os, nvinfer1::RuntimePlatform platform) +{ + switch (platform) + { + case nvinfer1::RuntimePlatform::kSAME_AS_BUILD: + { + os << "Same As Build"; + break; + } + case nvinfer1::RuntimePlatform::kWINDOWS_AMD64: + { + os << "Windows AMD64"; + break; + } + } + return os; +} + std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) { int32_t i = 0; @@ -1319,29 +2114,76 @@ std::ostream& operator<<(std::ostream& os, LayerPrecisions const& layerPrecision return os; } +std::ostream& operator<<(std::ostream& os, LayerDeviceTypes const& layerDeviceTypes) +{ + int32_t i = 0; + for (auto const& layerDevicePair : layerDeviceTypes) + { + os << (i++ ? ", " : "") << layerDevicePair.first << ":" << layerDevicePair.second; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, StringSet const& stringSet) +{ + int64_t i = 0; + for (auto const& s : stringSet) + { + os << (i ? "," : "") << s; + ++i; + } + return os; +} + std::ostream& operator<<(std::ostream& os, const BuildOptions& options) { + // if loadEngine is specified, BuildOptions are N/A + if (options.load) + { + os << std::endl; + return os; + } // clang-format off os << "=== Build Options ===" << std::endl << - - "Max batch: "; printBatch(os, options.maxBatch) << std::endl << "Memory Pools: "; printMemoryPools(os, options) << std::endl << - "minTiming: " << options.minTiming << std::endl << "avgTiming: " << options.avgTiming << std::endl << "Precision: "; printPrecision(os, options) << std::endl << "LayerPrecisions: " << options.layerPrecisions << std::endl << + "Layer Device Types: " << options.layerDeviceTypes << std::endl << "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << "Refit: " << boolToEnabled(options.refittable) << std::endl << + "Strip weights: " << boolToEnabled(options.stripWeights) << std::endl << + "Version Compatible: " << boolToEnabled(options.versionCompatible) << std::endl << +#if !TRT_WINML + "ONNX Plugin InstanceNorm: " << boolToEnabled(options.pluginInstanceNorm) << std::endl << +#endif + "TensorRT runtime: " << options.useRuntime << std::endl << + "Lean DLL Path: " << options.leanDLLPath << std::endl << + "Tempfile Controls: "; printTempfileControls(os, options.tempfileControls) << std::endl << + "Exclude Lean Runtime: " << boolToEnabled(options.excludeLeanRuntime) << std::endl << "Sparsity: "; printSparsity(os, options) << std::endl << "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "Build DLA standalone loadable: " << boolToEnabled(options.buildDLAStandalone) << std::endl << + "Allow GPU fallback for DLA: " << boolToEnabled(options.allowGPUFallback) << std::endl << "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << + "Skip inference: " << boolToEnabled(options.skipInference) << std::endl << "Save engine: " << (options.save ? options.engine : "") << std::endl << "Load engine: " << (options.load ? 
options.engine : "") << std::endl << "Profiling verbosity: " << static_cast<int32_t>(options.profilingVerbosity) << std::endl << "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << - "timingCacheMode: "; printTimingCache(os, options) << std::endl << - "timingCacheFile: " << options.timingCacheFile << std::endl; + "timingCacheMode: "; printTimingCache(os, options.timingCacheMode) << std::endl << + "timingCacheFile: " << options.timingCacheFile << std::endl << + "Enable Compilation Cache: " << boolToEnabled(!options.disableCompilationCache) << std::endl << + "errorOnTimingCacheMiss: " << boolToEnabled(options.errorOnTimingCacheMiss) << std::endl << + "Preview Features: "; printPreviewFlags(os, options) << std::endl << + "MaxAuxStreams: " << options.maxAuxStreams << std::endl << + "BuilderOptimizationLevel: " << options.builderOptimizationLevel << std::endl << + "MaxTactics: " << options.maxTactics << std::endl << + "Calibration Profile Index: " << options.calibProfile << std::endl << + "Weight Streaming: " << boolToEnabled(options.allowWeightStreaming) << std::endl << + "Runtime Platform: " << options.runtimePlatform << std::endl << + "Debug Tensors: " << options.debugTensors << std::endl; // clang-format on auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector<IOFormat> formats) { @@ -1351,7 +2193,7 @@ std::ostream& operator<<(std::ostream& os, const BuildOptions& options) } else { - for(const auto& f : formats) + for (const auto& f : formats) { os << direction << ": " << f << std::endl; } @@ -1360,8 +2202,11 @@ std::ostream& operator<<(std::ostream& os, const BuildOptions& options) printIOFormats(os, "Input(s)", options.inputFormats); printIOFormats(os, "Output(s)", options.outputFormats); - printShapes(os, "build", options.shapes); - printShapes(os, "calibration", options.shapesCalib); + for (size_t i = 0; i < options.optProfiles.size(); i++) + { + printShapes(os, "build", options.optProfiles[i], i); + } + printShapes(os, "calibration", options.shapesCalib, -1); return os; } @@ -1372,8 +2217,8 @@ std::ostream& operator<<(std::ostream& os, const SystemOptions& options) os << "=== System Options ===" << std::endl << "Device: " << options.device << std::endl << - "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << - (options.DLACore != -1 && options.fallback ? "(With GPU fallback)" : "") << std::endl; + "DLACore: " << (options.DLACore != -1 ?
std::to_string(options.DLACore) : "") << std::endl; +#if !TRT_WINML os << "Plugins:"; for (const auto& p : options.plugins) @@ -1382,13 +2227,32 @@ std::ostream& operator<<(std::ostream& os, const SystemOptions& options) } os << std::endl; + os << "setPluginsToSerialize:"; + + for (const auto& p : options.setPluginsToSerialize) + { + os << " " << p; + } + os << std::endl; + + os << "dynamicPlugins:"; + + for (const auto& p : options.dynamicPlugins) + { + os << " " << p; + } + os << std::endl; + + os << "ignoreParsedPluginLibs: " << options.ignoreParsedPluginLibs << std::endl; + os << std::endl; +#endif return os; // clang-format on } std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { -// clang-format off + // clang-format off os << "=== Inference Options ===" << std::endl << "Batch: "; @@ -1400,48 +2264,71 @@ std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { os << "Explicit" << std::endl; } - printShapes(os, "inference", options.shapes); - os << "Iterations: " << options.iterations << std::endl << - "Duration: " << options.duration << "s (+ " - << options.warmup << "ms warm up)" << std::endl << - "Sleep time: " << options.sleep << "ms" << std::endl << - "Idle time: " << options.idle << "ms" << std::endl << - "Streams: " << options.streams << std::endl << - "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << - "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << - "Spin-wait: " << boolToEnabled(options.spin) << std::endl << - "Multithreading: " << boolToEnabled(options.threads) << std::endl << - "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << - "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << - "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << - "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << - "Skip inference: " << boolToEnabled(options.skip) << std::endl; - -// clang-format on + printShapes(os, "inference", options.shapes, options.optProfileIndex); + + std::string wsBudget{"Disabled"}; + if (options.weightStreamingBudget.bytes == WeightStreamingBudget::kAUTOMATIC) + { + wsBudget = "Automatic"; + } + else if (options.weightStreamingBudget.bytes != WeightStreamingBudget::kDISABLE) + { + wsBudget = std::to_string(options.weightStreamingBudget.bytes) + " bytes"; + } + else if (options.weightStreamingBudget.percent != WeightStreamingBudget::kDISABLE) + { + wsBudget = std::to_string(options.weightStreamingBudget.percent) + "%"; + } + + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Idle time: " << options.idle << "ms" << std::endl << + "Inference Streams: " << options.infStreams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << + "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << + "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << + "NVTX verbosity: " << static_cast<int32_t>(options.nvtxVerbosity) << std::endl << + "Persistent Cache Ratio: "
<< static_cast<double>(options.persistentCacheRatio) << std::endl << + "Optimization Profile Index: " << options.optProfileIndex << std::endl << + "Weight Streaming Budget: " << wsBudget << std::endl; + // clang-format on + os << "Inputs:" << std::endl; for (const auto& input : options.inputs) { os << input.first << "<-" << input.second << std::endl; } + os << "Debug Tensor Save Destinations:" << std::endl; + for (auto const& fileName : options.debugTensorFileNames) + { + os << fileName.first << ": " << fileName.second << std::endl; + } + return os; } std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) { -// clang-format off - os << "=== Reporting Options ===" << std::endl << - - "Verbose: " << boolToEnabled(options.verbose) << std::endl << - "Averages: " << options.avgs << " inferences" << std::endl << - "Percentile: " << options.percentile << std::endl << - "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << - "Dump output: " << boolToEnabled(options.output) << std::endl << - "Profile: " << boolToEnabled(options.profile) << std::endl << - "Export timing to JSON file: " << options.exportTimes << std::endl << - "Export output to JSON file: " << options.exportOutput << std::endl << - "Export profile to JSON file: " << options.exportProfile << std::endl; -// clang-format on + // clang-format off + os << "=== Reporting Options ===" << std::endl << + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentiles: " << joinValuesToString(options.percentiles, ",") << std::endl << + "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export output to JSON file: " << options.exportOutput << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; + // clang-format on return os; } @@ -1461,7 +2348,7 @@ std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) } else { - for(const auto& f : formats) + for (const auto& f : formats) { os << direction << ": " << f << std::endl; } @@ -1476,197 +2363,288 @@ std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) { os << " + INT8"; } + if (options.fp8) + { + os << " + FP8"; + } + if (options.int4) + { + os << " + INT4"; + } os << std::endl; os << "Calibration file: " << options.calibFile << std::endl; os << "Serialized Network: " << options.serialized << std::endl; printIOFormats(os, "Input(s)", options.inputFormats); printIOFormats(os, "Output(s)", options.outputFormats); - +#if !TRT_WINML os << "Plugins:"; for (const auto& p : options.plugins) { os << " " << p; } +#endif + os << "timingCacheMode: "; + printTimingCache(os, options.timingCacheMode) << std::endl; + os << "timingCacheFile: " << options.timingCacheFile << std::endl; os << std::endl; return os; } void BaseModelOptions::help(std::ostream& os) { -// clang-format off - os << " --uff=<file> UFF model" << std::endl << - " --onnx=<file> ONNX model" << std::endl << - " --model=<file> Caffe model (default = no model, random weights used)" << std::endl; -// clang-format on -} - -void UffInput::help(std::ostream& os) -{ -// clang-format off - os << " --uffInput=<name>,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " "multiple times; at least one is required for UFF models" << std::endl <<
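The `1.0_MiB` and `1.0_KiB` divisions used earlier in the `--memPoolSize` handling rely on user-defined literals for base-2 sizes. A minimal sketch of how such literals can be defined; this is assumed to approximate what the sample's common headers provide, and the definitions here are illustrative rather than the ones shipped with TensorRT:

```cpp
#include <iostream>

// Minimal user-defined literals for base-2 sizes. The sample's own headers
// define equivalents; these are stand-ins for illustration only.
constexpr long double operator""_KiB(long double v)
{
    return v * (1 << 10);
}
constexpr long double operator""_MiB(long double v)
{
    return v * (1 << 20);
}

int main()
{
    // A pool size parsed into bytes is converted back to MiB for storage,
    // mirroring `workspace = memPoolSize / 1.0_MiB;` in the patch.
    double const poolBytes = 268435456.0; // 256 MiB
    std::cout << poolBytes / 1.0_MiB << " MiB" << std::endl; // prints 256
}
```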
- " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << - "X,Y,Z=H,W,C order in --uffInput)" << std::endl; -// clang-format on + // clang-format off + os << " --onnx=<file> ONNX model" << std::endl; + // clang-format on } void ModelOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Model Options ===" << std::endl; BaseModelOptions::help(os); - os << " --deploy=<file> Caffe prototxt file" << std::endl << - " --output=<name>[,<name>]* Output names (it can be specified multiple times); at least one output " "is required for UFF and Caffe" << std::endl; - UffInput::help(os); -// clang-format on + // clang-format on } void BuildOptions::help(std::ostream& os) { -// clang-format off - os << "=== Build Options ===" "\n" - " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" - " This option should not be used when the input model is ONNX or when dynamic shapes are provided." "\n" - " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" - " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" - " Note: All three of min, opt and max shapes must be supplied." "\n" - " However, if only opt shapes is supplied then it will be expanded so" "\n" - " that min shapes and max shapes are set to the same values as opt shapes." "\n" - " Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." "\n" - " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" - " Each input shape is supplied as a key-value pair where key is the input name and" "\n" - " value is the dimensions (including the batch dimension) to be used for that input." "\n" - " Each key-value pair has the key and value separated using a colon (:)." "\n" - " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" - " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" - " See --outputIOFormats help for the grammar of type and format list." "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " inputs following the same order as network inputs ID (even if only one input" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." "\n" - " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " outputs following the same order as network outputs ID (even if only one output" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." "\n" - " IO Formats: spec ::= IOfmt[\",\"spec]" "\n" - " IOfmt ::= type:fmt" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" "\n" - " --workspace=N Set workspace size in MiB."
"\n" - " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" - " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" - " Pool constraint: poolspec ::= poolfmt[\",\"poolspec]" "\n" - " poolfmt ::= pool:sizeInMiB" "\n" - " pool ::= \"workspace\"|\"dlaSRAM\"|\"dlaLocalDRAM\"|\"dlaGlobalDRAM\"" "\n" - " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" - " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " - << defaultMinTiming << ")" "\n" - " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " - << defaultAvgTiming << ")" "\n" - " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" - " and weights within the engine." "\n" - " --sparsity=spec Control sparsity (default = disabled). " "\n" - " Sparsity: spec ::= \"disable\", \"enable\", \"force\"" "\n" - " Note: Description about each of these options is as below" "\n" - " disable = do not enable sparse tactics in the builder (this is the default)" "\n" - " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" - " considered if the weights have the right sparsity pattern)" "\n" - " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" - " a sparsity pattern (even if you loaded a model yourself)" "\n" - " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" - " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" - " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" - " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" - " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" - " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" - " Precision Constaints: spec ::= \"none\" | \"obey\" | \"prefer\"" "\n" - " none = no constraints" "\n" - " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" - " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" - " otherwise" "\n" - " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers." "\n" - " Per-layer precision spec ::= layerPrecision[\",\"spec]" "\n" - " layerPrecision ::= layerName\":\"precision" "\n" - " precision ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" - " one output, then multiple types separated by \"+\" can be provided for this layer." 
"\n" - " Per-layer output type spec ::= layerOutputTypes[\",\"spec]" "\n" - " layerOutputTypes ::= layerName\":\"type" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"[\"+\"type]" "\n" - " --calib= Read INT8 calibration cache file" "\n" - " --safe Enable build safety certified engine" "\n" - " --consistency Perform consistency checking on safety certified engine" "\n" - " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" - " --saveEngine= Save the serialized engine" "\n" - " --loadEngine= Load a serialized engine" "\n" - " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" - " tactic sources (default = all available tactics)." "\n" - " Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics." "\n" - " Tactic Sources: tactics ::= [\",\"tactic]" "\n" - " tactic ::= (+|-)lib" "\n" - " lib ::= \"CUBLAS\"|\"CUBLAS_LT\"|\"CUDNN\"" "\n" - " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" - " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" - " --timingCacheFile= Save/load the serialized global timing cache" "\n" + // clang-format off + os << "=== Build Options ===" "\n" + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" + " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" + " Note: All three of min, opt and max shapes must be supplied." "\n" + " However, if only opt shapes is supplied then it will be expanded so" "\n" + " that min shapes and max shapes are set to the same values as opt shapes." "\n" + " Input names can be wrapped with escaped single quotes (ex: 'Input:0')." "\n" + " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" + " For scalars (0-D shapes), use input0:scalar or simply input0: with nothing after the colon." "\n" + " Each input shape is supplied as a key-value pair where key is the input name and" "\n" + " value is the dimensions (including the batch dimension) to be used for that input." "\n" + " Each key-value pair has the key and value separated using a colon (:)." "\n" + " Multiple input shapes can be provided via comma-separated key-value pairs, and each input name can" "\n" + " contain at most one wildcard ('*') character." "\n" + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" + " See --outputIOFormats help for the grammar of type and format list." "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " inputs following the same order as network inputs ID (even if only one input" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." 
"\n" + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " outputs following the same order as network outputs ID (even if only one output" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + R"( IO Formats: spec ::= IOfmt[","spec])" "\n" + " IOfmt ::= type:fmt" "\n" + R"( type ::= "fp32"|"fp16"|"bf16"|"int32"|"int64"|"int8"|"uint8"|"bool")" "\n" + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" "\n" + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" "\n" + " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s)" "\n" + " Supports the following base-2 suffixes: " << getAvailableUnitSuffixes() << "." "\n" + " If none of suffixes is appended, the defualt unit is in MiB." "\n" + " Note: Also accepts decimal sizes, e.g. 0.25M. Will be rounded down to the nearest integer bytes." "\n" + " In particular, for dlaSRAM the bytes will be rounded down to the nearest power of 2." "\n" + R"( Pool constraint: poolspec ::= poolfmt[","poolspec])" "\n" + " poolfmt ::= pool:size" "\n" + R"( pool ::= "workspace"|"dlaSRAM"|"dlaLocalDRAM"|"dlaGlobalDRAM"|"tacticSharedMem")" "\n" + " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)." "\n" + " Please only assign once." "\n" + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" "\n" + " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" + " and weights within the engine." "\n" + " --stripWeights Strip weights from plan. This flag works with either refit or refit with identical weights. Default""\n" + " to latter, but you can switch to the former by enabling both --stripWeights and --refit at the same""\n" + " time." "\n" + " --stripAllWeights Alias for combining the --refit and --stripWeights options. It marks all weights as refittable," "\n" + " disregarding any performance impact. Additionally, it strips all refittable weights after the " "\n" + " engine is built." "\n" + " --weightless [Deprecated] this knob has been deprecated. Please use --stripWeights" "\n" + " --versionCompatible, --vc Mark the engine as version compatible. This allows the engine to be used with newer versions" "\n" + " of TensorRT on the same host OS, as well as TensorRT's dispatch and lean runtimes." "\n" +#if !TRT_WINML + " --pluginInstanceNorm, --pi Set `kNATIVE_INSTANCENORM` to false in the ONNX parser. This will cause the ONNX parser to use" "\n" + " a plugin InstanceNorm implementation over the native implementation when parsing." "\n" +#endif + R"( --useRuntime=runtime TensorRT runtime to execute engine. "lean" and "dispatch" require loading VC engine and do)" "\n" + " not support building an engine." "\n" + R"( runtime::= "full"|"lean"|"dispatch")" "\n" + " --leanDLLPath= External lean runtime DLL to use in version compatiable mode." "\n" + " --excludeLeanRuntime When --versionCompatible is enabled, this flag indicates that the generated engine should" "\n" + " not include an embedded lean runtime. If this is set, the user must explicitly specify a" "\n" + " valid lean runtime to use when loading the engine." "\n" + " --sparsity=spec Control sparsity (default = disabled). 
" "\n" + R"( Sparsity: spec ::= "disable", "enable", "force")" "\n" + " Note: Description about each of these options is as below" "\n" + " disable = do not enable sparse tactics in the builder (this is the default)" "\n" + " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" + " considered if the weights have the right sparsity pattern)" "\n" + " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" + " a sparsity pattern (even if you loaded a model yourself)" "\n" + " [Deprecated] this knob has been deprecated." "\n" + " Please use to rewrite the weights." "\n" + " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" + " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" + " --bf16 Enable bf16 precision, in addition to fp32 (default = disabled)" "\n" + " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" + " --fp8 Enable fp8 precision, in addition to fp32 (default = disabled)" "\n" + " --int4 Enable int4 precision, in addition to fp32 (default = disabled)" "\n" + " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" + " --stronglyTyped Create a strongly typed network. (default = disabled)" "\n" + " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" + R"( Precision Constraints: spec ::= "none" | "obey" | "prefer")" "\n" + " none = no constraints" "\n" + " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" + " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" + " otherwise" "\n" + " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none))" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. Each layer name can)" "\n" + " contain at most one wildcard ('*') character." "\n" + R"( Per-layer precision spec ::= layerPrecision[","spec])" "\n" + R"( layerPrecision ::= layerName":"precision)" "\n" + R"( precision ::= "fp32"|"fp16"|"bf16"|"int32"|"int8")" "\n" + " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none)" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. Each layer name can)" "\n" + " contain at most one wildcard ('*') character. If a layer has more than" "\n" + R"( one output, then multiple types separated by "+" can be provided for this layer.)" "\n" + R"( Per-layer output type spec ::= layerOutputTypes[","spec])" "\n" + R"( layerOutputTypes ::= layerName":"type)" "\n" + R"( type ::= "fp32"|"fp16"|"bf16"|"int32"|"int8"["+"type])" "\n" + " --layerDeviceTypes=spec Specify layer-specific device type." "\n" + " The specs are read left-to-right, and later ones override earlier ones. If a layer does not have" "\n" + " a device type specified, the layer will opt for the default device type." 
"\n" + R"( Per-layer device type spec ::= layerDeviceTypePair[","spec])" "\n" + R"( layerDeviceTypePair ::= layerName":"deviceType)" "\n" + R"( deviceType ::= "GPU"|"DLA")" "\n" + " --calib= Read INT8 calibration cache file" "\n" + " --safe Enable build safety certified engine, if DLA is enable, --buildDLAStandalone will be specified" "\n" + " automatically (default = disabled)" "\n" + " --buildDLAStandalone Enable build DLA standalone loadable which can be loaded by cuDLA, when this option is enabled, " "\n" + " --allowGPUFallback is disallowed and --skipInference is enabled by default. Additionally, " "\n" + " specifying --inputIOFormats and --outputIOFormats restricts I/O data type and memory layout" "\n" + " (default = disabled)" "\n" + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers (default = disabled)" "\n" + " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" + " --saveEngine= Save the serialized engine" "\n" + " --loadEngine= Load a serialized engine" "\n" + " --getPlanVersionOnly Print TensorRT version when loaded plan was created. Works without deserialization of the plan." "\n" + " Use together with --loadEngine. Supported only for engines created with 8.6 and forward." "\n" + " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" + " tactic sources (default = all available tactics)." "\n" + " Note: Currently only cuDNN, cuBLAS, cuBLAS-LT, and edge mask convolutions are listed as optional" "\n" + " tactics." "\n" + R"( Tactic Sources: tactics ::= [","tactic])" "\n" + " tactic ::= (+|-)lib" "\n" + R"( lib ::= "CUBLAS"|"CUBLAS_LT"|"CUDNN"|"EDGE_MASK_CONVOLUTIONS")" "\n" + R"( |"JIT_CONVOLUTIONS")" "\n" + " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" + " --noCompilationCache Disable Compilation cache in builder, and the cache is part of timing cache (default is to enable compilation cache)" "\n" + " --errorOnTimingCacheMiss Emit error when a tactic being timed is not present in the timing cache (default = false)" "\n" + " --timingCacheFile= Save/load the serialized global timing cache" "\n" + " --preview=features Specify preview feature to be used by adding (+) or removing (-) preview features from the default" "\n" + R"( Preview Features: features ::= [","feature])" "\n" + " feature ::= (+|-)flag" "\n" + R"( flag ::= "aliasedPluginIO1003")" "\n" + R"( |"profileSharing0806")" "\n" + " --builderOptimizationLevel Set the builder optimization level. (default is 3)" "\n" + " Higher level allows TensorRT to spend more building time for more optimization options." "\n" + " Valid values include integers from 0 to the maximum optimization level, which is currently 5." "\n" + " --maxTactics Set the maximum number of tactics to time when there is a choice of tactics. (default is -1)" "\n" + " Larger number of tactics allow TensorRT to spend more building time on evaluating tactics." "\n" + " Default value -1 means TensorRT can decide the number of tactics based on its own heuristic." "\n" + " --hardwareCompatibilityLevel=mode Make the engine file compatible with other GPU architectures. 
(default = none)" "\n" + R"( Hardware Compatibility Level: mode ::= "none" | "ampere+")" "\n" + " none = no compatibility" "\n" + " ampere+ = compatible with Ampere and newer GPUs" "\n" + " --runtimePlatform=platform Set the target platform for runtime execution. (default = SameAsBuild)" "\n" + " When this option is enabled, --skipInference is enabled by default." "\n" + R"( RuntimePlatfrom: platform ::= "SameAsBuild" | "WindowsAMD64")" "\n" + " SameAsBuild = no requirement for cross-platform compatibility." "\n" + " WindowsAMD64 = set the target platform for engine execution as Windows AMD64 system" "\n" + " --tempdir= Overrides the default temporary directory TensorRT will use when creating temporary files." "\n" + " See IRuntime::setTemporaryDirectory API documentation for more information." "\n" + " --tempfileControls=controls Controls what TensorRT is allowed to use when creating temporary executable files." "\n" + " Should be a comma-separated list with entries in the format (in_memory|temporary):(allow|deny)." "\n" + " in_memory: Controls whether TensorRT is allowed to create temporary in-memory executable files." "\n" + " temporary: Controls whether TensorRT is allowed to create temporary executable files in the" "\n" + " filesystem (in the directory given by --tempdir)." "\n" + " For example, to allow in-memory files and disallow temporary files:" "\n" + " --tempfileControls=in_memory:allow,temporary:deny" "\n" + R"( If a flag is unspecified, the default behavior is "allow".)" "\n" + " --maxAuxStreams=N Set maximum number of auxiliary streams per inference stream that TRT is allowed to use to run " "\n" + " kernels in parallel if the network contains ops that can run in parallel, with the cost of more " "\n" + " memory usage. Set this to 0 for optimal memory usage. (default = using heuristics)" "\n" + " --profile Build with dynamic shapes using a profile with the min/max/opt shapes provided. Can be specified" "\n" + " multiple times to create multiple profiles with contiguous index." "\n" + " (ex: --profile=0 --minShapes= --optShapes= --maxShapes= --profile=1 ...)" "\n" + " --calibProfile Select the optimization profile to calibrate by index. (default = " + << defaultOptProfileIndex << ")" "\n" + " --allowWeightStreaming Enable a weight streaming engine. Must be specified with --stronglyTyped. TensorRT will disable" "\n" + " weight streaming at runtime unless --weightStreamingBudget is specified." "\n" + " --markDebug Specify list of names of tensors to be marked as debug tensors. 
Separate names with a comma" "\n" ; -// clang-format on + // clang-format on os << std::flush; } void SystemOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== System Options ===" << std::endl << " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << - " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " - "(default = disabled)" << std::endl; - os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; -// clang-format on +#if TRT_WINML + std::endl; +#else + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << + " --dynamicPlugins Plugin library (.so) to load dynamically and may be serialized with the engine if they are included in --setPluginsToSerialize (can be specified multiple times)" << std::endl << + " --setPluginsToSerialize Plugin library (.so) to be serialized with the engine (can be specified multiple times)" << std::endl << + " --ignoreParsedPluginLibs By default, when building a version-compatible engine, plugin libraries specified by the ONNX parser " << std::endl << + " are implicitly serialized with the engine (unless --excludeLeanRuntime is specified) and loaded dynamically. " << std::endl << + " Enable this flag to ignore these plugin libraries instead." << std::endl; +#endif + // clang-format on } void InferenceOptions::help(std::ostream& os) { // clang-format off os << "=== Inference Options ===" << std::endl << - " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << - " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << - " shapes are provided when the engine is built." << std::endl << " --shapes=spec Set input shapes for dynamic shapes inference inputs." << std::endl << - " Note: Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." << std::endl << + R"( Note: Input names can be wrapped with escaped single quotes (ex: 'Input:0').)" << std::endl << " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << + " For scalars (0-D shapes), use input0:scalar or simply input0: with nothing after the colon."<< std::endl << " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << " Each key-value pair has the key and value separated using a colon (:)." << std::endl << - " Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl << + " Multiple input shapes can be provided via comma-separated key-value pairs, and each input " << std::endl << + " name can contain at most one wildcard ('*') character." << std::endl << " --loadInputs=spec Load input values from files (default = generate random inputs). Input names can be " "wrapped with single quotes (ex: 'Input:0')" << std::endl << - " Input values spec ::= Ival[\",\"spec]" << std::endl << - " Ival ::= name\":\"file" << std::endl << + R"( Input values spec ::= Ival[","spec])" << std::endl << + R"( Ival ::= name":"file)" << std::endl << + " Consult the README for more information on generating files for custom inputs." 
<< std::endl << " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = " << defaultWarmUp << ")" << std::endl << " --duration=N Run performance measurements for at least N seconds wallclock time (default = " << defaultDuration << ")" << std::endl << + " If -1 is specified, inference will keep running unless stopped manually" << std::endl << " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute " "(default = " << defaultSleep << ")" << std::endl << " --idleTime=N Sleep N milliseconds between two continuous iterations" "(default = " << defaultIdle << ")" << std::endl << - " --streams=N Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl << + " --infStreams=N Instantiate N execution contexts to run inference concurrently " + "(default = " << defaultStreams << ")" << std::endl << " --exposeDMA Serialize DMA transfers to and from device (default = disabled)." << std::endl << " --noDataTransfers Disable DMA transfers to and from device (default = enabled)." << std::endl << - " --useManagedMemory Use managed memory instead of seperate host and device allocations (default = disabled)." << std::endl << + " --useManagedMemory Use managed memory instead of separate host and device allocations (default = disabled)." << std::endl << " --useSpinWait Actively synchronize on GPU events. This option may decrease synchronization time but " "increase CPU usage and power (default = disabled)" << std::endl << " --threads Enable multithreading to drive engines with independent threads" @@ -1677,42 +2655,84 @@ void InferenceOptions::help(std::ostream& os) " --timeRefit Time the amount of time it takes to refit the engine before inference." << std::endl << " --separateProfileRun Do not attach the profiler in the benchmark run; if profiling is enabled, a second " "profile run will be executed (default = disabled)" << std::endl << - " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; + " --skipInference Exit after the engine has been built and skip inference perf measurement " + "(default = disabled)" << std::endl << + " --persistentCacheRatio Set the persistentCacheLimit in ratio, 0.5 represent half of max persistent L2 size " + "(default = 0)" << std::endl << + " --useProfile Set the optimization profile for the inference context " + "(default = " << defaultOptProfileIndex << " )." << std::endl << + " --allocationStrategy=spec Specify how the internal device memory for inference is allocated." << std::endl << + R"( Strategy: spec ::= "static", "profile", "runtime")" << std::endl << + " static = Allocate device memory based on max size across all profiles." << std::endl << + " profile = Allocate device memory based on max size of the current profile." << std::endl << + " runtime = Allocate device memory based on the actual input shapes." << std::endl << + " --saveDebugTensors Specify list of names of tensors to turn on the debug state" << std::endl << + " and filename to save raw outputs to." << std::endl << + " These tensors must be specified as debug tensors during build time." << std::endl << + R"( Input values spec ::= Ival[","spec])" << std::endl << + R"( Ival ::= name":"file)" << std::endl << + " --weightStreamingBudget Set the maximum amount of GPU memory TensorRT is allowed to use for weights." 
<< std::endl << + " It can take on the following values:" << std::endl << + " -2: (default) Disable weight streaming at runtime." << std::endl << + " -1: TensorRT will automatically decide the budget." << std::endl << + " 0-100%: Percentage of streamable weights that reside on the GPU." << std::endl << + " 0% saves the most memory but will have the worst performance." << std::endl << + " Requires the % character." << std::endl << + " >=0B: The exact amount of streamable weights that reside on the GPU. Supports the " << std::endl << + " following base-2 suffixes: " << getAvailableUnitSuffixes() << "." << std::endl; // clang-format on } void ReportingOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Reporting Options ===" << std::endl << " --verbose Use verbose logging (default = false)" << std::endl << " --avgRuns=N Report performance measurements averaged over N consecutive " "iterations (default = " << defaultAvgRuns << ")" << std::endl << - " --percentile=P Report performance for the P percentage (0<=P<=100, 0 " + " --percentile=P1,P2,P3,... Report performance for the P1,P2,P3,... percentages (0<=P_i<=100, 0 " "representing max perf, and 100 representing min perf; (default" - " = " << defaultPercentile << "%)" << std::endl << + " = " << joinValuesToString(defaultPercentiles, ",") << "%)" << std::endl << " --dumpRefit Print the refittable layers and weights from a refittable " "engine" << std::endl << " --dumpOutput Print the output tensor(s) of the last inference iteration " "(default = disabled)" << std::endl << + " --dumpRawBindingsToFile Print the input/output tensor(s) of the last inference iteration to file" + "(default = disabled)" << std::endl << " --dumpProfile Print profile information per layer (default = disabled)" << std::endl << " --dumpLayerInfo Print layer information of the engine to console " "(default = disabled)" << std::endl << + " --dumpOptimizationProfile Print the optimization profile(s) information " + "(default = disabled)" << std::endl << " --exportTimes= Write the timing results in a json file (default = disabled)" << std::endl << " --exportOutput= Write the output tensors to a json file (default = disabled)" << std::endl << " --exportProfile= Write the profile information per layer in a json file " "(default = disabled)" << std::endl << " --exportLayerInfo= Write the layer information of the engine in a json file " "(default = disabled)" << std::endl; -// clang-format on + // clang-format on +} + +void TaskInferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Task Inference Options ===" << std::endl << + " engine= Specify a serialized engine for this task" << std::endl << + " device=N Specify a GPU device for this task" << std::endl << + " DLACore=N Specify a DLACore for this task" << std::endl << + " batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used for explicit batch engines" << std::endl << + " graph=1 Use cuda graph for this task" << std::endl << + " persistentCacheRatio=[0-1] Set the persistentCacheLimit ratio for this task (default = 0)" << std::endl; + // clang-format on } void helpHelp(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Help ===" << std::endl << " --help, -h Print this message" << std::endl; -// clang-format on + // clang-format on } void AllOptions::help(std::ostream& os) @@ -1723,19 +2743,6 @@ void AllOptions::help(std::ostream& os) os << std::endl; 
InferenceOptions::help(os); os << std::endl; -// clang-format off - os << "=== Build and Inference Batch Options ===" << std::endl << - " When using implicit batch, the max batch size of the engine, if not given, " << std::endl << - " is set to the inference batch size;" << std::endl << - " when using explicit batch, if shapes are specified only for inference, they " << std::endl << - " will be used also as min/opt/max in the build profile; if shapes are " << std::endl << - " specified only for the build, the opt shapes will be used also for inference;" << std::endl << - " if both are specified, they must be compatible; and if explicit batch is " << std::endl << - " enabled but neither is specified, the model must provide complete static" << std::endl << - " dimensions, including batch size, for all inputs" << std::endl << - " Using ONNX models automatically forces explicit batch." << std::endl << - std::endl; - // clang-format on ReportingOptions::help(os); os << std::endl; SystemOptions::help(os); @@ -1745,7 +2752,7 @@ void AllOptions::help(std::ostream& os) void SafeBuilderOptions::printHelp(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Mandatory ===" << std::endl << " --onnx= ONNX model" << std::endl << " " << std::endl << @@ -1759,20 +2766,34 @@ void SafeBuilderOptions::printHelp(std::ostream& os) " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << " outputs following the same order as network outputs ID (even if only one output" << std::endl << " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << - " IO Formats: spec ::= IOfmt[\",\"spec]" << std::endl << + R"( IO Formats: spec ::= IOfmt[","spec])" << std::endl << " IOfmt ::= type:fmt" << std::endl << - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl << - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" << std::endl << + R"( type ::= "fp32"|"fp16"|"int32"|"int8")" << std::endl << + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" << std::endl << + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" << std::endl << " --int8 Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl << - " --consistency Enable consistency check for serialized engine, (default = disabled)" << std::endl << " --std Build standard serialized engine, (default = disabled)" << std::endl << " --calib= Read INT8 calibration cache file" << std::endl << " --serialized= Save the serialized network" << std::endl << - " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl << +#if !TRT_WINML + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << +#endif " --verbose or -v Use verbose logging (default = false)" << std::endl << " --help or -h Print this message" << std::endl << - " " << std::endl; -// clang-format on + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" << std::endl << + " --timingCacheFile= Save/load the serialized global timing cache" << std::endl << + " --sparsity=spec Control sparsity (default = disabled). 
" << std::endl << + R"( Sparsity: spec ::= "disable", "enable", "force")" << std::endl << + " Note: Description about each of these options is as below" << std::endl << + " disable = do not enable sparse tactics in the builder (this is the default)" << std::endl << + " enable = enable sparse tactics in the builder (but these tactics will only be" << std::endl << + " considered if the weights have the right sparsity pattern)" << std::endl << + " force = enable sparse tactics in the builder and force-overwrite the weights to have" << std::endl << + " a sparsity pattern" << std::endl << + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " << std::endl << + "" << defaultAvgTiming << ")" << std::endl << + "" << std::endl; + // clang-format on } } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleOptions.h b/src/Detector/tensorrt_yolo/common/sampleOptions.h index 8975e1ea..8ca0a655 100644 --- a/src/Detector/tensorrt_yolo/common/sampleOptions.h +++ b/src/Detector/tensorrt_yolo/common/sampleOptions.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -32,9 +34,10 @@ namespace sample { // Build default params -constexpr int32_t maxBatchNotProvided{0}; -constexpr int32_t defaultMinTiming{1}; constexpr int32_t defaultAvgTiming{8}; +constexpr int32_t defaultMaxAuxStreams{-1}; +constexpr int32_t defaultBuilderOptimizationLevel{-1}; +constexpr int32_t defaultMaxTactics{-1}; // System default params constexpr int32_t defaultDevice{0}; @@ -44,14 +47,16 @@ constexpr int32_t defaultBatch{1}; constexpr int32_t batchNotProvided{0}; constexpr int32_t defaultStreams{1}; constexpr int32_t defaultIterations{10}; +constexpr int32_t defaultOptProfileIndex{0}; constexpr float defaultWarmUp{200.F}; constexpr float defaultDuration{3.F}; constexpr float defaultSleep{}; constexpr float defaultIdle{}; +constexpr float defaultPersistentCacheRatio{0}; // Reporting default params constexpr int32_t defaultAvgRuns{10}; -constexpr float defaultPercentile{99}; +constexpr std::array defaultPercentiles{90, 95, 99}; enum class PrecisionConstraints { @@ -63,9 +68,7 @@ enum class PrecisionConstraints enum class ModelFormat { kANY, - kCAFFE, - kONNX, - kUFF + kONNX }; enum class SparsityFlag @@ -82,7 +85,55 @@ enum class TimingCacheMode kGLOBAL }; -using Arguments = std::unordered_multimap; +enum class MemoryAllocationStrategy +{ + kSTATIC, //< Allocate device memory based on max size across all profiles. + kPROFILE, //< Allocate device memory based on max size of the current profile. + kRUNTIME, //< Allocate device memory based on the current input shapes. +}; + +//! +//! \enum RuntimeMode +//! +//! \brief Used to dictate which TensorRT runtime library to dynamically load. +//! +enum class RuntimeMode +{ + //! Maps to libnvinfer.so or nvinfer.dll + kFULL, + + //! 
- static void help(std::ostream& out); +class Options +{ +public: + virtual ~Options() = default; + virtual void parse(Arguments& arguments) = 0; }; -struct UffInput : public Options +class BaseModelOptions : public Options { - std::vector<std::pair<std::string, nvinfer1::Dims>> inputs; - bool NHWC{false}; +public: + ModelFormat format{ModelFormat::kANY}; + std::string model; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct ModelOptions : public Options +class ModelOptions : public Options { +public: BaseModelOptions baseModel; std::string prototxt; std::vector<std::string> outputs; - UffInput uffInputs; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct BuildOptions : public Options +constexpr nvinfer1::TempfileControlFlags getTempfileControlDefaults() { - int32_t maxBatch{maxBatchNotProvided}; + using F = nvinfer1::TempfileControlFlag; + return (1U << static_cast<uint32_t>(F::kALLOW_TEMPORARY_FILES)) + | (1U << static_cast<uint32_t>(F::kALLOW_IN_MEMORY_FILES)); +} +
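getTempfileControlDefaults() above packs the two TempfileControlFlag bits into a mask with everything allowed. As a quick illustration of how such a mask is manipulated (a sketch, not part of the patch; `makeControls` is a hypothetical helper), denying one class of temporary files is just clearing its bit:

```cpp
#include <cstdint>
#include "NvInferRuntime.h"

// Sketch: build a tempfile-control mask equivalent to
// --tempfileControls=in_memory:deny,temporary:allow.
nvinfer1::TempfileControlFlags makeControls()
{
    using F = nvinfer1::TempfileControlFlag;
    // Start from "everything allowed" (what getTempfileControlDefaults() returns)...
    nvinfer1::TempfileControlFlags controls = (1U << static_cast<uint32_t>(F::kALLOW_TEMPORARY_FILES))
        | (1U << static_cast<uint32_t>(F::kALLOW_IN_MEMORY_FILES));
    // ...then clear the in-memory bit to deny in-memory executable files.
    controls &= ~(1U << static_cast<uint32_t>(F::kALLOW_IN_MEMORY_FILES));
    return controls; // e.g. passed to IRuntime::setTempfileControlFlags()
}
```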
+class BuildOptions : public Options +{ +public: + // Unit in MB. double workspace{-1.0}; + // Unit in MB. double dlaSRAM{-1.0}; + // Unit in MB. double dlaLocalDRAM{-1.0}; + // Unit in MB. double dlaGlobalDRAM{-1.0}; - int32_t minTiming{defaultMinTiming}; + // Unit in KB. + double tacticSharedMem{-1.0}; int32_t avgTiming{defaultAvgTiming}; + size_t calibProfile{defaultOptProfileIndex}; bool tf32{true}; bool fp16{false}; + bool bf16{false}; bool int8{false}; + bool fp8{false}; + bool int4{false}; + bool stronglyTyped{false}; bool directIO{false}; PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE}; LayerPrecisions layerPrecisions; LayerOutputTypes layerOutputTypes; + LayerDeviceTypes layerDeviceTypes; + StringSet debugTensors; + StringSet debugTensorStates; bool safe{false}; - bool consistency{false}; + bool buildDLAStandalone{false}; + bool allowGPUFallback{false}; bool restricted{false}; + bool skipInference{false}; bool save{false}; bool load{false}; bool refittable{false}; + bool stripWeights{false}; + bool versionCompatible{false}; + bool pluginInstanceNorm{false}; + bool excludeLeanRuntime{false}; + bool disableCompilationCache{false}; + int32_t builderOptimizationLevel{defaultBuilderOptimizationLevel}; + int32_t maxTactics{defaultMaxTactics}; SparsityFlag sparsity{SparsityFlag::kDISABLE}; -#if (NV_TENSORRT_MAJOR > 7) - nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; -#else - nvinfer1::ProfilingVerbosity profilingVerbosity{ nvinfer1::ProfilingVerbosity::kDEFAULT }; -#endif + nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; std::string engine; std::string calibration; - std::unordered_map<std::string, ShapeRange> shapes; - std::unordered_map<std::string, ShapeRange> shapesCalib; + using ShapeProfile = std::unordered_map<std::string, ShapeRange>; + std::vector<ShapeProfile> optProfiles; + ShapeProfile shapesCalib; std::vector<IOFormat> inputFormats; std::vector<IOFormat> outputFormats; nvinfer1::TacticSources enabledTactics{0}; nvinfer1::TacticSources disabledTactics{0}; TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; std::string timingCacheFile{}; + bool errorOnTimingCacheMiss{false}; + // C++11 does not automatically generate a hash function for enum class. + // Use int32_t to support C++11 compilers. + std::unordered_map<int32_t, bool> previewFeatures; + nvinfer1::HardwareCompatibilityLevel hardwareCompatibilityLevel{nvinfer1::HardwareCompatibilityLevel::kNONE}; + nvinfer1::RuntimePlatform runtimePlatform{nvinfer1::RuntimePlatform::kSAME_AS_BUILD}; + std::string tempdir{}; + nvinfer1::TempfileControlFlags tempfileControls{getTempfileControlDefaults()}; + RuntimeMode useRuntime{RuntimeMode::kFULL}; + std::string leanDLLPath{}; + int32_t maxAuxStreams{defaultMaxAuxStreams}; + bool getPlanVersionOnly{false}; + + bool allowWeightStreaming{false}; + void parse(Arguments& arguments) override; static void help(std::ostream& out); };
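The comment inside BuildOptions above explains why previewFeatures is keyed by int32_t rather than by the enum itself: C++11 standard libraries did not guarantee a std::hash specialization for enum classes (that defect was fixed for C++14). A small self-contained illustration of the workaround, using a stand-in enum rather than the real nvinfer1 type:

```cpp
#include <cstdint>
#include <unordered_map>

enum class Feature : int32_t { kA, kB }; // stand-in for e.g. a preview-feature enum

int main()
{
    // std::unordered_map<Feature, bool> may fail to compile on strict C++11
    // libraries (no std::hash<Feature>), so the key is stored as int32_t.
    std::unordered_map<int32_t, bool> features;
    features[static_cast<int32_t>(Feature::kA)] = true;
    return static_cast<int>(features.count(static_cast<int32_t>(Feature::kB)));
}
```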
-struct SystemOptions : public Options +class SystemOptions : public Options { +public: int32_t device{defaultDevice}; int32_t DLACore{-1}; - bool fallback{false}; + bool ignoreParsedPluginLibs{false}; std::vector<std::string> plugins; + std::vector<std::string> setPluginsToSerialize; + std::vector<std::string> dynamicPlugins; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct InferenceOptions : public Options +class InferenceOptions : public Options { +public: int32_t batch{batchNotProvided}; int32_t iterations{defaultIterations}; - int32_t streams{defaultStreams}; + int32_t infStreams{defaultStreams}; + int32_t optProfileIndex{defaultOptProfileIndex}; float warmup{defaultWarmUp}; float duration{defaultDuration}; float sleep{defaultSleep}; float idle{defaultIdle}; + float persistentCacheRatio{defaultPersistentCacheRatio}; bool overlap{true}; bool skipTransfers{false}; bool useManaged{false}; bool spin{false}; bool threads{false}; bool graph{false}; - bool skip{false}; bool rerun{false}; bool timeDeserialize{false}; bool timeRefit{false}; + bool setOptProfile{false}; std::unordered_map<std::string, std::string> inputs; - std::unordered_map<std::string, std::vector<int32_t>> shapes; + using ShapeProfile = std::unordered_map<std::string, std::vector<int32_t>>; + ShapeProfile shapes; + nvinfer1::ProfilingVerbosity nvtxVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; + MemoryAllocationStrategy memoryAllocationStrategy{MemoryAllocationStrategy::kSTATIC}; + std::unordered_map<std::string, std::string> debugTensorFileNames; + + WeightStreamingBudget weightStreamingBudget; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct ReportingOptions : public Options +class ReportingOptions : public Options { +public: bool verbose{false}; int32_t avgs{defaultAvgRuns}; - float percentile{defaultPercentile}; + std::vector<float> percentiles{defaultPercentiles.begin(), defaultPercentiles.end()}; bool refit{false}; bool output{false}; + bool dumpRawBindings{false}; bool profile{false}; bool layerInfo{false}; + bool optProfileInfo{false}; std::string exportTimes; std::string exportOutput; std::string exportProfile; @@ -229,8 +346,9 @@ struct ReportingOptions : public Options static void help(std::ostream& out); }; -struct SafeBuilderOptions : public Options +class SafeBuilderOptions : public Options { +public: std::string serialized{}; std::string onnxModelFile{}; bool help{false}; @@ -238,18 +356,24 @@ struct SafeBuilderOptions : public Options std::vector<IOFormat> inputFormats; std::vector<IOFormat> outputFormats; bool int8{false}; + bool fp8{false}; + bool int4{false}; std::string calibFile{}; std::vector<std::string> plugins; - bool consistency{false}; bool standard{false}; + TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; + std::string timingCacheFile{}; + SparsityFlag sparsity{SparsityFlag::kDISABLE}; + int32_t avgTiming{defaultAvgTiming}; void parse(Arguments& arguments) override; static void printHelp(std::ostream& out); }; -struct AllOptions : public Options +class AllOptions : public
Options { +public: ModelOptions model; BuildOptions build; SystemOptions system; @@ -262,6 +386,20 @@ struct AllOptions : public Options static void help(std::ostream& out); }; +class TaskInferenceOptions : public Options +{ +public: + std::string engine; + int32_t device{defaultDevice}; + int32_t DLACore{-1}; + int32_t batch{batchNotProvided}; + bool graph{false}; + float persistentCacheRatio{defaultPersistentCacheRatio}; + void parse(Arguments& arguments) override; + static void help(std::ostream& out); +}; + + Arguments argsToArgumentsMap(int32_t argc, char* argv[]); bool parseHelp(Arguments& arguments); @@ -272,8 +410,6 @@ void helpHelp(std::ostream& out); std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options); -std::ostream& operator<<(std::ostream& os, const UffInput& input); - std::ostream& operator<<(std::ostream& os, const IOFormat& format); std::ostream& operator<<(std::ostream& os, const ShapeRange& dims); @@ -292,6 +428,10 @@ std::ostream& operator<<(std::ostream& os, const AllOptions& options); std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options); +std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype); + +std::ostream& operator<<(std::ostream& os, nvinfer1::DeviceType devType); + inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) { for (int32_t i = 0; i < dims.nbDims; ++i) @@ -329,13 +469,11 @@ inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole ro os << "Constant"; break; } -#if (NV_TENSORRT_MAJOR > 7) case nvinfer1::WeightsRole::kANY: { os << "Any"; break; } -#endif } return os; diff --git a/src/Detector/tensorrt_yolo/common/sampleReporting.cpp b/src/Detector/tensorrt_yolo/common/sampleReporting.cpp index a92938c5..e9dda6e0 100644 --- a/src/Detector/tensorrt_yolo/common/sampleReporting.cpp +++ b/src/Detector/tensorrt_yolo/common/sampleReporting.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -26,6 +27,8 @@ #include "sampleOptions.h" #include "sampleReporting.h" +using namespace nvinfer1; + namespace sample { @@ -45,7 +48,7 @@ float findPercentile(float percentile, std::vector<InferenceTime> const& timings { return std::numeric_limits<float>::infinity(); } - if (percentile < 0.0f || percentile > 100.0f) + if (percentile < 0.F || percentile > 100.F) { throw std::runtime_error("percentile is not in [0, 100]!"); } @@ -99,8 +102,26 @@ float findCoeffOfVariance(std::vector<InferenceTime> const& timings, T const& to inline InferenceTime traceToTiming(const InferenceTrace& a) { - return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), - (a.d2hEnd - a.d2hStart), (a.d2hEnd - a.h2dStart)); + return InferenceTime( + (a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), (a.d2hEnd - a.d2hStart)); +} + +inline std::string dimsToString(Dims const& shape) +{ + std::stringstream ss; + + if (shape.nbDims == 0) + { + ss << "scalar"; + } + else + { + for (int32_t i = 0; i < shape.nbDims; i++) + { + ss << shape.d[i] << (i != shape.nbDims - 1 ? "x" : ""); + } + } + return ss.str(); } } // namespace @@ -113,29 +134,40 @@ void printProlog(int32_t warmups, int32_t timings, float warmupMs, float benchTi void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os) { - int32_t count = 0; + int64_t count = 0; InferenceTime sum; os << std::endl; os << "=== Trace details ===" << std::endl; os << "Trace averages of " << runsPerAvg << " runs:" << std::endl; - for (auto const& t : timings) + + // Show only the first N lines and the last N lines, where N = kTIMING_PRINT_THRESHOLD. + constexpr int64_t kTIMING_PRINT_THRESHOLD{200}; + int64_t const maxNbTimings{kTIMING_PRINT_THRESHOLD * runsPerAvg}; + + for (int64_t idx = 0, size = timings.size(); idx < size; ++idx) { - sum += t; + // Omit some latency printing to avoid very long logs. + if (size > 2 * maxNbTimings && idx == maxNbTimings) + { + os << "... Omitting " << (size - 2 * maxNbTimings) << " lines" << std::endl; + idx = size - kTIMING_PRINT_THRESHOLD * runsPerAvg - 1; + } + + sum += timings[idx]; if (++count == runsPerAvg) { // clang-format off os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg - << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " << sum.e2e / runsPerAvg - << " ms, enqueue " << sum.enq / runsPerAvg << " ms)" << std::endl; + << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (enqueue " << sum.enq / runsPerAvg + << " ms)" << std::endl; // clang-format on count = 0; sum.enq = 0; sum.h2d = 0; sum.compute = 0; sum.d2h = 0; - sum.e2e = 0; } } } @@ -166,14 +198,10 @@ void printMetricExplanations(std::ostream& os) os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a " "single query." << std::endl; - os << "End-to-End Host Latency: the duration from when the H2D of a query is called to when the D2H of the same " "query is completed, which includes the latency to wait for the completion of the previous query. This is " "the latency of a query if multiple queries are enqueued consecutively." - << std::endl; }
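findPercentile is only touched here for its range check; its body sits earlier in the file and does not appear in this hunk. For orientation, a nearest-rank percentile over pre-sorted samples looks roughly like the sketch below. This is an assumption-laden illustration over plain floats, not the verbatim implementation, which additionally goes through a metric-getter functor:

```cpp
#include <algorithm>
#include <cmath>
#include <stdexcept>
#include <vector>

// Sketch: nearest-rank percentile of samples sorted ascending.
float percentileOfSorted(float percentile, std::vector<float> const& sorted)
{
    if (percentile < 0.F || percentile > 100.F)
    {
        throw std::runtime_error("percentile is not in [0, 100]!");
    }
    if (sorted.empty())
    {
        return 0.F; // nothing measured
    }
    // rank = ceil(P/100 * N), clamped to a valid zero-based index.
    int const rank = static_cast<int>(std::ceil(percentile / 100.F * static_cast<float>(sorted.size())));
    int const index = std::max(rank - 1, 0);
    return sorted.at(index);
}
```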
PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings, - std::function<float(InferenceTime const&)> metricGetter, float percentile) + std::function<float(InferenceTime const&)> metricGetter, std::vector<float> const& percentiles) { auto const metricComparator = [metricGetter](InferenceTime const& a, InferenceTime const& b) { return metricGetter(a) < metricGetter(b); }; @@ -183,40 +211,44 @@ PerformanceResult result; result.min = metricGetter(newTimings.front()); result.max = metricGetter(newTimings.back()); - result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, metricAccumulator) / newTimings.size(); + result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0F, metricAccumulator) / newTimings.size(); result.median = findMedian(newTimings, metricGetter); - result.percentile = findPercentile(percentile, newTimings, metricGetter); + for (auto percentile : percentiles) + { + result.percentiles.emplace_back(findPercentile(percentile, newTimings, metricGetter)); + } result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean); return result; } -void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize, - std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, std::vector<float> const& percentiles, + int32_t batchSize, int32_t infStreams, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) { float const throughput = batchSize * timings.size() / walltimeMs * 1000; auto const getLatency = [](InferenceTime const& t) { return t.latency(); }; - auto const latencyResult = getPerformanceResult(timings, getLatency, percentile); - - auto const getEndToEnd = [](InferenceTime const& t) { return t.e2e; }; - auto const e2eLatencyResult = getPerformanceResult(timings, getEndToEnd, percentile); + auto const latencyResult = getPerformanceResult(timings, getLatency, percentiles); auto const getEnqueue = [](InferenceTime const& t) { return t.enq; }; - auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentile); + auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentiles); auto const getH2d = [](InferenceTime const& t) { return t.h2d; }; - auto const h2dResult = getPerformanceResult(timings, getH2d, percentile); + auto const h2dResult = getPerformanceResult(timings, getH2d, percentiles); auto const getCompute = [](InferenceTime const& t) { return t.compute; }; - auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentile); + auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentiles); auto const getD2h = [](InferenceTime const& t) { return t.d2h; }; - auto const d2hResult = getPerformanceResult(timings, getD2h, percentile); + auto const d2hResult = getPerformanceResult(timings, getD2h, percentiles); - auto const toPerfString = [percentile](const PerformanceResult& r) { + auto const toPerfString = [&](const PerformanceResult& r) { std::stringstream s; s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean << " ms, " - << "median = " << r.median << " ms, percentile(" << percentile << "%) = " << r.percentile << " ms"; + << "median = " << r.median << " ms"; + for (int32_t i = 0, n = percentiles.size(); i < n; ++i) + { + s << ", percentile(" << percentiles[i] << "%) = " << r.percentiles[i] << " ms"; + } return s.str(); };
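getPerformanceResult also fills coeffVar via findCoeffOfVariance, whose body is likewise outside this hunk. The metric is the usual standard-deviation-to-mean ratio; a minimal standalone version over plain floats (a sketch, not the verbatim implementation):

```cpp
#include <cmath>
#include <numeric>
#include <vector>

// Sketch: coefficient of variation (stddev / mean), the quantity reported
// as the "coeffVar" field of PerformanceResult.
float coeffOfVariance(std::vector<float> const& v)
{
    if (v.empty())
    {
        return 0.F;
    }
    float const mean = std::accumulate(v.begin(), v.end(), 0.F) / v.size();
    float var = 0.F;
    for (float x : v)
    {
        var += (x - mean) * (x - mean); // population variance accumulator
    }
    var /= v.size();
    return mean != 0.F ? std::sqrt(var) / mean : 0.F;
}
```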
@@ -224,7 +256,6 @@ void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, fl osInfo << "=== Performance summary ===" << std::endl; osInfo << "Throughput: " << throughput << " qps" << std::endl; osInfo << "Latency: " << toPerfString(latencyResult) << std::endl; - osInfo << "End-to-End Host Latency: " << toPerfString(e2eLatencyResult) << std::endl; osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl; osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl; osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl; @@ -268,6 +299,13 @@ << "stability." << std::endl; } + // Report warnings if multiple inference streams are used. + if (infStreams > 1) + { + osWarning << "* Multiple inference streams are used. Latencies may not be accurate since inferences may run in " + << "parallel. Please use \"Throughput\" as the performance metric instead." << std::endl; + } + // Explain what the metrics mean. osInfo << "Explanations of the performance metrics are printed in the verbose logs." << std::endl; printMetricExplanations(osVerbose); @@ -275,27 +313,28 @@ osInfo << std::endl; } -void printPerformanceReport(std::vector<InferenceTrace> const& trace, const ReportingOptions& reporting, float warmupMs, - int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +void printPerformanceReport(std::vector<InferenceTrace> const& trace, ReportingOptions const& reportingOpts, + InferenceOptions const& infOpts, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) { + int32_t batchSize = infOpts.batch; + float const warmupMs = infOpts.warmup; auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; }; auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup); int32_t const warmups = noWarmup - trace.begin(); float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart; - // when implicit batch used, batchSize = options.inference.batch, which is parsed through --batch - // when explicit batch used, batchSize = options.inference.batch = 0 // treat inference with explicit batch as a single query and report the throughput batchSize = batchSize ? batchSize : 1; printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, warmupMs, benchTime, osInfo); std::vector<InferenceTime> timings(trace.size() - warmups); std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming); - printTiming(timings, reporting.avgs, osInfo); - printEpilog(timings, benchTime, reporting.percentile, batchSize, osInfo, osWarning, osVerbose); + printTiming(timings, reportingOpts.avgs, osInfo); + printEpilog( + timings, benchTime, reportingOpts.percentiles, batchSize, infOpts.infStreams, osInfo, osWarning, osVerbose); - if (!reporting.exportTimes.empty()) + if (!reportingOpts.exportTimes.empty()) { - exportJSONTrace(trace, reporting.exportTimes); + exportJSONTrace(trace, reportingOpts.exportTimes, warmups); } } @@ -303,15 +342,16 @@ //! [ value, ...] //! value ::= { "start enq" : time, "end enq" : time, "start h2d" : time, "end h2d" : time, "start compute" : time, //! "end compute" : time, "start d2h" : time, "end d2h" : time, "h2d" : time, "compute" : time, -//! "d2h" : time, "latency" : time, "end to end" : time } +//! "d2h" : time, "latency" : time } //!
-void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName) +void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName, int32_t const nbWarmups) { std::ofstream os(fileName, std::ofstream::trunc); os << "[" << std::endl; char const* sep = " "; - for (auto const& t : trace) + for (auto iter = trace.begin() + nbWarmups; iter < trace.end(); ++iter) { + auto const& t = *iter; InferenceTime const it(traceToTiming(t)); os << sep << "{ "; sep = ", "; @@ -321,8 +361,8 @@ << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep << "\"startD2hMs\" : " << t.d2hStart << sep << "\"endD2hMs\" : " << t.d2hEnd << sep << "\"h2dMs\" : " << it.h2d << sep << "\"computeMs\" : " << it.compute << sep - << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << sep - << "\"endToEndMs\" : " << it.e2e << " }" << std::endl; + << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << " }" + << std::endl; // clang-format on } os << "]" << std::endl; @@ -346,42 +386,49 @@ void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept } } - mIterator->timeMs += timeMs; + mIterator->timeMs.push_back(timeMs); ++mIterator; } void Profiler::print(std::ostream& os) const noexcept { - std::string const nameHdr("Layer"); - std::string const timeHdr(" Time (ms)"); - std::string const avgHdr(" Avg. Time (ms)"); - std::string const percentageHdr(" Time %"); + std::string const nameHdr(" Layer"); + std::string const timeHdr(" Time(ms)"); + std::string const avgHdr(" Avg.(ms)"); + std::string const medHdr(" Median(ms)"); + std::string const percentageHdr(" Time(%)"); float const totalTimeMs = getTotalTime(); - auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { return a.name.size() < b.name.size(); }; - auto const longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer); - auto const nameLength = std::max(longestName->name.size() + 1, nameHdr.size()); auto const timeLength = timeHdr.size(); auto const avgLength = avgHdr.size(); + auto const medLength = medHdr.size(); auto const percentageLength = percentageHdr.size(); os << std::endl << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl - << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr << std::endl; + << timeHdr << avgHdr << medHdr << percentageHdr << nameHdr << std::endl; for (auto const& p : mLayers) { + if (p.timeMs.empty() || getTotalTime(p) == 0.F) + { + // There is no point in printing profiling data for a layer that didn't run at all. + continue; + } // clang-format off - os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs - << std::setw(avgLength) << std::fixed << std::setprecision(4) << p.timeMs / mUpdatesCount - << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100 - << std::endl; + os << std::setw(timeLength) << std::fixed << std::setprecision(2) << getTotalTime(p) + << std::setw(avgLength) << std::fixed << std::setprecision(4) << getAvgTime(p) + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime(p) + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << getTotalTime(p) / totalTimeMs * 100 + << " " << p.name << std::endl; } { - os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2) + os << std::setw(timeLength) <<
std::fixed << std::setprecision(2) << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount - << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl; + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime() + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 + << " Total" << std::endl; // clang-format on } os << std::endl; @@ -397,10 +444,11 @@ void Profiler::exportJSONProfile(std::string const& fileName) const noexcept for (auto const& l : mLayers) { // clang-format off - os << ", {" << " \"name\" : \"" << l.name << "\"" - ", \"timeMs\" : " << l.timeMs - << ", \"averageMs\" : " << l.timeMs / mUpdatesCount - << ", \"percentage\" : " << l.timeMs / totalTimeMs * 100 + os << ", {" << R"( "name" : ")" << l.name << R"(")" + R"(, "timeMs" : )" << getTotalTime(l) + << R"(, "averageMs" : )" << getAvgTime(l) + << R"(, "medianMs" : )" << getMedianTime(l) + << R"(, "percentage" : )" << getTotalTime(l) / totalTimeMs * 100 << " }" << std::endl; // clang-format on } @@ -415,8 +463,13 @@ void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bind void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) { - os << "Output Tensors:" << std::endl; - bindings.dumpOutputs(context, os); + auto isOutput = [](Binding const& b) { return !b.isInput; }; + bindings.dumpBindings(context, isOutput, os); +} + +void dumpRawBindingsToFiles(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) +{ + bindings.dumpRawBindingToFiles(context, os); } void exportJSONOutput( @@ -429,10 +482,10 @@ void exportJSONOutput( for (auto const& binding : output) { // clang-format off - os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl; + os << sep << R"({ "name" : ")" << binding.first << "\"" << std::endl; sep = ", "; - os << " " << sep << "\"dimensions\" : \""; - bindings.dumpBindingDimensions(binding.second, context, os); + os << " " << sep << R"("dimensions" : ")"; + bindings.dumpBindingDimensions(binding.first, context, os); os << "\"" << std::endl; os << " " << sep << "\"values\" : [ "; bindings.dumpBindingValues(context, binding.second, os, sep, batch); @@ -442,4 +495,115 @@ void exportJSONOutput( os << "]" << std::endl; } +void exportJSONOutput( + nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch); + +void printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context) +{ + if (reporting.layerInfo) + { + sample::gLogInfo << "Layer Information:" << std::endl; + sample::gLogInfo << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kONELINE) + << std::flush; + } + if (!reporting.exportLayerInfo.empty()) + { + std::ofstream os(reporting.exportLayerInfo, std::ofstream::trunc); + os << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kJSON) << std::flush; + } +} + +void printOptimizationProfileInfo(ReportingOptions const& reporting, nvinfer1::ICudaEngine const* engine) +{ + if (reporting.optProfileInfo) + { + sample::gLogInfo << "Optimization Profile Information:" << std::endl; + for (int32_t i = 0; i < engine->getNbOptimizationProfiles(); i++) + { + for (int32_t j = 0, e = engine->getNbIOTensors(); j < e; j++) + { + auto const tensorName = engine->getIOTensorName(j); + + if (engine->getTensorIOMode(tensorName) == 
nvinfer1::TensorIOMode::kINPUT) + { + auto tensorMinShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kMIN); + auto tensorOptShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kOPT); + auto tensorMaxShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kMAX); + + sample::gLogInfo << "Model input " << tensorName << " (profile " << i << "): " + << "min=" << dimsToString(tensorMinShape) + << ", opt=" << dimsToString(tensorOptShape) + << ", max=" << dimsToString(tensorMaxShape) << std::endl; + } + } + } + } +} + +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv) +{ + if (reporting.profile) + { + iEnv.profiler->print(sample::gLogInfo); + } + if (!reporting.exportProfile.empty()) + { + iEnv.profiler->exportJSONProfile(reporting.exportProfile); + } + + // Print a warning about total per-layer latency when auxiliary streams are used. + if (!iEnv.safe && (reporting.profile || !reporting.exportProfile.empty())) + { + int32_t const nbAuxStreams = iEnv.engine.get()->getNbAuxStreams(); + if (nbAuxStreams > 0) + { + sample::gLogWarning << "The engine uses " << nbAuxStreams << " auxiliary streams, so the \"Total\" latency " + << "may not be accurate because some layers may have run in parallel!" << std::endl; + } + } +} + +namespace details +{ +void dump(std::unique_ptr<nvinfer1::IExecutionContext> const& context, std::unique_ptr<Bindings> const& binding, + ReportingOptions const& reporting, int32_t batch) +{ + if (!context) + { + sample::gLogError << "Empty context! Skip printing outputs." << std::endl; + return; + } + if (reporting.output) + { + dumpOutputs(*context, *binding, sample::gLogInfo); + } + if (reporting.dumpRawBindings) + { + dumpRawBindingsToFiles(*context, *binding, sample::gLogInfo); + } + if (!reporting.exportOutput.empty()) + { + exportJSONOutput(*context, *binding, reporting.exportOutput, batch); + } +} +} // namespace details + +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch) +{ + auto const& binding = iEnv.bindings.at(0); + if (!binding) + { + sample::gLogError << "Empty bindings! Skip printing outputs." << std::endl; + return; + } + if (iEnv.safe) + { + sample::gLogError << "Safe inference is not supported!" << std::endl; + return; + } + auto const& context = iEnv.contexts.at(0); + details::dump(context, binding, reporting, batch); +} + } // namespace sample
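The Profiler extended in the header below implements nvinfer1::IProfiler, and TensorRT invokes reportLayerTime() once per layer per enqueue while a profiler is attached. Wiring any such profiler into an execution context follows the standard TensorRT pattern; engine and context creation are omitted here, so this is a sketch rather than code from the patch:

```cpp
#include <iostream>
#include "NvInfer.h"

// Minimal IProfiler: TensorRT calls reportLayerTime() with per-layer timings
// while this profiler is attached to an execution context.
struct StdoutProfiler : nvinfer1::IProfiler
{
    void reportLayerTime(char const* layerName, float ms) noexcept override
    {
        std::cout << layerName << ": " << ms << " ms\n";
    }
};

// Usage, assuming a valid context from an already-built engine:
//   StdoutProfiler profiler;
//   context->setProfiler(&profiler);
//   context->enqueueV3(stream); // timings are reported after synchronization
```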
diff --git a/src/Detector/tensorrt_yolo/common/sampleReporting.h b/src/Detector/tensorrt_yolo/common/sampleReporting.h index 5f730987..922ef3c8 100644 --- a/src/Detector/tensorrt_yolo/common/sampleReporting.h +++ b/src/Detector/tensorrt_yolo/common/sampleReporting.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -19,27 +20,26 @@ #include #include - -#include "NvInfer.h" +#include <NvInfer.h> #include "sampleOptions.h" -#include "sampleUtils.h" namespace sample { +class Bindings; + //! //! \struct InferenceTime //! \brief Measurement times in milliseconds //! struct InferenceTime { - InferenceTime(float q, float i, float c, float o, float e) + InferenceTime(float q, float i, float c, float o) : enq(q) , h2d(i) , compute(c) , d2h(o) - , e2e(e) { } @@ -54,7 +54,6 @@ float h2d{0}; // Host to Device float compute{0}; // Compute float d2h{0}; // Device to Host - float e2e{0}; // end to end // ideal latency float latency() const @@ -102,7 +101,7 @@ struct InferenceTrace inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b) { - return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h, a.e2e + b.e2e); + return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h); } inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b)
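The operator+ and operator+= overloads above make per-iteration timings additive, which is what lets printTiming() in the .cpp accumulate a window of runs and divide once. A toy, self-contained version of that accumulate-then-average pattern, using a stand-in struct rather than the real InferenceTime:

```cpp
#include <iostream>
#include <vector>

// Toy stand-in for InferenceTime: per-iteration timings that add component-wise.
struct T { float enq{0}, h2d{0}, compute{0}, d2h{0}; };
T operator+(T a, T b) { return {a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h}; }

int main()
{
    std::vector<T> runs{{0.1F, 0.2F, 1.5F, 0.2F}, {0.1F, 0.2F, 1.7F, 0.2F}};
    T sum;
    for (auto const& r : runs)
    {
        sum = sum + r; // component-wise accumulation over the window
    }
    // Host latency = h2d + compute + d2h, averaged over the window.
    float const latency = (sum.h2d + sum.compute + sum.d2h) / runs.size();
    std::cout << "avg latency: " << latency << " ms\n";
    return 0;
}
```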
@@ -208,8 +210,58 @@ class Profiler : public nvinfer1::IProfiler private: float getTotalTime() const noexcept { - auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs; }; - return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime); + auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { + return accumulator + std::accumulate(lp.timeMs.begin(), lp.timeMs.end(), 0.F, std::plus()); + }; + return std::accumulate(mLayers.begin(), mLayers.end(), 0.0F, plusLayerTime); + } + + float getMedianTime() const noexcept + { + if (mLayers.empty()) + { + return 0.F; + } + std::vector totalTime; + for (size_t run = 0; run < mLayers[0].timeMs.size(); ++run) + { + auto const layerTime + = [&run](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs[run]; }; + auto t = std::accumulate(mLayers.begin(), mLayers.end(), 0.F, layerTime); + totalTime.push_back(t); + } + return median(totalTime); + } + + float getMedianTime(LayerProfile const& p) const noexcept + { + return median(p.timeMs); + } + + static float median(std::vector vals) + { + if (vals.empty()) + { + return 0.F; + } + std::sort(vals.begin(), vals.end()); + if (vals.size() % 2U == 1U) + { + return vals[vals.size() / 2U]; + } + return (vals[vals.size() / 2U - 1U] + vals[vals.size() / 2U]) * 0.5F; + } + + //! return the total runtime of given layer profile + float getTotalTime(LayerProfile const& p) const noexcept + { + auto const& vals = p.timeMs; + return std::accumulate(vals.begin(), vals.end(), 0.F, std::plus()); + } + + float getAvgTime(LayerProfile const& p) const noexcept + { + return getTotalTime(p) / p.timeMs.size(); } std::vector mLayers; @@ -217,6 +269,30 @@ class Profiler : public nvinfer1::IProfiler int32_t mUpdatesCount{0}; }; +//! +//! \brief Print layer info to logger or export it to output JSON file. +//! +void printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context); + +//! +//! \brief Print optimization profile info to logger. +//! +void printOptimizationProfileInfo(ReportingOptions const& reporting, nvinfer1::ICudaEngine const* engine); + +//! Forward declaration. +struct InferenceEnvironment; + +//! +//! \brief Print per-layer perf profile data to logger or export it to output JSON file. +//! +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv); + +//! +//! \brief Print binding output values to logger or export them to output JSON file. +//! +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch); + } // namespace sample #endif // TRT_SAMPLE_REPORTING_H diff --git a/src/Detector/tensorrt_yolo/common/sampleUtils.cpp b/src/Detector/tensorrt_yolo/common/sampleUtils.cpp new file mode 100644 index 00000000..689e5857 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleUtils.cpp @@ -0,0 +1,587 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sampleUtils.h" +#include "bfloat16.h" +#include "half.h" + +using namespace nvinfer1; + +namespace sample +{ + +size_t dataTypeSize(nvinfer1::DataType dataType) +{ + switch (dataType) + { + case nvinfer1::DataType::kINT64: return 8U; + case nvinfer1::DataType::kINT32: + case nvinfer1::DataType::kFLOAT: return 4U; + case nvinfer1::DataType::kBF16: + case nvinfer1::DataType::kHALF: return 2U; + case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1U; + case nvinfer1::DataType::kINT4: + ASSERT(false && "Element size is not implemented for sub-byte data-types."); + } + return 0; +} + +int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch) +{ + int64_t maxNbElems = 1; + for (int32_t i = 0; i < dims.nbDims; ++i) + { + // Get effective length of axis. + int64_t d = dims.d[i]; + // Any dimension is 0, it is an empty tensor. + if (d == 0) + { + return 0; + } + if (i == vecDim) + { + d = samplesCommon::divUp(d, comps); + } + maxNbElems = std::max(maxNbElems, d * strides.d[i]); + } + return maxNbElems * batch * (vecDim < 0 ? 1 : comps); +} + +nvinfer1::Dims toDims(std::vector const& vec) +{ + int32_t limit = static_cast(nvinfer1::Dims::MAX_DIMS); + if (static_cast(vec.size()) > limit) + { + sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl; + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +void loadFromFile(std::string const& fileName, char* dst, size_t size) +{ + ASSERT(dst); + + std::ifstream file(fileName, std::ios::in | std::ios::binary); + if (file.is_open()) + { + file.seekg(0, std::ios::end); + int64_t fileSize = static_cast(file.tellg()); + // Due to change from int32_t to int64_t VC engines created with earlier versions + // may expect input of the half of the size + if (fileSize != static_cast(size) && fileSize != static_cast(size * 2)) + { + std::ostringstream msg; + msg << "Unexpected file size for input file: " << fileName << ". Note: Input binding size is: " << size + << " bytes but the file size is " << fileSize + << " bytes. Double check the size and datatype of the provided data."; + throw std::invalid_argument(msg.str()); + } + // Move file pointer back to the beginning after reading file size. + file.seekg(0, std::ios::beg); + file.read(dst, size); + size_t const nbBytesRead = file.gcount(); + file.close(); + if (nbBytesRead != size) + { + std::ostringstream msg; + msg << "Unexpected file size for input file: " << fileName << ". 
Note: Expected: " << size + << " bytes but only read: " << nbBytesRead << " bytes"; + throw std::invalid_argument(msg.str()); + } + } + else + { + std::ostringstream msg; + msg << "Cannot open file " << fileName << "!"; + throw std::invalid_argument(msg.str()); + } +} + +std::vector splitToStringVec(std::string const& s, char separator, int64_t maxSplit) +{ + std::vector splitted; + + for (size_t start = 0; start < s.length();) + { + // If maxSplit is specified and we have reached maxSplit, emplace back the rest of the string and break the + // loop. + if (maxSplit >= 0 && static_cast(splitted.size()) == maxSplit) + { + splitted.emplace_back(s.substr(start, s.length() - start)); + break; + } + + size_t separatorIndex = s.find(separator, start); + if (separatorIndex == std::string::npos) + { + separatorIndex = s.length(); + } + splitted.emplace_back(s.substr(start, separatorIndex - start)); + + // If the separator is the last character, then we should push an empty string at the end. + if (separatorIndex == s.length() - 1) + { + splitted.emplace_back(""); + } + + start = separatorIndex + 1; + } + + return splitted; +} + +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput /*= true*/) +{ + bool broadcast = formats.size() == 1; + bool validFormatsCount = broadcast || (formats.size() == nbBindings); + if (!formats.empty() && !validFormatsCount) + { + if (isInput) + { + throw std::invalid_argument( + "The number of inputIOFormats must match network's inputs or be one for broadcasting."); + } + + throw std::invalid_argument( + "The number of outputIOFormats must match network's outputs or be one for broadcasting."); + } + return broadcast; +} + +void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + using TensorToLayer = std::unordered_map; + using LayerToTensor = std::unordered_map; + + // 1. Collect layers and tensors information from the network. + TensorToLayer matmulI2L; + TensorToLayer constO2L; + TensorToLayer shuffleI2L; + LayerToTensor shuffleL2O; + auto collectMappingInfo = [&](int32_t const idx) + { + ILayer* l = network.getLayer(idx); + switch (l->getType()) + { + case nvinfer1::LayerType::kMATRIX_MULTIPLY: + { + // assume weights on the second input. + matmulI2L.insert({l->getInput(1), l}); + break; + } + case nvinfer1::LayerType::kCONSTANT: + { + DataType const dtype = static_cast(l)->getWeights().type; + if (dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF) + { + // Sparsify float only. + constO2L.insert({l->getOutput(0), l}); + } + break; + } + case nvinfer1::LayerType::kSHUFFLE: + { + shuffleI2L.insert({l->getInput(0), l}); + shuffleL2O.insert({l, l->getOutput(0)}); + break; + } + default: break; + } + }; + int32_t const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; ++i) + { + collectMappingInfo(i); + } + if (matmulI2L.size() == 0 || constO2L.size() == 0) + { + // No MatrixMultiply or Constant layer found, no weights to sparsify. 
+ return; + } + + // Helper for analysis + auto isTranspose + = [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); }; + auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; }; + auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool + { + for (int32_t i = 0; i < dims.nbDims; ++i) + { + if (dims.d[i] != i || dims.d[i] != -1) + { + return false; + } + } + return true; + }; + auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) -> ITensor* + { + while (shuffleI2L.find(t) != shuffleI2L.end()) + { + nvinfer1::IShuffleLayer* s = static_cast(shuffleI2L.at(t)); + if (!is2D(s->getInput(0)->getDimensions()) || !is2D(s->getReshapeDimensions()) + || !isIdenticalReshape(s->getReshapeDimensions())) + { + break; + } + + if (isTranspose(s->getFirstTranspose())) + { + needTranspose = !needTranspose; + } + if (isTranspose(s->getSecondTranspose())) + { + needTranspose = !needTranspose; + } + + t = shuffleL2O.at(s); + } + return t; + }; + + // 2. Forward analysis to collect the Constant layers connected to MatMul via Transpose + std::unordered_map constantLayerToSparse; + for (auto& o2l : constO2L) + { + // If need to transpose the weights of the Constant layer. + // Need to transpose by default due to semantic difference. + bool needTranspose{true}; + ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose); + if (matmulI2L.find(t) == matmulI2L.end()) + { + continue; + } + + // check MatMul params... + IMatrixMultiplyLayer* mm = static_cast(matmulI2L.at(t)); + bool const twoInputs = mm->getNbInputs() == 2; + bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions()); + bool const isSimple = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE + && mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR; + if (!(twoInputs && all2D && isSimple)) + { + continue; + } + if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE) + { + needTranspose = !needTranspose; + } + + constantLayerToSparse.insert({static_cast(o2l.second), needTranspose}); + } + + // 3. Finally, sparsify the weights + auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose) + { + Dims dims = layer->getOutput(0)->getDimensions(); + ASSERT(dims.nbDims == 2); + int32_t const idxN = needTranspose ? 1 : 0; + int32_t const n = dims.d[idxN]; + int32_t const k = dims.d[1 - idxN]; + sparseWeights.emplace_back(); + std::vector& spw = sparseWeights.back(); + Weights w = layer->getWeights(); + DataType const dtype = w.type; + ASSERT(dtype == nvinfer1::DataType::kFLOAT + || dtype == nvinfer1::DataType::kHALF); // non-float weights should have been ignored. 
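+        // sparsify() assumes one fixed 2-D layout, so weights that arrive in the
+        // opposite layout are transposed first, pruned, and transposed back below.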
+ + if (needTranspose) + { + if (dtype == nvinfer1::DataType::kFLOAT) + { + spw.resize(w.count * sizeof(float)); + transpose2DWeights(spw.data(), w.values, k, n); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + spw.resize(w.count * sizeof(half_float::half)); + transpose2DWeights(spw.data(), w.values, k, n); + } + + w.values = spw.data(); + std::vector tmpW; + sparsify(w, n, 1, tmpW); + + if (dtype == nvinfer1::DataType::kFLOAT) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + } + else + { + sparsify(w, n, 1, spw); + } + + w.values = spw.data(); + layer->setWeights(w); + }; + for (auto& l : constantLayerToSparse) + { + sparsifyConstantWeights(l.first, l.second); + } +} + +template +void setSparseWeights(L& l, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + auto weights = l.getKernelWeights(); + sparsify(weights, k, trs, sparseWeights); + weights.values = sparseWeights.data(); + l.setKernelWeights(weights); +} + +// Explicit instantiation +template void setSparseWeights( + IConvolutionLayer& l, int32_t k, int32_t trs, std::vector& sparseWeights); + +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + for (int32_t l = 0; l < network.getNbLayers(); ++l) + { + auto* layer = network.getLayer(l); + auto const t = layer->getType(); + if (t == nvinfer1::LayerType::kCONVOLUTION) + { + auto& conv = *static_cast(layer); + auto const& dims = conv.getKernelSizeNd(); + ASSERT(dims.nbDims == 2 || dims.nbDims == 3); + auto const k = conv.getNbOutputMaps(); + auto const trs = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); + sparseWeights.emplace_back(); + setSparseWeights(conv, k, trs, sparseWeights.back()); + } + } + + sparsifyMatMulKernelWeights(network, sparseWeights); + sample::gLogVerbose << "--sparsity=force pruned " << sparseWeights.size() << " weights to be sparsity pattern." << std::endl; + sample::gLogVerbose << "--sparsity=force has been deprecated. 
Please use <polygraphy surgeon prune> to rewrite the weights to a sparsity pattern and then run with --sparsity=enable" << std::endl;
+}
+
+void sparsify(Weights const& weights, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
+{
+    switch (weights.type)
+    {
+    case DataType::kFLOAT:
+        sparsify(static_cast<float const*>(weights.values), weights.count, k, trs, sparseWeights);
+        break;
+    case DataType::kHALF:
+        sparsify(static_cast<half_float::half const*>(weights.values), weights.count, k, trs, sparseWeights);
+        break;
+    case DataType::kBF16:
+        sparsify(static_cast<BFloat16 const*>(weights.values), weights.count, k, trs, sparseWeights);
+        break;
+    case DataType::kINT8:
+    case DataType::kINT32:
+    case DataType::kUINT8:
+    case DataType::kBOOL:
+    case DataType::kINT4:
+    case DataType::kFP8:
+    case DataType::kINT64:
+        ASSERT(false && "Unsupported data type");
+    }
+}
+
+template <typename T>
+void print(std::ostream& os, T v)
+{
+    os << v;
+}
+
+void print(std::ostream& os, int8_t v)
+{
+    os << static_cast<int32_t>(v);
+}
+
+void print(std::ostream& os, __half v)
+{
+    os << static_cast<float>(v);
+}
+
+template <typename T>
+void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv)
+{
+    auto const vol = volume(dims);
+    T const* typedBuffer = static_cast<T const*>(buffer);
+    std::string sep;
+    for (int64_t v = 0; v < vol; ++v)
+    {
+        int64_t curV = v;
+        int32_t dataOffset = 0;
+        for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex)
+        {
+            int32_t dimVal = curV % dims.d[dimIndex];
+            if (dimIndex == vectorDim)
+            {
+                dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv;
+            }
+            else
+            {
+                dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv);
+            }
+            curV /= dims.d[dimIndex];
+            ASSERT(curV >= 0);
+        }
+
+        os << sep;
+        sep = separator;
+        print(os, typedBuffer[dataOffset]);
+    }
+}
+
+// Explicit instantiation
+template void dumpBuffer<bool>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<int32_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<int8_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<float>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<__half>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<BFloat16>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<uint8_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<int64_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+
+template <typename T>
+void sparsify(T const* values, int64_t count, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
+{
+    auto const c = count / (k * trs);
+    sparseWeights.resize(count * sizeof(T));
+    auto* sparseValues = reinterpret_cast<T*>(sparseWeights.data());
+
+    constexpr int32_t window = 4;
+    constexpr int32_t nonzeros = 2;
+
+    int32_t const crs = c * trs;
+    auto
const getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * trs + rsi; }; + + for (int64_t ki = 0; ki < k; ++ki) + { + for (int64_t rsi = 0; rsi < trs; ++rsi) + { + int32_t w = 0; + int32_t nz = 0; + for (int64_t ci = 0; ci < c; ++ci) + { + auto const index = getIndex(ki, ci, rsi); + if (nz < nonzeros) + { + sparseValues[index] = values[index]; + ++nz; + } + else + { + sparseValues[index] = 0; + } + if (++w == window) + { + w = 0; + nz = 0; + } + } + } + } +} + +// Explicit instantiation +template void sparsify( + float const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); +template void sparsify( + half_float::half const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); + +template +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n) +{ + ASSERT(dst != src); + T* tdst = reinterpret_cast(dst); + T const* tsrc = reinterpret_cast(src); + for (int32_t mi = 0; mi < m; ++mi) + { + for (int32_t ni = 0; ni < n; ++ni) + { + int32_t const isrc = mi * n + ni; + int32_t const idst = ni * m + mi; + tdst[idst] = tsrc[isrc]; + } + } +} + +// Explicit instantiation +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); + +template ::value, bool>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_int_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +template ::value, int32_t>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_real_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +// Explicit instantiation +template void fillBuffer(void* buffer, int64_t volume, bool min, bool max); +template void fillBuffer(void* buffer, int64_t volume, float min, float max); +template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); +template void fillBuffer(void* buffer, int64_t volume, int64_t min, int64_t max); +template void fillBuffer(void* buffer, int64_t volume, int8_t min, int8_t max); +template void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max); +template void fillBuffer(void* buffer, int64_t volume, BFloat16 min, BFloat16 max); +template void fillBuffer(void* buffer, int64_t volume, uint8_t min, uint8_t max); + +bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target) +{ + auto const splitPattern = splitToStringVec(pattern, '*', 1); + + // If there is no wildcard, return if the two strings match exactly. + if (splitPattern.size() == 1) + { + return pattern == target; + } + + // Otherwise, target must follow prefix+anything+postfix pattern. 
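+    // e.g. pattern "fc1.*" splits into prefix "fc1." and an empty postfix, so any
+    // target beginning with "fc1." matches; "*bias" likewise matches any target
+    // ending in "bias".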
+ return target.size() >= (splitPattern[0].size() + splitPattern[1].size()) && target.find(splitPattern[0]) == 0 + && target.rfind(splitPattern[1]) == (target.size() - splitPattern[1].size()); +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleUtils.h b/src/Detector/tensorrt_yolo/common/sampleUtils.h index 1509a7fc..6cd4280b 100644 --- a/src/Detector/tensorrt_yolo/common/sampleUtils.h +++ b/src/Detector/tensorrt_yolo/common/sampleUtils.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -22,6 +23,7 @@ #include #include #include +#include #include #include @@ -32,24 +34,20 @@ #include "common.h" #include "logger.h" -#include "sampleDevice.h" -#include "sampleOptions.h" + +#define SMP_RETVAL_IF_FALSE(condition, msg, retval, err) \ + { \ + if ((condition) == false) \ + { \ + (err) << (msg) << std::endl; \ + return retval; \ + } \ + } namespace sample { -inline int dataTypeSize(nvinfer1::DataType dataType) -{ - switch (dataType) - { - case nvinfer1::DataType::kINT32: - case nvinfer1::DataType::kFLOAT: return 4; - case nvinfer1::DataType::kHALF: return 2; - case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: return 1; - } - return 0; -} +size_t dataTypeSize(nvinfer1::DataType dataType); template inline T roundUp(T m, T n) @@ -57,485 +55,71 @@ inline T roundUp(T m, T n) return ((m + n - 1) / n) * n; } -inline int volume(const nvinfer1::Dims& d) -{ - return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); -} - //! comps is the number of components in a vector. Ignored if vecDim < 0. -inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch) -{ - int maxNbElems = 1; - for (int i = 0; i < dims.nbDims; ++i) - { - // Get effective length of axis. - int d = dims.d[i]; - // Any dimension is 0, it is an empty tensor. - if (d == 0) - { - return 0; - } - if (i == vecDim) - { - d = samplesCommon::divUp(d, comps); - } - maxNbElems = std::max(maxNbElems, d * strides.d[i]); - } - return static_cast(maxNbElems) * batch * (vecDim < 0 ? 1 : comps); -} +int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch); -inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch) -{ - if (vecDim != -1) - { - dims.d[vecDim] = roundUp(dims.d[vecDim], comps); - } - return volume(dims) * std::max(batch, 1); -} +using samplesCommon::volume; -inline nvinfer1::Dims toDims(const std::vector& vec) -{ - int limit = static_cast(nvinfer1::Dims::MAX_DIMS); - if (static_cast(vec.size()) > limit) - { - sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." 
<< std::endl; - } - // Pick first nvinfer1::Dims::MAX_DIMS elements - nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; - std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); - return dims; -} +nvinfer1::Dims toDims(std::vector const& vec); -template -inline void fillBuffer(void* buffer, int64_t volume, T min, T max) -{ - T* typedBuffer = static_cast(buffer); - std::default_random_engine engine; - if (std::is_integral::value) - { - std::uniform_int_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); - } - else - { - std::uniform_real_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); - } -} +template ::value, bool>::type = true> +void fillBuffer(void* buffer, int64_t volume, T min, T max); -// Specialization needed for custom type __half -template -inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max) -{ - H* typedBuffer = static_cast(buffer); - std::default_random_engine engine; - std::uniform_real_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); -} -template <> -inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max) -{ - fillBufferHalf(buffer, volume, min, max); -} +template ::value, int32_t>::type = 0> +void fillBuffer(void* buffer, int64_t volume, T min, T max); template -inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const nvinfer1::Dims& dims, - const nvinfer1::Dims& strides, int32_t vectorDim, int32_t spv) -{ - const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); - const T* typedBuffer = static_cast(buffer); - std::string sep; - for (int64_t v = 0; v < volume; ++v) - { - int64_t curV = v; - int32_t dataOffset = 0; - for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) - { - int32_t dimVal = curV % dims.d[dimIndex]; - if (dimIndex == vectorDim) - { - dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; - } - else - { - dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 
1 : spv); - } - curV /= dims.d[dimIndex]; - ASSERT(curV >= 0); - } - - os << sep << typedBuffer[dataOffset]; - sep = separator; - } -} - -inline void loadFromFile(std::string const& fileName, char* dst, size_t size) -{ - ASSERT(dst); - - std::ifstream file(fileName, std::ios::in | std::ios::binary); - if (file.is_open()) - { - file.read(dst, size); - file.close(); - } - else - { - std::stringstream msg; - msg << "Cannot open file " << fileName << "!"; - throw std::invalid_argument(msg.str()); - } -} - -struct Binding -{ - bool isInput{false}; - std::unique_ptr buffer; - int64_t volume{0}; - nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; - - void fill(const std::string& fileName) - { - loadFromFile(fileName, static_cast(buffer->getHostBuffer()), buffer->getSize()); - } - - void fill() - { - switch (dataType) - { - case nvinfer1::DataType::kBOOL: - { - fillBuffer(buffer->getHostBuffer(), volume, 0, 1); - break; - } - case nvinfer1::DataType::kINT32: - { - fillBuffer(buffer->getHostBuffer(), volume, -128, 127); - break; - } - case nvinfer1::DataType::kINT8: - { - fillBuffer(buffer->getHostBuffer(), volume, -128, 127); - break; - } - case nvinfer1::DataType::kFLOAT: - { - fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); - break; - } - case nvinfer1::DataType::kHALF: - { - fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); - break; - } - } - } - - void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv, - const std::string separator = " ") const - { - switch (dataType) - { - case nvinfer1::DataType::kBOOL: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kINT32: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kINT8: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kFLOAT: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kHALF: - { - dumpBuffer<__half>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - } - } -}; - -class Bindings -{ -public: - Bindings() = delete; - explicit Bindings(bool useManaged) - : mUseManaged(useManaged) - { - } - - void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType, - const std::string& fileName = "") - { - while (mBindings.size() <= static_cast(b)) - { - mBindings.emplace_back(); - mDevicePointers.emplace_back(); - } - mNames[name] = b; - if (mBindings[b].buffer == nullptr) - { - if (mUseManaged) - mBindings[b].buffer.reset(new UnifiedMirroredBuffer); - else - mBindings[b].buffer.reset(new DiscreteMirroredBuffer); - } - mBindings[b].isInput = isInput; - // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr - // even for empty tensors, so allocate a dummy byte. 
- if (volume == 0) - mBindings[b].buffer->allocate(1); - else - mBindings[b].buffer->allocate(static_cast(volume) * static_cast(dataTypeSize(dataType))); - - mBindings[b].volume = volume; - mBindings[b].dataType = dataType; - mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer(); - if (isInput) - { - if (fileName.empty()) - fill(b); - else - fill(b, fileName); - } - } - - void** getDeviceBuffers() - { - return mDevicePointers.data(); - } - - void transferInputToDevice(TrtCudaStream& stream) - { - for (auto& b : mNames) - { - if (mBindings[b.second].isInput) - mBindings[b.second].buffer->hostToDevice(stream); - } - } - - void transferOutputToHost(TrtCudaStream& stream) - { - for (auto& b : mNames) - { - if (!mBindings[b.second].isInput) - mBindings[b.second].buffer->deviceToHost(stream); - } - } - - void fill(int binding, const std::string& fileName) - { - mBindings[binding].fill(fileName); - } - - void fill(int binding) - { - mBindings[binding].fill(); - } +void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, nvinfer1::Dims const& dims, + nvinfer1::Dims const& strides, int32_t vectorDim, int32_t spv); - void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - const auto dims = context.getBindingDimensions(binding); - // Do not add a newline terminator, because the caller may be outputting a JSON string. - os << dims; - } - - void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os, - const std::string& separator = " ", int32_t batch = 1) const - { - nvinfer1::Dims dims = context.getBindingDimensions(binding); - nvinfer1::Dims strides = context.getStrides(binding); - int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding); - const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding); +void loadFromFile(std::string const& fileName, char* dst, size_t size); - if (context.getEngine().hasImplicitBatchDimension()) - { - auto insertN = [](nvinfer1::Dims& d, int32_t bs) { - const int32_t nbDims = d.nbDims; - ASSERT(nbDims < nvinfer1::Dims::MAX_DIMS); - std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]); - d.d[0] = bs; - d.nbDims = nbDims + 1; - }; - int32_t batchStride = 0; - for (int32_t i = 0; i < strides.nbDims; ++i) - { - if (strides.d[i] * dims.d[i] > batchStride) - { - batchStride = strides.d[i] * dims.d[i]; - } - } - insertN(dims, batch); - insertN(strides, batchStride); - vectorDim = (vectorDim == -1) ? 
-1 : vectorDim + 1; - } - - mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); - } - - void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto isInput = [](const Binding& b) { return b.isInput; }; - dumpBindings(context, isInput, os); - } +std::vector splitToStringVec(std::string const& option, char separator, int64_t maxSplit = -1); - void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto isOutput = [](const Binding& b) { return !b.isInput; }; - dumpBindings(context, isOutput, os); - } +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput = true); - void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto all = [](const Binding& /*b*/) { return true; }; - dumpBindings(context, all, os); - } +int32_t getCudaDriverVersion(); - void dumpBindings( - const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const - { - for (const auto& n : mNames) - { - const auto binding = n.second; - if (predicate(mBindings[binding])) - { - os << n.first << ": ("; - dumpBindingDimensions(binding, context, os); - os << ")" << std::endl; +int32_t getCudaRuntimeVersion(); - dumpBindingValues(context, binding, os); - os << std::endl; - } - } - } +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); +void sparsify(nvinfer1::Weights const& weights, int32_t k, int32_t rs, std::vector& sparseWeights); - std::unordered_map getInputBindings() const - { - auto isInput = [](const Binding& b) { return b.isInput; }; - return getBindings(isInput); - } - - std::unordered_map getOutputBindings() const - { - auto isOutput = [](const Binding& b) { return !b.isInput; }; - return getBindings(isOutput); - } - - std::unordered_map getBindings() const - { - auto all = [](const Binding& /*b*/) { return true; }; - return getBindings(all); - } +// Walk the weights elements and overwrite (at most) 2 out of 4 elements to 0. +template +void sparsify(T const* values, int64_t count, int32_t k, int32_t rs, std::vector& sparseWeights); - std::unordered_map getBindings(bool (*predicate)(const Binding& b)) const - { - std::unordered_map bindings; - for (const auto& n : mNames) - { - const auto binding = n.second; - if (predicate(mBindings[binding])) - bindings.insert(n); - } - return bindings; - } +template +void setSparseWeights(L& l, int32_t k, int32_t rs, std::vector& sparseWeights); -private: - std::unordered_map mNames; - std::vector mBindings; - std::vector mDevicePointers; - bool mUseManaged{false}; -}; +// Sparsify the weights of Constant layers that are fed to MatMul via Shuffle layers. +// Forward analysis on the API graph to determine which weights to sparsify. +void sparsifyMatMulKernelWeights( + nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); template -struct TrtDestroyer -{ - void operator()(T* t) - { - //t->destroy(); - delete t; - } -}; +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); -template -using TrtUniquePtr = std::unique_ptr>; +//! A helper function to match a target string with a pattern where the pattern can contain up to one wildcard ('*') +//! character that matches to any strings. 
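+//! e.g. pattern "attn_*_weight" matches "attn_layer3_weight" but not "attn_weight",
+//! because the prefix and postfix must both fit without overlapping.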
+bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target); -inline bool broadcastIOFormats(const std::vector& formats, size_t nbBindings, bool isInput = true) -{ - bool broadcast = formats.size() == 1; - bool validFormatsCount = broadcast || (formats.size() == nbBindings); - if (!formats.empty() && !validFormatsCount) - { - if (isInput) - { - throw std::invalid_argument( - "The number of inputIOFormats must match network's inputs or be one for broadcasting."); - } - else - { - throw std::invalid_argument( - "The number of outputIOFormats must match network's outputs or be one for broadcasting."); - } - } - return broadcast; -} - -inline std::vector loadTimingCacheFile(const std::string inFileName) -{ - std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); - if (!iFile) - { - sample::gLogWarning << "Could not read timing cache from: " << inFileName - << ". A new timing cache will be generated and written." << std::endl; - return std::vector(); - } - iFile.seekg(0, std::ifstream::end); - size_t fsize = iFile.tellg(); - iFile.seekg(0, std::ifstream::beg); - std::vector content(fsize); - iFile.read(content.data(), fsize); - iFile.close(); - sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl; - return content; -} - -inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob) +//! A helper method to find an item from an unordered_map. If the exact match exists, this is identical to +//! map.find(target). If the exact match does not exist, it returns the first plausible match, taking up to one wildcard +//! into account. If there is no plausible match, then it returns map.end(). +template +typename std::unordered_map::const_iterator findPlausible( + std::unordered_map const& map, std::string const& target) { - std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); - if (!oFile) + auto res = map.find(target); + if (res == map.end()) { - sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl; - return; + res = std::find_if( + map.begin(), map.end(), [&](typename std::unordered_map::value_type const& item) { + return matchStringWithOneWildcard(item.first, target); + }); } - oFile.write((char*) blob->data(), blob->size()); - oFile.close(); - sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl; -} - -inline int32_t getCudaDriverVersion() -{ - int32_t version{-1}; - cudaCheck(cudaDriverGetVersion(&version)); - return version; -} - -inline int32_t getCudaRuntimeVersion() -{ - int32_t version{-1}; - cudaCheck(cudaRuntimeGetVersion(&version)); - return version; + return res; } } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/streamReader.h b/src/Detector/tensorrt_yolo/common/streamReader.h new file mode 100644 index 00000000..7d4aa1c6 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/streamReader.h @@ -0,0 +1,78 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STREAM_READER_H +#define STREAM_READER_H + +#include "NvInferRuntime.h" +#include "sampleUtils.h" +#include + +namespace samplesCommon +{ + +//! Implements the TensorRT IStreamReader to allow deserializing an engine directly from the plan file. +class FileStreamReader final : public nvinfer1::IStreamReader +{ +public: + bool open(std::string filepath) + { + mFile.open(filepath, std::ios::binary); + return mFile.is_open(); + } + + void close() + { + if (mFile.is_open()) + { + mFile.close(); + } + } + + ~FileStreamReader() final + { + close(); + } + + int64_t read(void* dest, int64_t bytes) final + { + if (!mFile.good()) + { + return -1; + } + mFile.read(static_cast(dest), bytes); + return mFile.gcount(); + } + + void reset() + { + assert(mFile.good()); + mFile.seekg(0); + } + + bool isOpen() const + { + return mFile.is_open(); + } + +private: + std::ifstream mFile; +}; + +} // namespace samplesCommon + +#endif // STREAM_READER_H diff --git a/src/Detector/tensorrt_yolo/common/timingCache.cpp b/src/Detector/tensorrt_yolo/common/timingCache.cpp new file mode 100644 index 00000000..18e85ba4 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/timingCache.cpp @@ -0,0 +1,157 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "timingCache.h" +#include "NvInfer.h" +#include "fileLock.h" +#include "sampleUtils.h" +#include +#include +#include +#include +#include +#include +using namespace nvinfer1; +namespace nvinfer1 +{ +namespace utils +{ +std::vector loadTimingCacheFile(ILogger& logger, std::string const& inFileName) +{ + try + { + std::unique_ptr fileLock{new FileLock(logger, inFileName)}; + std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); + if (!iFile) + { + std::stringstream ss; + ss << "Could not read timing cache from: " << inFileName + << ". 
A new timing cache will be generated and written."; + logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return std::vector(); + } + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + std::stringstream ss; + ss << "Loaded " << fsize << " bytes of timing cache from " << inFileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + return content; + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } + return {}; +} + +std::unique_ptr buildTimingCacheFromFile( + ILogger& logger, IBuilderConfig& config, std::string const& timingCacheFile, std::ostream& err) +{ + std::unique_ptr timingCache{}; + auto timingCacheContents = loadTimingCacheFile(logger, timingCacheFile); + timingCache.reset(config.createTimingCache(timingCacheContents.data(), timingCacheContents.size())); + SMP_RETVAL_IF_FALSE(timingCache != nullptr, "TimingCache creation failed", nullptr, err); + config.clearFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + SMP_RETVAL_IF_FALSE( + config.setTimingCache(*timingCache, true), "IBuilderConfig setTimingCache failed", nullptr, err); + return timingCache; +} + +void saveTimingCacheFile(ILogger& logger, std::string const& outFileName, IHostMemory const* blob) +{ + try + { + std::unique_ptr fileLock{new FileLock(logger, outFileName)}; + std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); + if (!oFile) + { + std::stringstream ss; + ss << "Could not write timing cache to: " << outFileName; + logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return; + } + oFile.write(reinterpret_cast(blob->data()), blob->size()); + oFile.close(); + std::stringstream ss; + ss << "Saved " << blob->size() << " bytes of timing cache to " << outFileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } +} + +void updateTimingCacheFile(nvinfer1::ILogger& logger, std::string const& fileName, + nvinfer1::ITimingCache const* timingCache, nvinfer1::IBuilder& builder) +{ + try + { + // Prepare empty timingCache in case that there is no existing file to read + std::unique_ptr config{builder.createBuilderConfig()}; + std::unique_ptr fileTimingCache{config->createTimingCache(static_cast(nullptr), 0)}; + + std::unique_ptr fileLock{new FileLock(logger, fileName)}; + std::ifstream iFile(fileName, std::ios::in | std::ios::binary); + if (iFile) + { + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + std::stringstream ss; + ss << "Loaded " << fsize << " bytes of timing cache from " << fileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + fileTimingCache.reset(config->createTimingCache(static_cast(content.data()), content.size())); + if (!fileTimingCache) + { + throw std::runtime_error("Failed to create timingCache from " + fileName + "!"); + } + } + fileTimingCache->combine(*timingCache, false); + std::unique_ptr blob{fileTimingCache->serialize()}; + if (!blob) + { + throw std::runtime_error("Failed to serialize ITimingCache!"); + } + std::ofstream oFile(fileName, std::ios::out | std::ios::binary); + if (!oFile) + { + std::stringstream ss; + ss << "Could not write timing cache to: " << fileName; + 
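+            // The freshly combined cache is dropped at this point; callers only
+            // ever see the warning below.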
logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return; + } + oFile.write(reinterpret_cast(blob->data()), blob->size()); + oFile.close(); + std::stringstream ss; + ss << "Saved " << blob->size() << " bytes of timing cache to " << fileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } +} +} // namespace utils +} // namespace nvinfer1 diff --git a/src/Detector/tensorrt_yolo/common/timingCache.h b/src/Detector/tensorrt_yolo/common/timingCache.h new file mode 100644 index 00000000..c4c76e37 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/timingCache.h @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_ +#define TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_ +#include "NvInfer.h" +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace utils +{ +std::vector loadTimingCacheFile(nvinfer1::ILogger& logger, std::string const& inFileName); +std::unique_ptr buildTimingCacheFromFile( + ILogger& logger, IBuilderConfig& config, std::string const& timingCacheFile, std::ostream& err); +void saveTimingCacheFile(nvinfer1::ILogger& logger, std::string const& outFileName, nvinfer1::IHostMemory const* blob); +void updateTimingCacheFile(nvinfer1::ILogger& logger, std::string const& fileName, + nvinfer1::ITimingCache const* timingCache, nvinfer1::IBuilder& builder); +} // namespace utils +} // namespace nvinfer1 + +#endif // TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_ diff --git a/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h b/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h new file mode 100644 index 00000000..9eaac768 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h @@ -0,0 +1,388 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef BATCH_STREAM_H +#define BATCH_STREAM_H + +#include "NvInfer.h" +#include "common.h" +#include +#include +#include + +class IBatchStream +{ +public: + virtual void reset(int firstBatch) = 0; + virtual bool next() = 0; + virtual void skip(int skipCount) = 0; + virtual float* getBatch() = 0; + virtual float* getLabels() = 0; + virtual int getBatchesRead() const = 0; + virtual int getBatchSize() const = 0; + virtual nvinfer1::Dims getDims() const = 0; +}; + +class MNISTBatchStream : public IBatchStream +{ +public: + MNISTBatchStream(int batchSize, int maxBatches, const std::string& dataFile, const std::string& labelsFile, + const std::vector& directories) + : mBatchSize{batchSize} + , mMaxBatches{maxBatches} + , mDims{3, {1, 28, 28}} //!< We already know the dimensions of MNIST images. + { + readDataFile(locateFile(dataFile, directories)); + readLabelsFile(locateFile(labelsFile, directories)); + } + + void reset(int firstBatch) override + { + mBatchCount = firstBatch; + } + + bool next() override + { + if (mBatchCount >= mMaxBatches) + { + return false; + } + ++mBatchCount; + return true; + } + + void skip(int skipCount) override + { + mBatchCount += skipCount; + } + + float* getBatch() override + { + return mData.data() + (mBatchCount * mBatchSize * samplesCommon::volume(mDims)); + } + + float* getLabels() override + { + return mLabels.data() + (mBatchCount * mBatchSize); + } + + int getBatchesRead() const override + { + return mBatchCount; + } + + int getBatchSize() const override + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const override + { + return nvinfer1::Dims{4, {mBatchSize, mDims.d[0], mDims.d[1], mDims.d[2]}}; + } + +private: + void readDataFile(const std::string& dataFilePath) + { + std::ifstream file{dataFilePath.c_str(), std::ios::binary}; + + int magicNumber, numImages, imageH, imageW; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. + magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2051 && "Magic Number does not match the expected value for an MNIST image set"); + + // Read number of images and dimensions + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + file.read(reinterpret_cast(&imageH), sizeof(imageH)); + file.read(reinterpret_cast(&imageW), sizeof(imageW)); + + numImages = samplesCommon::swapEndianness(numImages); + imageH = samplesCommon::swapEndianness(imageH); + imageW = samplesCommon::swapEndianness(imageW); + + // The MNIST data is made up of unsigned bytes, so we need to cast to float and normalize. + int numElements = numImages * imageH * imageW; + std::vector rawData(numElements); + file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); + mData.resize(numElements); + std::transform( + rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); + } + + void readLabelsFile(const std::string& labelsFilePath) + { + std::ifstream file{labelsFilePath.c_str(), std::ios::binary}; + int magicNumber, numImages; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. 
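+        // e.g. the label-file magic 2049 (0x00000801) is stored as bytes 00 00 08 01,
+        // so a little-endian read yields 0x01080000 until swapEndianness() reverses it.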
+ magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2049 && "Magic Number does not match the expected value for an MNIST labels file"); + + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + numImages = samplesCommon::swapEndianness(numImages); + + std::vector rawLabels(numImages); + file.read(reinterpret_cast(rawLabels.data()), numImages * sizeof(uint8_t)); + mLabels.resize(numImages); + std::transform( + rawLabels.begin(), rawLabels.end(), mLabels.begin(), [](uint8_t val) { return static_cast(val); }); + } + + int mBatchSize{0}; + int mBatchCount{0}; //!< The batch that will be read on the next invocation of next() + int mMaxBatches{0}; + nvinfer1::Dims mDims{}; + std::vector mData{}; + std::vector mLabels{}; +}; + +class BatchStream : public IBatchStream +{ +public: + BatchStream( + int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mPrefix(prefix) + , mSuffix(suffix) + , mDataDir(directories) + { + FILE* file = fopen(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), "rb"); + ASSERT(file != nullptr); + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + ASSERT(readSize == 4); + mDims.nbDims = 4; // The number of dimensions. + mDims.d[0] = d[0]; // Batch Size + mDims.d[1] = d[1]; // Channels + mDims.d[2] = d[2]; // Height + mDims.d[3] = d[3]; // Width + ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); + fclose(file); + + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + BatchStream(int batchSize, int maxBatches, std::string prefix, std::vector directories) + : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) + { + } + + BatchStream( + int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mDims(dims) + , mListFile(listFile) + , mDataDir(directories) + { + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + // Resets data members + void reset(int firstBatch) override + { + mBatchCount = 0; + mFileCount = 0; + mFileBatchPos = mDims.d[0]; + skip(firstBatch); + } + + // Advance to next batch and return true, or return false if there is no batch left. + bool next() override + { + if (mBatchCount == mMaxBatches) + { + return false; + } + + for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) + { + ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); + if (mFileBatchPos == mDims.d[0] && !update()) + { + return false; + } + + // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. 
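+            // e.g. with mBatchSize = 10, a file batch of mDims.d[0] = 4, batchPos = 8
+            // and mFileBatchPos = 0, csize = min(10 - 8, 4 - 0) = 2.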
+ csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + std::copy_n( + getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); + std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); + } + mBatchCount++; + return true; + } + + // Skips the batches + void skip(int skipCount) override + { + if (mBatchSize >= mDims.d[0] && mBatchSize % mDims.d[0] == 0 && mFileBatchPos == mDims.d[0]) + { + mFileCount += skipCount * mBatchSize / mDims.d[0]; + return; + } + + int x = mBatchCount; + for (int i = 0; i < skipCount; i++) + { + next(); + } + mBatchCount = x; + } + + float* getBatch() override + { + return mBatch.data(); + } + + float* getLabels() override + { + return mLabels.data(); + } + + int getBatchesRead() const override + { + return mBatchCount; + } + + int getBatchSize() const override + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const override + { + return mDims; + } + +private: + float* getFileBatch() + { + return mFileBatch.data(); + } + + float* getFileLabels() + { + return mFileLabels.data(); + } + + bool update() + { + if (mListFile.empty()) + { + std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); + FILE* file = fopen(inputFileName.c_str(), "rb"); + if (!file) + { + return false; + } + + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + ASSERT(readSize == 4); + ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); + size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); + ASSERT(readInputCount == size_t(mDims.d[0] * mImageSize)); + size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file); + ASSERT(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0])); + + fclose(file); + } + else + { + std::vector fNames; + std::ifstream file(locateFile(mListFile, mDataDir), std::ios::binary); + if (!file) + { + return false; + } + + sample::gLogInfo << "Batch #" << mFileCount << std::endl; + file.seekg(((mBatchCount * mBatchSize)) * 7); + + for (int i = 1; i <= mBatchSize; i++) + { + std::string sName; + std::getline(file, sName); + sName = sName + ".ppm"; + sample::gLogInfo << "Calibrating with file " << sName << std::endl; + fNames.emplace_back(sName); + } + + mFileCount++; + + const int imageC = 3; + const int imageH = 300; + const int imageW = 300; + std::vector> ppms(fNames.size()); + for (uint32_t i = 0; i < fNames.size(); ++i) + { + readPPMFile(locateFile(fNames[i], mDataDir), ppms[i]); + } + + std::vector data(samplesCommon::volume(mDims)); + const float scale = 2.0 / 255.0; + const float bias = 1.0; + long int volChl = mDims.d[2] * mDims.d[3]; + + // Normalize input data + for (int i = 0, volImg = mDims.d[1] * mDims.d[2] * mDims.d[3]; i < mBatchSize; ++i) + { + for (int c = 0; c < mDims.d[1]; ++c) + { + for (int j = 0; j < volChl; ++j) + { + data[i * volImg + c * volChl + j] = scale * float(ppms[i].buffer[j * mDims.d[1] + c]) - bias; + } + } + } + + std::copy_n(data.data(), mDims.d[0] * mImageSize, getFileBatch()); + } + + mFileBatchPos = 0; + return true; + } + + int mBatchSize{0}; + int mMaxBatches{0}; + int mBatchCount{0}; + int mFileCount{0}; + int mFileBatchPos{0}; + int mImageSize{0}; + std::vector mBatch; //!< Data for the batch + std::vector mLabels; //!< Labels for the batch + std::vector mFileBatch; //!< List of image files + std::vector mFileLabels; //!< List of label files + std::string mPrefix; //!< Batch file name 
+    int mBatchSize{0};
+    int mMaxBatches{0};
+    int mBatchCount{0};
+    int mFileCount{0};
+    int mFileBatchPos{0};
+    int mImageSize{0};
+    std::vector<float> mBatch;         //!< Data for the batch
+    std::vector<float> mLabels;        //!< Labels for the batch
+    std::vector<float> mFileBatch;     //!< List of image files
+    std::vector<float> mFileLabels;    //!< List of label files
+    std::string mPrefix;               //!< Batch file name prefix
+    std::string mSuffix;               //!< Batch file name suffix
+    nvinfer1::Dims mDims;              //!< Input dimensions
+    std::string mListFile;             //!< File name of the list of image names
+    std::vector<std::string> mDataDir; //!< Directories where the files can be found
+};
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h b/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h
new file mode 100644
index 00000000..f31789bf
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ENTROPY_CALIBRATOR_H
+#define ENTROPY_CALIBRATOR_H
+
+#include "BatchStream.h"
+#include "NvInfer.h"
+
+//! \class EntropyCalibratorImpl
+//!
+//! \brief Implements common functionality for Entropy calibrators.
+//!
+template <typename TBatchStream>
+class EntropyCalibratorImpl
+{
+public:
+    EntropyCalibratorImpl(
+        TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true)
+        : mStream{stream}
+        , mCalibrationTableName("CalibrationTable" + networkName)
+        , mInputBlobName(inputBlobName)
+        , mReadCache(readCache)
+    {
+        nvinfer1::Dims dims = mStream.getDims();
+        mInputCount = samplesCommon::volume(dims);
+        CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
+        mStream.reset(firstBatch);
+    }
+
+    virtual ~EntropyCalibratorImpl()
+    {
+        CHECK(cudaFree(mDeviceInput));
+    }
+
+    int getBatchSize() const noexcept
+    {
+        return mStream.getBatchSize();
+    }
+
+    bool getBatch(void* bindings[], const char* names[], int /*nbBindings*/) noexcept
+    {
+        if (!mStream.next())
+            return false;
+
+        CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
+        ASSERT(!strcmp(names[0], mInputBlobName));
+        bindings[0] = mDeviceInput;
+        return true;
+    }
+
+    const void* readCalibrationCache(size_t& length) noexcept
+    {
+        mCalibrationCache.clear();
+        std::ifstream input(mCalibrationTableName, std::ios::binary);
+        input >> std::noskipws;
+        if (mReadCache && input.good())
+        {
+            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
+                std::back_inserter(mCalibrationCache));
+        }
+        length = mCalibrationCache.size();
+        return length ? mCalibrationCache.data() : nullptr;
+    }
+
+    void writeCalibrationCache(const void* cache, size_t length) noexcept
+    {
+        std::ofstream output(mCalibrationTableName, std::ios::binary);
+        output.write(reinterpret_cast<const char*>(cache), length);
+    }
+
+private:
+    TBatchStream mStream;
+    size_t mInputCount;
+    std::string mCalibrationTableName;
+    const char* mInputBlobName;
+    bool mReadCache{true};
+    void* mDeviceInput{nullptr};
+    std::vector<char> mCalibrationCache;
+};
+
+//! \class Int8EntropyCalibrator2
+//!
+//! \brief Implements Entropy calibrator 2.
+//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
+//!
+template <typename TBatchStream>
+class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
+{
+public:
+    Int8EntropyCalibrator2(
+        TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true)
+        : mImpl(stream, firstBatch, networkName, inputBlobName, readCache)
+    {
+    }
+
+    int getBatchSize() const noexcept override
+    {
+        return mImpl.getBatchSize();
+    }
+
+    bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override
+    {
+        return mImpl.getBatch(bindings, names, nbBindings);
+    }
+
+    const void* readCalibrationCache(size_t& length) noexcept override
+    {
+        return mImpl.readCalibrationCache(length);
+    }
+
+    void writeCalibrationCache(const void* cache, size_t length) noexcept override
+    {
+        mImpl.writeCalibrationCache(cache, length);
+    }
+
+private:
+    EntropyCalibratorImpl<TBatchStream> mImpl;
+};
+
+#endif // ENTROPY_CALIBRATOR_H
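+
+// A minimal sketch of wiring this calibrator into an INT8 engine build. The BatchStream
+// constructor arguments below are assumptions that depend on the BatchStream variant
+// above; setInt8Calibrator() and BuilderFlag::kINT8 are standard TensorRT API. The
+// calibrator must outlive the build, since TensorRT calls back into getBatch() and the
+// cache hooks while building:
+//
+//     BatchStream stream(/*batchSize=*/8, /*maxBatches=*/100, "batch", ".batch", {"data/"}); // assumed ctor
+//     Int8EntropyCalibrator2<BatchStream> calibrator(stream, /*firstBatch=*/0, "yolo", "images");
+//     config->setFlag(nvinfer1::BuilderFlag::kINT8);
+//     config->setInt8Calibrator(&calibrator);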
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h b/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h
new file mode 100644
index 00000000..40b35fb5
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ERROR_RECORDER_H
+#define ERROR_RECORDER_H
+#include "NvInferRuntimeCommon.h"
+#include "logger.h"
+#include <atomic>
+#include <cstdint>
+#include <exception>
+#include <mutex>
+#include <vector>
+
+using nvinfer1::IErrorRecorder;
+using nvinfer1::ErrorCode;
+
+//!
+//! A simple implementation of the IErrorRecorder interface for
+//! use by samples. This interface also can be used as a reference
+//! implementation.
+//! The sample Error recorder is based on a vector that pairs the error
+//! code and the error string into a single element. It also uses
+//! standard mutexes and atomics in order to make sure that the code
+//! works in a multi-threaded environment.
+//!
+class SampleErrorRecorder : public IErrorRecorder
+{
+    using errorPair = std::pair<ErrorCode, std::string>;
+    using errorStack = std::vector<errorPair>;
+
+public:
+    SampleErrorRecorder() = default;
+
+    virtual ~SampleErrorRecorder() noexcept {}
+    int32_t getNbErrors() const noexcept final
+    {
+        return mErrorStack.size();
+    }
+    ErrorCode getErrorCode(int32_t errorIdx) const noexcept final
+    {
+        return invalidIndexCheck(errorIdx) ? ErrorCode::kINVALID_ARGUMENT : (*this)[errorIdx].first;
+    };
+    IErrorRecorder::ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept final
+    {
+        return invalidIndexCheck(errorIdx) ? "errorIdx out of range." : (*this)[errorIdx].second.c_str();
+    }
+    // This class can never overflow since we have dynamic resize via std::vector usage.
+    bool hasOverflowed() const noexcept final
+    {
+        return false;
+    }
+
+    // Empty the errorStack.
+    void clear() noexcept final
+    {
+        try
+        {
+            // grab a lock so that there is no addition while clearing.
+            std::lock_guard<std::mutex> guard(mStackLock);
+            mErrorStack.clear();
+        }
+        catch (const std::exception& e)
+        {
+            sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
+        }
+    };
+
+    //! Simple helper function that returns true if the error stack is empty.
+    bool empty() const noexcept
+    {
+        return mErrorStack.empty();
+    }
+
+    bool reportError(ErrorCode val, IErrorRecorder::ErrorDesc desc) noexcept final
+    {
+        try
+        {
+            std::lock_guard<std::mutex> guard(mStackLock);
+            sample::gLogError << "Error[" << static_cast<int32_t>(val) << "]: " << desc << std::endl;
+            mErrorStack.push_back(errorPair(val, desc));
+        }
+        catch (const std::exception& e)
+        {
+            sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
+        }
+        // All errors are considered fatal.
+        return true;
+    }
+
+    // Atomically increment or decrement the ref counter.
+    IErrorRecorder::RefCount incRefCount() noexcept final
+    {
+        return ++mRefCount;
+    }
+    IErrorRecorder::RefCount decRefCount() noexcept final
+    {
+        return --mRefCount;
+    }
+
+private:
+    // Simple helper functions.
+    const errorPair& operator[](size_t index) const noexcept
+    {
+        return mErrorStack[index];
+    }
+
+    bool invalidIndexCheck(int32_t index) const noexcept
+    {
+        // By converting signed to unsigned, we only need a single check since
+        // negative numbers turn into large positive greater than the size.
+        size_t sIndex = index;
+        return sIndex >= mErrorStack.size();
+    }
+    // Mutex to hold when locking mErrorStack.
+    std::mutex mStackLock;
+
+    // Reference count of the class. Destruction of the class when mRefCount
+    // is not zero causes undefined behavior.
+    std::atomic<int32_t> mRefCount{0};
+
+    // The error stack that holds the errors recorded by TensorRT.
+    errorStack mErrorStack;
+}; // class SampleErrorRecorder
+#endif // ERROR_RECORDER_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/buffers.h b/src/Detector/tensorrt_yolo/common_deprecated/buffers.h
new file mode 100644
index 00000000..ef673b2b
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/buffers.h
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef TENSORRT_BUFFERS_H
+#define TENSORRT_BUFFERS_H
+
+#include "NvInfer.h"
+#include "common.h"
+#include "half.h"
+#include <cassert>
+#include <cuda_runtime_api.h>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <string>
+#include <vector>
+
+namespace samplesCommon
+{
+
+//!
+//! \brief The GenericBuffer class is a templated class for buffers.
+//!
+//! \details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation,
+//! deallocation, querying of buffers on both the device and the host.
+//! It can handle data of arbitrary types because it stores byte buffers.
+//! The template parameters AllocFunc and FreeFunc are used for the
+//! allocation and deallocation of the buffer.
+//! AllocFunc must be a functor that takes in (void** ptr, size_t size)
+//! and returns bool. ptr is a pointer to where the allocated buffer address should be stored.
+//!
size is the amount of memory in bytes to allocate. +//! The boolean indicates whether or not the memory allocation was successful. +//! FreeFunc must be a functor that takes in (void* ptr) and returns void. +//! ptr is the allocated buffer address. It must work with nullptr input. +//! +template +class GenericBuffer +{ +public: + //! + //! \brief Construct an empty buffer. + //! + GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT) + : mSize(0) + , mCapacity(0) + , mType(type) + , mBuffer(nullptr) + { + } + + //! + //! \brief Construct a buffer with the specified allocation size in bytes. + //! + GenericBuffer(size_t size, nvinfer1::DataType type) + : mSize(size) + , mCapacity(size) + , mType(type) + { + if (!allocFn(&mBuffer, this->nbBytes())) + { + throw std::bad_alloc(); + } + } + + GenericBuffer(GenericBuffer&& buf) + : mSize(buf.mSize) + , mCapacity(buf.mCapacity) + , mType(buf.mType) + , mBuffer(buf.mBuffer) + { + buf.mSize = 0; + buf.mCapacity = 0; + buf.mType = nvinfer1::DataType::kFLOAT; + buf.mBuffer = nullptr; + } + + GenericBuffer& operator=(GenericBuffer&& buf) + { + if (this != &buf) + { + freeFn(mBuffer); + mSize = buf.mSize; + mCapacity = buf.mCapacity; + mType = buf.mType; + mBuffer = buf.mBuffer; + // Reset buf. + buf.mSize = 0; + buf.mCapacity = 0; + buf.mBuffer = nullptr; + } + return *this; + } + + //! + //! \brief Returns pointer to underlying array. + //! + void* data() + { + return mBuffer; + } + + //! + //! \brief Returns pointer to underlying array. + //! + const void* data() const + { + return mBuffer; + } + + //! + //! \brief Returns the size (in number of elements) of the buffer. + //! + size_t size() const + { + return mSize; + } + + //! + //! \brief Returns the size (in bytes) of the buffer. + //! + size_t nbBytes() const + { + return this->size() * samplesCommon::getElementSize(mType); + } + + //! + //! \brief Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity. + //! + void resize(size_t newSize) + { + mSize = newSize; + if (mCapacity < newSize) + { + freeFn(mBuffer); + if (!allocFn(&mBuffer, this->nbBytes())) + { + throw std::bad_alloc{}; + } + mCapacity = newSize; + } + } + + //! + //! \brief Overload of resize that accepts Dims + //! + void resize(const nvinfer1::Dims& dims) + { + return this->resize(samplesCommon::volume(dims)); + } + + ~GenericBuffer() + { + freeFn(mBuffer); + } + +private: + size_t mSize{0}, mCapacity{0}; + nvinfer1::DataType mType; + void* mBuffer; + AllocFunc allocFn; + FreeFunc freeFn; +}; + +class DeviceAllocator +{ +public: + bool operator()(void** ptr, size_t size) const + { + return cudaMalloc(ptr, size) == cudaSuccess; + } +}; + +class DeviceFree +{ +public: + void operator()(void* ptr) const + { + cudaFree(ptr); + } +}; + +class HostAllocator +{ +public: + bool operator()(void** ptr, size_t size) const + { + *ptr = malloc(size); + return *ptr != nullptr; + } +}; + +class HostFree +{ +public: + void operator()(void* ptr) const + { + free(ptr); + } +}; + +using DeviceBuffer = GenericBuffer; +using HostBuffer = GenericBuffer; + +//! +//! \brief The ManagedBuffer class groups together a pair of corresponding device and host buffers. +//! +class ManagedBuffer +{ +public: + DeviceBuffer deviceBuffer; + HostBuffer hostBuffer; +}; + +//! +//! \brief The BufferManager class handles host and device buffer allocation and deallocation. +//! +//! \details This RAII class handles host and device buffer allocation and deallocation, +//! 
memcpy between host and device buffers to aid with inference, +//! and debugging dumps to validate inference. The BufferManager class is meant to be +//! used to simplify buffer management and any interactions between buffers and the engine. +//! +class BufferManager +{ +public: + static const size_t kINVALID_SIZE_VALUE = ~size_t(0); + + //! + //! \brief Create a BufferManager for handling buffer interactions with engine. + //! + BufferManager(std::shared_ptr engine, const int batchSize, + const nvinfer1::IExecutionContext* context = nullptr) + : mEngine(engine) + , mBatchSize(batchSize) + { + // Full Dims implies no batch size. + auto impbs = engine->hasImplicitBatchDimension(); + std::cout << "hasImplicitBatchDimension: " << impbs << ", mBatchSize = " << mBatchSize << std::endl; + assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); + // Create host and device buffers + for (int i = 0; i < mEngine->getNbBindings(); i++) + { + auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); + size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); + nvinfer1::DataType type = mEngine->getBindingDataType(i); + int vecDim = mEngine->getBindingVectorizedDim(i); + if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector + { + int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); + vol *= scalarsPerVec; + } + vol *= samplesCommon::volume(dims); + std::unique_ptr manBuf{new ManagedBuffer()}; + manBuf->deviceBuffer = DeviceBuffer(vol, type); + manBuf->hostBuffer = HostBuffer(vol, type); + mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); + mManagedBuffers.emplace_back(std::move(manBuf)); + } + } + + //! + //! \brief Returns a vector of device buffers that you can use directly as + //! bindings for the execute and enqueue methods of IExecutionContext. + //! + std::vector& getDeviceBindings() + { + return mDeviceBindings; + } + + //! + //! \brief Returns a vector of device buffers. + //! + const std::vector& getDeviceBindings() const + { + return mDeviceBindings; + } + + //! + //! \brief Returns the device buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getDeviceBuffer(const std::string& tensorName) const + { + return getBuffer(false, tensorName); + } + + //! + //! \brief Returns the host buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getHostBuffer(const std::string& tensorName) const + { + return getBuffer(true, tensorName); + } + + //! + //! \brief Returns the host buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getHostBuffer(int bindingIndex) const + { + return getBuffer(true, bindingIndex); + } + + //! + //! \brief Returns the size of the host and device buffers that correspond to tensorName. + //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. + //! + size_t size(const std::string& tensorName) const + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return kINVALID_SIZE_VALUE; + return mManagedBuffers[index]->hostBuffer.nbBytes(); + } + + //! + //! \brief Dump host buffer with specified tensorName to ostream. + //! Prints error message to std::ostream if no such tensor can be found. + //! 
+ void dumpBuffer(std::ostream& os, const std::string& tensorName) + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + { + os << "Invalid tensor name" << std::endl; + return; + } + void* buf = mManagedBuffers[index]->hostBuffer.data(); + size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); + nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); + size_t rowCount = static_cast(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); + int leadDim = mBatchSize; + int* trailDims = bufDims.d; + int nbDims = bufDims.nbDims; + + // Fix explicit Dimension networks + if (!leadDim && nbDims > 0) + { + leadDim = bufDims.d[0]; + ++trailDims; + --nbDims; + } + + os << "[" << leadDim; + for (int i = 0; i < nbDims; i++) + os << ", " << trailDims[i]; + os << "]" << std::endl; + switch (mEngine->getBindingDataType(index)) + { + case nvinfer1::DataType::kINT32: print(os, buf, bufSize, rowCount); break; + case nvinfer1::DataType::kFLOAT: print(os, buf, bufSize, rowCount); break; + case nvinfer1::DataType::kHALF: print(os, buf, bufSize, rowCount); break; + case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break; + case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break; + } + } + + //! + //! \brief Templated print function that dumps buffers of arbitrary type to std::ostream. + //! rowCount parameter controls how many elements are on each line. + //! A rowCount of 1 means that there is only 1 element on each line. + //! + template + void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount) + { + assert(rowCount != 0); + assert(bufSize % sizeof(T) == 0); + T* typedBuf = static_cast(buf); + size_t numItems = bufSize / sizeof(T); + for (int i = 0; i < static_cast(numItems); i++) + { + // Handle rowCount == 1 case + if (rowCount == 1 && i != static_cast(numItems) - 1) + os << typedBuf[i] << std::endl; + else if (rowCount == 1) + os << typedBuf[i]; + // Handle rowCount > 1 case + else if (i % rowCount == 0) + os << typedBuf[i]; + else if (i % rowCount == rowCount - 1) + os << " " << typedBuf[i] << std::endl; + else + os << " " << typedBuf[i]; + } + } + + //! + //! \brief Copy the contents of input host buffers to input device buffers synchronously. + //! + void copyInputToDevice() + { + memcpyBuffers(true, false, false, 0); + } + + //! + //! \brief Copy the contents of output device buffers to output host buffers synchronously. + //! + void copyOutputToHost() + { + memcpyBuffers(false, true, false, 0); + } + + //! + //! \brief Copy the contents of input host buffers to input device buffers asynchronously. + //! + void copyInputToDeviceAsync(const cudaStream_t& stream) + { + memcpyBuffers(true, false, true, stream); + } + + //! + //! \brief Copy the contents of output device buffers to output host buffers asynchronously. + //! + void copyOutputToHostAsync(const cudaStream_t& stream) + { + memcpyBuffers(false, true, true, stream); + } + + ~BufferManager() = default; + +private: + void* getBuffer(const bool isHost, const std::string& tensorName) const + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return nullptr; + return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data()); + } + + void* getBuffer(const bool isHost, int bindingIndex) const + { + if (bindingIndex == -1) + return nullptr; + return (isHost ? 
            mManagedBuffers[bindingIndex]->hostBuffer.data() : mManagedBuffers[bindingIndex]->deviceBuffer.data());
+    }
+
+    void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream)
+    {
+        for (int i = 0; i < mEngine->getNbBindings(); i++)
+        {
+            void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data();
+            const void* srcPtr = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data();
+            const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes();
+            const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;
+            if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i)))
+            {
+                if (async)
+                    CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream));
+                else
+                    CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType));
+            }
+        }
+    }
+
+    std::shared_ptr<nvinfer1::ICudaEngine> mEngine;              //!< The pointer to the engine
+    int mBatchSize = 0;                                          //!< The batch size for legacy networks, 0 otherwise.
+    std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers; //!< The vector of pointers to managed buffers
+    std::vector<void*> mDeviceBindings;                          //!< The vector of device buffers needed for engine execution
+};
+
+} // namespace samplesCommon
+
+#endif // TENSORRT_BUFFERS_H
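+
+// A minimal usage sketch for BufferManager with the bindings-based enqueueV2() path this
+// deprecated header targets; "engine", "context", "stream" and the tensor names
+// "images"/"output" are assumed to exist in the caller:
+//
+//     samplesCommon::BufferManager buffers(engine, /*batchSize=*/0, context.get());
+//     std::memcpy(buffers.getHostBuffer("images"), input, inputBytes);
+//     buffers.copyInputToDeviceAsync(stream);
+//     context->enqueueV2(buffers.getDeviceBindings().data(), stream, nullptr);
+//     buffers.copyOutputToHostAsync(stream);
+//     CHECK(cudaStreamSynchronize(stream));
+//     const float* out = static_cast<const float*>(buffers.getHostBuffer("output"));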
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/common.h b/src/Detector/tensorrt_yolo/common_deprecated/common.h
new file mode 100644
index 00000000..2270a2cd
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/common.h
@@ -0,0 +1,963 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_COMMON_H
+#define TENSORRT_COMMON_H
+
+// For loadLibrary
+#ifdef _MSC_VER
+// Needed so that the max/min definitions in windows.h do not conflict with std::max/min.
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#undef NOMINMAX
+#else
+#include <dlfcn.h>
+#endif
+
+#include "NvInfer.h"
+#include "NvInferPlugin.h"
+#include "logger.h"
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <cuda_runtime_api.h>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <ratio>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "safeCommon.h"
+
+#ifdef _MSC_VER
+#define FN_NAME __FUNCTION__
+#else
+#define FN_NAME __func__
+#endif
+
+#if defined(__aarch64__) || defined(__QNX__)
+#define ENABLE_DLA_API 1
+#endif
+
+#define CHECK_RETURN_W_MSG(status, val, errMsg)                                                                    \
+    do                                                                                                             \
+    {                                                                                                              \
+        if (!(status))                                                                                             \
+        {                                                                                                          \
+            sample::gLogError << errMsg << " Error in " << __FILE__ << ", function " << FN_NAME << "(), line " << __LINE__ \
+                              << std::endl;                                                                        \
+            return val;                                                                                            \
+        }                                                                                                          \
+    } while (0)
+
+#undef ASSERT
+#define ASSERT(condition)                                                          \
+    do                                                                             \
+    {                                                                              \
+        if (!(condition))                                                          \
+        {                                                                          \
+            sample::gLogError << "Assertion failure: " << #condition << std::endl; \
+            abort();                                                               \
+        }                                                                          \
+    } while (0)
+
+
+#define CHECK_RETURN(status, val) CHECK_RETURN_W_MSG(status, val, "")
+
+#define OBJ_GUARD(A) std::unique_ptr<A, void (*)(A* t)>
+
+template <typename T, typename T_>
+OBJ_GUARD(T)
+makeObjGuard(T_* t)
+{
+    CHECK(!(std::is_base_of<T, T_>::value || std::is_same<T, T_>::value));
+    auto deleter = [](T* t) { t->destroy(); };
+    return std::unique_ptr<T, void (*)(T*)>{static_cast<T*>(t), deleter};
+}
+
+constexpr long double operator"" _GiB(long double val)
+{
+    return val * (1 << 30);
+}
+constexpr long double operator"" _MiB(long double val)
+{
+    return val * (1 << 20);
+}
+constexpr long double operator"" _KiB(long double val)
+{
+    return val * (1 << 10);
+}
+
+// This is necessary if we want to be able to write 1_GiB instead of 1.0_GiB.
+// Since the return type is signed, -1_GiB will work as expected.
+constexpr long long int operator"" _GiB(unsigned long long val)
+{
+    return val * (1 << 30);
+}
+constexpr long long int operator"" _MiB(unsigned long long val)
+{
+    return val * (1 << 20);
+}
+constexpr long long int operator"" _KiB(unsigned long long val)
+{
+    return val * (1 << 10);
+}
+
+struct SimpleProfiler : public nvinfer1::IProfiler
+{
+    struct Record
+    {
+        float time{0};
+        int count{0};
+    };
+
+    virtual void reportLayerTime(const char* layerName, float ms) noexcept
+    {
+        mProfile[layerName].count++;
+        mProfile[layerName].time += ms;
+        if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end())
+        {
+            mLayerNames.push_back(layerName);
+        }
+    }
+
+    SimpleProfiler(const char* name, const std::vector<SimpleProfiler>& srcProfilers = std::vector<SimpleProfiler>())
+        : mName(name)
+    {
+        for (const auto& srcProfiler : srcProfilers)
+        {
+            for (const auto& rec : srcProfiler.mProfile)
+            {
+                auto it = mProfile.find(rec.first);
+                if (it == mProfile.end())
+                {
+                    mProfile.insert(rec);
+                }
+                else
+                {
+                    it->second.time += rec.second.time;
+                    it->second.count += rec.second.count;
+                }
+            }
+        }
+    }
+
+    friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value)
+    {
+        out << "========== " << value.mName << " profile ==========" << std::endl;
+        float totalTime = 0;
+        std::string layerNameStr = "TensorRT layer name";
+        int maxLayerNameLength = std::max(static_cast<int>(layerNameStr.size()), 70);
+        for (const auto& elem : value.mProfile)
+        {
+            totalTime += elem.second.time;
+            maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
+        }
+
+        auto old_settings = out.flags();
+        auto old_precision = out.precision();
+        // Output header
+        {
+            out << std::setw(maxLayerNameLength) << layerNameStr << " ";
+            out << std::setw(12) << "Runtime, "
+                << "%"
+                << " ";
out << std::setw(12) << "Invocations" + << " "; + out << std::setw(12) << "Runtime, ms" << std::endl; + } + for (size_t i = 0; i < value.mLayerNames.size(); i++) + { + const std::string layerName = value.mLayerNames[i]; + auto elem = value.mProfile.at(layerName); + out << std::setw(maxLayerNameLength) << layerName << " "; + out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << "%" + << " "; + out << std::setw(12) << elem.count << " "; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << std::endl; + } + out.flags(old_settings); + out.precision(old_precision); + out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl; + + return out; + } + +private: + std::string mName; + std::vector mLayerNames; + std::map mProfile; +}; + +//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. +//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. +inline std::string locateFile( + const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) +{ + const int MAX_DEPTH{10}; + bool found{false}; + std::string filepath; + + for (auto& dir : directories) + { + if (!dir.empty() && dir.back() != '/') + { +#ifdef _MSC_VER + filepath = dir + "\\" + filepathSuffix; +#else + filepath = dir + "/" + filepathSuffix; +#endif + } + else + { + filepath = dir + filepathSuffix; + } + + for (int i = 0; i < MAX_DEPTH && !found; i++) + { + const std::ifstream checkFile(filepath); + found = checkFile.is_open(); + if (found) + { + break; + } + + filepath = "../" + filepath; // Try again in parent dir + } + + if (found) + { + break; + } + + filepath.clear(); + } + + // Could not find the file + if (filepath.empty()) + { + const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), + [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); + std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; + + if (reportError) + { + std::cout << "&&&& FAILED" << std::endl; + exit(EXIT_FAILURE); + } + } + + return filepath; +} + +inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW) +{ + std::ifstream infile(fileName, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open."); + std::string magic, h, w, max; + infile >> magic >> h >> w >> max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(buffer), inH * inW); +} + +namespace samplesCommon +{ + +// Swaps endianness of an integral type. 
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+inline T swapEndianness(const T& value)
+{
+    uint8_t bytes[sizeof(T)];
+    for (int i = 0; i < static_cast<int>(sizeof(T)); ++i)
+    {
+        bytes[sizeof(T) - 1 - i] = *(reinterpret_cast<const uint8_t*>(&value) + i);
+    }
+    return *reinterpret_cast<T*>(bytes);
+}
+
+class HostMemory
+{
+public:
+    HostMemory() = delete;
+    virtual void* data() const noexcept
+    {
+        return mData;
+    }
+    virtual std::size_t size() const noexcept
+    {
+        return mSize;
+    }
+    virtual nvinfer1::DataType type() const noexcept
+    {
+        return mType;
+    }
+    virtual ~HostMemory() {}
+
+protected:
+    HostMemory(std::size_t size, nvinfer1::DataType type)
+        : mData{nullptr}
+        , mSize(size)
+        , mType(type)
+    {
+    }
+    void* mData;
+    std::size_t mSize;
+    nvinfer1::DataType mType;
+};
+
+template <typename ElemType, nvinfer1::DataType dataType>
+class TypedHostMemory : public HostMemory
+{
+public:
+    explicit TypedHostMemory(std::size_t size)
+        : HostMemory(size, dataType)
+    {
+        mData = new ElemType[size];
+    };
+    ~TypedHostMemory() noexcept
+    {
+        delete[](ElemType*) mData;
+    }
+    ElemType* raw() noexcept
+    {
+        return static_cast<ElemType*>(data());
+    }
+};
+
+using FloatMemory = TypedHostMemory<float, nvinfer1::DataType::kFLOAT>;
+using HalfMemory = TypedHostMemory<uint16_t, nvinfer1::DataType::kHALF>;
+using ByteMemory = TypedHostMemory<uint8_t, nvinfer1::DataType::kINT8>;
+
+inline void* safeCudaMalloc(size_t memSize)
+{
+    void* deviceMem;
+    CHECK(cudaMalloc(&deviceMem, memSize));
+    if (deviceMem == nullptr)
+    {
+        std::cerr << "Out of memory" << std::endl;
+        exit(1);
+    }
+    return deviceMem;
+}
+
+inline bool isDebug()
+{
+    return (std::getenv("TENSORRT_DEBUG") ? true : false);
+}
+
+struct InferDeleter
+{
+    template <typename T>
+    void operator()(T* obj) const
+    {
+#if (NV_TENSORRT_MAJOR < 8)
+        obj->destroy();
+#else
+        delete obj;
+#endif
+    }
+};
+
+template <typename T>
+using SampleUniquePtr = std::unique_ptr<T, InferDeleter>;
+
+static auto StreamDeleter = [](cudaStream_t* pStream)
+    {
+        if (pStream)
+        {
+            cudaStreamDestroy(*pStream);
+            delete pStream;
+        }
+    };
+
+inline std::unique_ptr<cudaStream_t, decltype(StreamDeleter)> makeCudaStream()
+{
+    std::unique_ptr<cudaStream_t, decltype(StreamDeleter)> pStream(new cudaStream_t, StreamDeleter);
+    if (cudaStreamCreateWithFlags(pStream.get(), cudaStreamNonBlocking) != cudaSuccess)
+    {
+        pStream.reset(nullptr);
+    }
+
+    return pStream;
+}
+
+//! Return vector of indices that puts magnitudes of sequence in descending order.
+template <class Iter>
+std::vector<size_t> argMagnitudeSort(Iter begin, Iter end)
+{
+    std::vector<size_t> indices(end - begin);
+    std::iota(indices.begin(), indices.end(), 0);
+    std::sort(indices.begin(), indices.end(), [&begin](size_t i, size_t j) { return std::abs(begin[j]) < std::abs(begin[i]); });
+    return indices;
+}
+
+inline bool readReferenceFile(const std::string& fileName, std::vector<std::string>& refVector)
+{
+    std::ifstream infile(fileName);
+    if (!infile.is_open())
+    {
+        std::cout << "ERROR: readReferenceFile: Attempting to read from a file that is not open." << std::endl;
+        return false;
+    }
+    std::string line;
+    while (std::getline(infile, line))
+    {
+        if (line.empty())
+            continue;
+        refVector.push_back(line);
+    }
+    infile.close();
+    return true;
+}
+
+template <typename T>
+std::vector<std::string> classify(
+    const std::vector<std::string>& refVector, const std::vector<T>& output, const size_t topK)
+{
+    const auto inds = samplesCommon::argMagnitudeSort(output.cbegin(), output.cend());
+    std::vector<std::string> result;
+    result.reserve(topK);
+    for (size_t k = 0; k < topK; ++k)
+    {
+        result.push_back(refVector[inds[k]]);
+    }
+    return result;
+}
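+
+// A short sketch of how readReferenceFile() and classify() combine for top-K labelling;
+// "labels.txt" and the probability vector are assumed inputs:
+//
+//     std::vector<std::string> labels;
+//     readReferenceFile("labels.txt", labels);
+//     std::vector<float> prob(numClasses);                  // filled from the network output
+//     auto top5 = samplesCommon::classify(labels, prob, 5); // argMagnitudeSort() under the hood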
+
+// Returns indices of highest K magnitudes in v.
+template <typename T>
+std::vector<size_t> topKMagnitudes(const std::vector<T>& v, const size_t k)
+{
+    std::vector<size_t> indices = samplesCommon::argMagnitudeSort(v.cbegin(), v.cend());
+    indices.resize(k);
+    return indices;
+}
+
+template <typename T>
+bool readASCIIFile(const std::string& fileName, const size_t size, std::vector<T>& out)
+{
+    std::ifstream infile(fileName);
+    if (!infile.is_open())
+    {
+        std::cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << std::endl;
+        return false;
+    }
+    out.clear();
+    out.reserve(size);
+    out.assign(std::istream_iterator<T>(infile), std::istream_iterator<T>());
+    infile.close();
+    return true;
+}
+
+template <typename T>
+bool writeASCIIFile(const std::string& fileName, const std::vector<T>& in)
+{
+    std::ofstream outfile(fileName);
+    if (!outfile.is_open())
+    {
+        std::cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << std::endl;
+        return false;
+    }
+    for (auto fn : in)
+    {
+        outfile << fn << "\n";
+    }
+    outfile.close();
+    return true;
+}
+
+inline void print_version()
+{
+    std::cout << "  TensorRT version: " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH
+              << "." << NV_TENSORRT_BUILD << std::endl;
+}
+
+inline std::string getFileType(const std::string& filepath)
+{
+    return filepath.substr(filepath.find_last_of(".") + 1);
+}
+
+inline std::string toLower(const std::string& inp)
+{
+    std::string out = inp;
+    std::transform(out.begin(), out.end(), out.begin(), ::tolower);
+    return out;
+}
+
+inline float getMaxValue(const float* buffer, int64_t size)
+{
+    assert(buffer != nullptr);
+    assert(size > 0);
+    return *std::max_element(buffer, buffer + size);
+}
+
+// Ensures that every tensor used by a network has a dynamic range set.
+//
+// All tensors in a network must have a dynamic range specified if a calibrator is not used.
+// This function is just a utility to globally fill in missing scales and zero-points for the entire network.
+//
+// If a tensor does not have a dynamic range set, it is assigned inRange or outRange as follows:
+//
+// * If the tensor is the input to a layer or output of a pooling node, its dynamic range is derived from inRange.
+// * Otherwise its dynamic range is derived from outRange.
+//
+// The default parameter values are intended to demonstrate, for final layers in the network,
+// cases where dynamic ranges are asymmetric.
+//
+// The default parameter values are chosen arbitrarily. Range values should be chosen such that
+// we avoid underflow or overflow. Also, range values should be non-zero to avoid a uniform zero scale tensor.
+inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0f, float outRange = 4.0f)
+{
+    // Ensure that all layer inputs have a scale.
+    for (int i = 0; i < network->getNbLayers(); i++)
+    {
+        auto layer = network->getLayer(i);
+        for (int j = 0; j < layer->getNbInputs(); j++)
+        {
+            nvinfer1::ITensor* input{layer->getInput(j)};
+            // Optional inputs are nullptr here and are from RNN layers.
+            if (input != nullptr && !input->dynamicRangeIsSet())
+            {
+                ASSERT(input->setDynamicRange(-inRange, inRange));
+            }
+        }
+    }
+
+    // Ensure that all layer outputs have a scale.
+    // Tensors that are also inputs to layers are ignored here
+    // since the previous loop nest assigned scales to them.
+    for (int i = 0; i < network->getNbLayers(); i++)
+    {
+        auto layer = network->getLayer(i);
+        for (int j = 0; j < layer->getNbOutputs(); j++)
+        {
+            nvinfer1::ITensor* output{layer->getOutput(j)};
+            // Optional outputs are nullptr here and are from RNN layers.
+            if (output != nullptr && !output->dynamicRangeIsSet())
+            {
+                // Pooling must have the same input and output scales.
+                if (layer->getType() == nvinfer1::LayerType::kPOOLING)
+                {
+                    ASSERT(output->setDynamicRange(-inRange, inRange));
+                }
+                else
+                {
+                    ASSERT(output->setDynamicRange(-outRange, outRange));
+                }
+            }
+        }
+    }
+}
+
+inline void setDummyInt8DynamicRanges(const nvinfer1::IBuilderConfig* c, nvinfer1::INetworkDefinition* n)
+{
+    // Set dummy per-tensor dynamic range if Int8 mode is requested.
+    if (c->getFlag(nvinfer1::BuilderFlag::kINT8))
+    {
+        sample::gLogWarning
+            << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy is not guaranteed."
+            << std::endl;
+        setAllDynamicRanges(n);
+    }
+}
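+
+// A minimal sketch of the calibrator-free INT8 path these two helpers enable; "network"
+// and "config" are assumed to exist, and 2.0f/4.0f are the arbitrary defaults from
+// setAllDynamicRanges(), not tuned values, so the resulting accuracy is only useful for
+// performance measurements (hence the warning logged above):
+//
+//     config->setFlag(nvinfer1::BuilderFlag::kINT8);
+//     setAllDynamicRanges(network, 2.0f, 4.0f); // every tensor needs a range without a calibrator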
+
+inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true)
+{
+    if (useDLACore >= 0)
+    {
+        if (builder->getNbDLACores() == 0)
+        {
+            std::cerr << "Trying to use DLA core " << useDLACore << " on a platform that doesn't have any DLA cores"
+                      << std::endl;
+            assert("Error: use DLA core on a platform that doesn't have any DLA cores" && false);
+        }
+        if (allowGPUFallback)
+        {
+            config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
+        }
+        if (!config->getFlag(nvinfer1::BuilderFlag::kINT8))
+        {
+            // User has not requested INT8 Mode.
+            // By default run in FP16 mode. FP32 mode is not permitted.
+            config->setFlag(nvinfer1::BuilderFlag::kFP16);
+        }
+        config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
+        config->setDLACore(useDLACore);
+    }
+}
+
+inline int32_t parseDLA(int32_t argc, char** argv)
+{
+    for (int32_t i = 1; i < argc; i++)
+    {
+        if (strncmp(argv[i], "--useDLACore=", 13) == 0)
+        {
+            return std::stoi(argv[i] + 13);
+        }
+    }
+    return -1;
+}
+
+inline uint32_t getElementSize(nvinfer1::DataType t) noexcept
+{
+    switch (t)
+    {
+    case nvinfer1::DataType::kINT32: return 4;
+    case nvinfer1::DataType::kFLOAT: return 4;
+    case nvinfer1::DataType::kHALF: return 2;
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kINT8: return 1;
+    }
+    return 0;
+}
+
+inline int64_t volume(const nvinfer1::Dims& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
+}
+
+template <int C, int H, int W>
+struct PPM
+{
+    std::string magic, fileName;
+    int h, w, max;
+    uint8_t buffer[C * H * W];
+};
+
+// New vPPM(variable sized PPM) class with variable dimensions.
+struct vPPM +{ + std::string magic, fileName; + int h, w, max; + std::vector buffer; +}; + +struct BBox +{ + float x1, y1, x2, y2; +}; + +template +void readPPMFile(const std::string& filename, samplesCommon::PPM& ppm) +{ + ppm.fileName = filename; + std::ifstream infile(filename, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open."); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void readPPMFile(const std::string& filename, vPPM& ppm, std::vector& input_dir) +{ + ppm.fileName = filename; + std::ifstream infile(locateFile(filename, input_dir), std::ifstream::binary); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + + for (int i = 0; i < ppm.w * ppm.h * 3; ++i) + { + ppm.buffer.push_back(0); + } + + infile.read(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +template +void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const BBox& bbox) +{ + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); + const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); + const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); + const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1); + + for (int x = x1; x <= x2; ++x) + { + // bbox top border + ppm.buffer[(y1 * ppm.w + x) * 3] = 255; + ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(y2 * ppm.w + x) * 3] = 255; + ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = y1; y <= y2; ++y) + { + // bbox left border + ppm.buffer[(y * ppm.w + x1) * 3] = 255; + ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + x2) * 3] = 255; + ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0; + } + + outfile.write(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vector& dets) +{ + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + + for (auto bbox : dets) + { + for (int x = int(bbox.x1); x < int(bbox.x2); ++x) + { + // bbox top border + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = int(bbox.y1); y < int(bbox.y2); ++y) + { + // bbox left border + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 
1] = 0; + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 2] = 0; + } + } + + outfile.write(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +class TimerBase +{ +public: + virtual void start() {} + virtual void stop() {} + float microseconds() const noexcept + { + return mMs * 1000.f; + } + float milliseconds() const noexcept + { + return mMs; + } + float seconds() const noexcept + { + return mMs / 1000.f; + } + void reset() noexcept + { + mMs = 0.f; + } + +protected: + float mMs{0.0f}; +}; + +class GpuTimer : public TimerBase +{ +public: + explicit GpuTimer(cudaStream_t stream) + : mStream(stream) + { + CHECK(cudaEventCreate(&mStart)); + CHECK(cudaEventCreate(&mStop)); + } + ~GpuTimer() + { + CHECK(cudaEventDestroy(mStart)); + CHECK(cudaEventDestroy(mStop)); + } + void start() + { + CHECK(cudaEventRecord(mStart, mStream)); + } + void stop() + { + CHECK(cudaEventRecord(mStop, mStream)); + float ms{0.0f}; + CHECK(cudaEventSynchronize(mStop)); + CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); + mMs += ms; + } + +private: + cudaEvent_t mStart, mStop; + cudaStream_t mStream; +}; // class GpuTimer + +template +class CpuTimer : public TimerBase +{ +public: + using clock_type = Clock; + + void start() + { + mStart = Clock::now(); + } + void stop() + { + mStop = Clock::now(); + mMs += std::chrono::duration{mStop - mStart}.count(); + } + +private: + std::chrono::time_point mStart, mStop; +}; // class CpuTimer + +using PreciseCpuTimer = CpuTimer; + +inline std::vector splitString(std::string str, char delimiter = ',') +{ + std::vector splitVect; + std::stringstream ss(str); + std::string substr; + + while (ss.good()) + { + getline(ss, substr, delimiter); + splitVect.emplace_back(std::move(substr)); + } + return splitVect; +} + +// Return m rounded up to nearest multiple of n +inline int roundUp(int m, int n) +{ + return ((m + n - 1) / n) * n; +} + +inline int getC(const nvinfer1::Dims& d) +{ + return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; +} + +inline int getH(const nvinfer1::Dims& d) +{ + return d.nbDims >= 2 ? d.d[d.nbDims - 2] : 1; +} + +inline int getW(const nvinfer1::Dims& d) +{ + return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; +} + +inline void loadLibrary(const std::string& path) +{ +#ifdef _MSC_VER + void* handle = LoadLibrary(path.c_str()); +#else + int32_t flags{RTLD_LAZY}; +#if ENABLE_ASAN + // https://github.com/google/sanitizers/issues/89 + // asan doesn't handle module unloading correctly and there are no plans on doing + // so. In order to get proper stack traces, don't delete the shared library on + // close so that asan can resolve the symbols correctly. 
+ flags |= RTLD_NODELETE; +#endif // ENABLE_ASAN + + void* handle = dlopen(path.c_str(), flags); +#endif + if (handle == nullptr) + { +#ifdef _MSC_VER + sample::gLogError << "Could not load plugin library: " << path << std::endl; +#else + sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; +#endif + } +} + +inline int32_t getSMVersion() +{ + int32_t deviceIndex = 0; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t major, minor; + CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); + CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); + + return ((major << 8) | minor); +} + +inline bool isSMSafe() +{ + const int32_t smVersion = getSMVersion(); + return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || + smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; +} + +inline bool isDataTypeSupported(nvinfer1::DataType dataType) +{ + auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); + if (!builder) + { + return false; + } + + if ((dataType == nvinfer1::DataType::kINT8 && !builder->platformHasFastInt8()) + || (dataType == nvinfer1::DataType::kHALF && !builder->platformHasFastFp16())) + { + return false; + } + + return true; +} + +} // namespace samplesCommon + +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) +{ + os << "("; + for (int i = 0; i < dims.nbDims; ++i) + { + os << (i ? ", " : "") << dims.d[i]; + } + return os << ")"; +} + +#endif // TENSORRT_COMMON_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/half.h b/src/Detector/tensorrt_yolo/common_deprecated/half.h new file mode 100644 index 00000000..0755c316 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/half.h @@ -0,0 +1,4302 @@ +// half - IEEE 754-based half-precision floating point library. +// +// Copyright (c) 2012-2017 Christian Rau +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Version 1.12.0 + +/// \file +/// Main header file for half precision functionality. + +#ifndef HALF_HALF_HPP +#define HALF_HALF_HPP + +/// Combined gcc version number. +#define HALF_GNUC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// check C++11 language features +#if defined(__clang__) // clang +#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +/*#elif defined(__INTEL_COMPILER) //Intel C++ + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) ???????? + #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) ???????? + #define HALF_ENABLE_CPP11_CONSTEXPR 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) ???????? + #define HALF_ENABLE_CPP11_NOEXCEPT 1 + #endif + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG) ???????? 
+ #define HALF_ENABLE_CPP11_LONG_LONG 1 + #endif*/ +#elif defined(__GNUC__) // gcc +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#endif +#elif defined(_MSC_VER) // Visual C++ +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#define HALF_POP_WARNINGS 1 +#pragma warning(push) +#pragma warning(disable : 4099 4127 4146) // struct vs class, constant in if, negative unsigned +#endif + +// check C++11 library features +#include +#if defined(_LIBCPP_VERSION) // libc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#elif defined(__GLIBCXX__) // libstdc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifdef __clang__ +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#else +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#endif +#elif defined(_CPPLIB_VER) // Dinkumware/Visual C++ +#if _CPPLIB_VER >= 520 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#if _CPPLIB_VER >= 610 +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#endif +#endif +#undef HALF_GNUC_VERSION + +// support constexpr +#if HALF_ENABLE_CPP11_CONSTEXPR +#define HALF_CONSTEXPR constexpr +#define HALF_CONSTEXPR_CONST constexpr +#else +#define HALF_CONSTEXPR +#define 
HALF_CONSTEXPR_CONST const +#endif + +// support noexcept +#if HALF_ENABLE_CPP11_NOEXCEPT +#define HALF_NOEXCEPT noexcept +#define HALF_NOTHROW noexcept +#else +#define HALF_NOEXCEPT +#define HALF_NOTHROW throw() +#endif + +#include +#include +#include +#include +#include +#include +#if HALF_ENABLE_CPP11_TYPE_TRAITS +#include +#endif +#if HALF_ENABLE_CPP11_CSTDINT +#include +#endif +#if HALF_ENABLE_CPP11_HASH +#include +#endif + +/// Default rounding mode. +/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as +/// well as for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including +/// half.hpp) to one of the standard rounding modes using their respective constants or the equivalent values of +/// `std::float_round_style`: +/// +/// `std::float_round_style` | value | rounding +/// ---------------------------------|-------|------------------------- +/// `std::round_indeterminate` | -1 | fastest (default) +/// `std::round_toward_zero` | 0 | toward zero +/// `std::round_to_nearest` | 1 | to nearest +/// `std::round_toward_infinity` | 2 | toward positive infinity +/// `std::round_toward_neg_infinity` | 3 | toward negative infinity +/// +/// By default this is set to `-1` (`std::round_indeterminate`), which uses truncation (round toward zero, but with +/// overflows set to infinity) and is the fastest rounding mode possible. It can even be set to +/// `std::numeric_limits::round_style` to synchronize the rounding mode with that of the underlying +/// single-precision implementation. +#ifndef HALF_ROUND_STYLE +#define HALF_ROUND_STYLE 1 // = std::round_to_nearest +#endif + +/// Tie-breaking behaviour for round to nearest. +/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value. By default this +/// is defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way +/// cases (and thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more +/// IEEE-conformant behaviour is needed. +#ifndef HALF_ROUND_TIES_TO_EVEN +#define HALF_ROUND_TIES_TO_EVEN 0 // ties away from zero +#endif + +/// Value signaling overflow. +/// In correspondence with `HUGE_VAL[F|L]` from `` this symbol expands to a positive value signaling the overflow +/// of an operation, in particular it just evaluates to positive infinity. +#define HUGE_VALH std::numeric_limits::infinity() + +/// Fast half-precision fma function. +/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate +/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all +/// arithmetic operations, this is in fact always the case. +#define FP_FAST_FMAH 1 + +#ifndef FP_ILOGB0 +#define FP_ILOGB0 INT_MIN +#endif +#ifndef FP_ILOGBNAN +#define FP_ILOGBNAN INT_MAX +#endif +#ifndef FP_SUBNORMAL +#define FP_SUBNORMAL 0 +#endif +#ifndef FP_ZERO +#define FP_ZERO 1 +#endif +#ifndef FP_NAN +#define FP_NAN 2 +#endif +#ifndef FP_INFINITE +#define FP_INFINITE 3 +#endif +#ifndef FP_NORMAL +#define FP_NORMAL 4 +#endif + +/// Main namespace for half precision functionality. +/// This namespace contains all the functionality provided by the library. +namespace half_float +{ +class half; + +#if HALF_ENABLE_CPP11_USER_LITERALS +/// Library-defined half-precision literals. 
+
+/// Main namespace for half precision functionality.
+/// This namespace contains all the functionality provided by the library.
+namespace half_float
+{
+class half;
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+/// Library-defined half-precision literals.
+/// Import this namespace to enable half-precision floating point literals:
+/// ~~~~{.cpp}
+/// using namespace half_float::literal;
+/// half_float::half f = 4.2_h;
+/// ~~~~
+namespace literal
+{
+half operator"" _h(long double);
+}
+#endif
+
+/// \internal
+/// \brief Implementation details.
+namespace detail
+{
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+/// Conditional type.
+template <bool B, typename T, typename F>
+struct conditional : std::conditional<B, T, F>
+{
+};
+
+/// Helper for tag dispatching.
+template <bool B>
+struct bool_type : std::integral_constant<bool, B>
+{
+};
+using std::false_type;
+using std::true_type;
+
+/// Type traits for floating point types.
+template <typename T>
+struct is_float : std::is_floating_point<T>
+{
+};
+#else
+/// Conditional type.
+template <bool, typename T, typename>
+struct conditional
+{
+    typedef T type;
+};
+template <typename T, typename F>
+struct conditional<false, T, F>
+{
+    typedef F type;
+};
+
+/// Helper for tag dispatching.
+template <bool>
+struct bool_type
+{
+};
+typedef bool_type<true> true_type;
+typedef bool_type<false> false_type;
+
+/// Type traits for floating point types.
+template <typename> struct is_float : false_type {};
+template <typename T> struct is_float<const T> : is_float<T> {};
+template <typename T> struct is_float<volatile T> : is_float<T> {};
+template <typename T> struct is_float<const volatile T> : is_float<T> {};
+template <> struct is_float<float> : true_type {};
+template <> struct is_float<double> : true_type {};
+template <> struct is_float<long double> : true_type {};
+#endif
+
+/// Type traits for floating point bits.
+template <typename T>
+struct bits
+{
+    typedef unsigned char type;
+};
+template <typename T> struct bits<const T> : bits<T> {};
+template <typename T> struct bits<volatile T> : bits<T> {};
+template <typename T> struct bits<const volatile T> : bits<T> {};
+
+#if HALF_ENABLE_CPP11_CSTDINT
+/// Unsigned integer of (at least) 16 bits width.
+typedef std::uint_least16_t uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <>
+struct bits<float>
+{
+    typedef std::uint_least32_t type;
+};
+
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double>
+{
+    typedef std::uint_least64_t type;
+};
+#else
+/// Unsigned integer of (at least) 16 bits width.
+typedef unsigned short uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <>
+struct bits<float> : conditional<std::numeric_limits<unsigned int>::digits >= 32, unsigned int, unsigned long>
+{
+};
+
+#if HALF_ENABLE_CPP11_LONG_LONG
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double> : conditional<std::numeric_limits<unsigned long>::digits >= 64, unsigned long, unsigned long long>
+{
+};
+#else
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double>
+{
+    typedef unsigned long type;
+};
+#endif
+#endif
+
+/// Tag type for binary construction.
+struct binary_t
+{
+};
+
+/// Tag for binary construction.
+HALF_CONSTEXPR_CONST binary_t binary = binary_t();
+
+/// Temporary half-precision expression.
+/// This class represents a half-precision expression which just stores a single-precision value internally.
+struct expr
+{
+    /// Conversion constructor.
+    /// \param f single-precision value to convert
+    explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {}
+
+    /// Conversion to single-precision.
+    /// \return single precision value representing expression value
+    HALF_CONSTEXPR operator float() const HALF_NOEXCEPT
+    {
+        return value_;
+    }
+
+private:
+    /// Internal expression value stored in single-precision.
+    float value_;
+};
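+
+// NOTE (illustrative, hypothetical names): `bool_type` above enables the tag-dispatch
+// pattern used by the conversion routines below -- a compile-time property of the source
+// type picks either the IEEE bit-level fast path or a portable fallback:
+//
+//     template <typename T> uint16 convert_impl(T v, true_type);  // IEEE bit tricks
+//     template <typename T> uint16 convert_impl(T v, false_type); // generic fallback
+//     template <typename T> uint16 convert(T v)
+//     {
+//         return convert_impl(v, bool_type<std::numeric_limits<T>::is_iec559>());
+//     }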
+
+/// SFINAE helper for generic half-precision functions.
+/// This class template has to be specialized for each valid combination of argument types to provide a corresponding
+/// `type` member equivalent to \a T.
+/// \tparam T type to return
+template <typename T, typename, typename = void, typename = void>
+struct enable
+{
+};
+template <typename T> struct enable<T, half, void, void> { typedef T type; };
+template <typename T> struct enable<T, expr, void, void> { typedef T type; };
+template <typename T> struct enable<T, half, half, void> { typedef T type; };
+template <typename T> struct enable<T, half, expr, void> { typedef T type; };
+template <typename T> struct enable<T, expr, half, void> { typedef T type; };
+template <typename T> struct enable<T, expr, expr, void> { typedef T type; };
+template <typename T> struct enable<T, half, half, half> { typedef T type; };
+template <typename T> struct enable<T, half, half, expr> { typedef T type; };
+template <typename T> struct enable<T, half, expr, half> { typedef T type; };
+template <typename T> struct enable<T, half, expr, expr> { typedef T type; };
+template <typename T> struct enable<T, expr, half, half> { typedef T type; };
+template <typename T> struct enable<T, expr, half, expr> { typedef T type; };
+template <typename T> struct enable<T, expr, expr, half> { typedef T type; };
+template <typename T> struct enable<T, expr, expr, expr> { typedef T type; };
+
+/// Return type for specialized generic 2-argument half-precision functions.
+/// This class template has to be specialized for each valid combination of argument types to provide a corresponding
+/// `type` member denoting the appropriate return type.
+/// \tparam T first argument type
+/// \tparam U second argument type
+template <typename T, typename U>
+struct result : enable<expr, T, U>
+{
+};
+template <>
+struct result<half, half>
+{
+    typedef half type;
+};
+
+/// \name Classification helpers
+/// \{
+
+/// Check for infinity.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if infinity
+/// \retval false else
+template <typename T>
+bool builtin_isinf(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+    return std::isinf(arg);
+#elif defined(_MSC_VER)
+    return !::_finite(static_cast<double>(arg)) && !::_isnan(static_cast<double>(arg));
+#else
+    return arg == std::numeric_limits<T>::infinity() || arg == -std::numeric_limits<T>::infinity();
+#endif
+}
+
+/// Check for NaN.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if not a number
+/// \retval false else
+template <typename T>
+bool builtin_isnan(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+    return std::isnan(arg);
+#elif defined(_MSC_VER)
+    return ::_isnan(static_cast<double>(arg)) != 0;
+#else
+    return arg != arg;
+#endif
+}
+
+/// Check sign.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if signbit set
+/// \retval false else
+template <typename T>
+bool builtin_signbit(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+    return std::signbit(arg);
+#else
+    return arg < T() || (arg == T() && T(1) / arg < T());
+#endif
+}
+
+/// \}
+/// \name Conversion
+/// \{
+
+/// Convert IEEE single-precision to half-precision.
+/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value single-precision value
+/// \return binary representation of half-precision value
+template <std::float_round_style R>
+uint16 float2half_impl(float value, true_type)
+{
+    typedef bits<float>::type uint32;
+    uint32 bits; // = *reinterpret_cast<uint32*>(&value); //violating strict aliasing!
+ std::memcpy(&bits, &value, sizeof(float)); + /* uint16 hbits = (bits>>16) & 0x8000; + bits &= 0x7FFFFFFF; + int exp = bits >> 23; + if(exp == 255) + return hbits | 0x7C00 | (0x3FF&-static_cast((bits&0x7FFFFF)!=0)); + if(exp > 142) + { + if(R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits>>15); + if(R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits>>15); + return hbits | 0x7BFF + (R!=std::round_toward_zero); + } + int g, s; + if(exp > 112) + { + g = (bits>>12) & 1; + s = (bits&0xFFF) != 0; + hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF); + } + else if(exp > 101) + { + int i = 125 - exp; + bits = (bits&0x7FFFFF) | 0x800000; + g = (bits>>i) & 1; + s = (bits&((1L<> (i+1); + } + else + { + g = 0; + s = bits != 0; + } + if(R == std::round_to_nearest) + #if HALF_ROUND_TIES_TO_EVEN + hbits += g & (s|hbits); + #else + hbits += g; + #endif + else if(R == std::round_toward_infinity) + hbits += ~(hbits>>15) & (s|g); + else if(R == std::round_toward_neg_infinity) + hbits += (hbits>>15) & (g|s); + */ + static const uint16 base_table[512] = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, + 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, + 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, + 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, + 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 0xC000, 0xC400, 0xC800, + 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00}; + static const unsigned char shift_table[512] = {24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13};
+    uint16 hbits = base_table[bits >> 23] + static_cast<uint16>((bits & 0x7FFFFF) >> shift_table[bits >> 23]);
+    if (R == std::round_to_nearest)
+        hbits += (((bits & 0x7FFFFF) >> (shift_table[bits >> 23] - 1)) | (((bits >> 23) & 0xFF) == 102))
+                & ((hbits & 0x7C00) != 0x7C00)
+#if HALF_ROUND_TIES_TO_EVEN
+                & (((((static_cast<uint32>(1) << (shift_table[bits >> 23] - 1)) - 1) & bits) != 0) | hbits)
+#endif
+            ;
+    else if (R == std::round_toward_zero)
+        hbits -= ((hbits & 0x7FFF) == 0x7C00) & ~shift_table[bits >> 23];
+    else if (R == std::round_toward_infinity)
+        hbits += ((((bits & 0x7FFFFF & ((static_cast<uint32>(1) << (shift_table[bits >> 23])) - 1)) != 0)
+                      | (((bits >> 23) <= 102) & ((bits >> 23) != 0)))
+                     & (hbits < 0x7C00))
+            - ((hbits == 0xFC00) & ((bits >> 23) != 511));
+    else if (R == std::round_toward_neg_infinity)
+        hbits += ((((bits & 0x7FFFFF & ((static_cast<uint32>(1) << (shift_table[bits >> 23])) - 1)) != 0)
+                      | (((bits >> 23) <= 358) & ((bits >> 23) != 256)))
+                     & (hbits < 0xFC00) & (hbits >> 15))
+            - ((hbits == 0x7C00) & ((bits >> 23) != 255));
+    return hbits;
+}
+
+/// Convert IEEE double-precision to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value double-precision value
+/// \return binary representation of half-precision value
+template <std::float_round_style R>
+uint16 float2half_impl(double value, true_type)
+{
+    typedef bits<float>::type uint32;
+    typedef bits<double>::type uint64;
+    uint64 bits; // = *reinterpret_cast<uint64*>(&value); //violating strict aliasing!
+    std::memcpy(&bits, &value, sizeof(double));
+    uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF;
+    uint16 hbits = (hi >> 16) & 0x8000;
+    hi &= 0x7FFFFFFF;
+    int exp = hi >> 20;
+    if (exp == 2047)
+        return hbits | 0x7C00 | (0x3FF & -static_cast<unsigned>((bits & 0xFFFFFFFFFFFFF) != 0));
+    if (exp > 1038)
+    {
+        if (R == std::round_toward_infinity)
+            return hbits | 0x7C00 - (hbits >> 15);
+        if (R == std::round_toward_neg_infinity)
+            return hbits | 0x7BFF + (hbits >> 15);
+        return hbits | 0x7BFF + (R != std::round_toward_zero);
+    }
+    int g, s = lo != 0;
+    if (exp > 1008)
+    {
+        g = (hi >> 9) & 1;
+        s |= (hi & 0x1FF) != 0;
+        hbits |= ((exp - 1008) << 10) | ((hi >> 10) & 0x3FF);
+    }
+    else if (exp > 997)
+    {
+        int i = 1018 - exp;
+        hi = (hi & 0xFFFFF) | 0x100000;
+        g = (hi >> i) & 1;
+        s |= (hi & ((1L << i) - 1)) != 0;
+        hbits |= hi >> (i + 1);
+    }
+    else
+    {
+        g = 0;
+        s |= hi != 0;
+    }
+    if (R == std::round_to_nearest)
+#if HALF_ROUND_TIES_TO_EVEN
+        hbits += g & (s | hbits);
+#else
+        hbits += g;
+#endif
+    else if (R == std::round_toward_infinity)
+        hbits += ~(hbits >> 15) & (s | g);
+    else if (R == std::round_toward_neg_infinity)
+        hbits += (hbits >> 15) & (g | s);
+    return hbits;
+}
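+
+// NOTE (illustrative, not part of the upstream header): for checking the table-driven
+// fast path above, a compact table-free reference conversion (round-to-nearest-even
+// only; `float_to_half_ref` is a hypothetical name used just for this sketch):
+//
+//     std::uint16_t float_to_half_ref(float f)
+//     {
+//         std::uint32_t b;
+//         std::memcpy(&b, &f, sizeof(b));          // bit pattern without aliasing UB
+//         std::uint16_t sign = (b >> 16) & 0x8000;
+//         b &= 0x7FFFFFFF;
+//         if (b >= 0x7F800000)                     // Inf or NaN (payload dropped)
+//             return sign | 0x7C00 | (b > 0x7F800000 ? 0x200 : 0);
+//         std::uint32_t m = b & 0x7FFFFF;
+//         int e = int(b >> 23) - 127 + 15;         // re-bias 8-bit exponent to 5 bits
+//         if (e >= 31)                             // too large: round to infinity
+//             return sign | 0x7C00;
+//         if (e <= 0)                              // subnormal half or zero
+//         {
+//             if (e < -10)
+//                 return sign;                     // underflows to signed zero
+//             m |= 0x800000;                       // make the leading 1 explicit
+//             int shift = 14 - e;                  // 14..24
+//             std::uint32_t h = m >> shift, rem = m & ((1u << shift) - 1);
+//             std::uint32_t mid = 1u << (shift - 1);
+//             if (rem > mid || (rem == mid && (h & 1)))
+//                 ++h;                             // carry may normalize; still correct
+//             return sign | static_cast<std::uint16_t>(h);
+//         }
+//         std::uint16_t h = sign | static_cast<std::uint16_t>((e << 10) | (m >> 13));
+//         std::uint32_t rem = m & 0x1FFF;          // the 13 dropped mantissa bits
+//         if (rem > 0x1000 || (rem == 0x1000 && (h & 1)))
+//             ++h;                                 // carry into exponent / Inf is correct
+//         return h;
+//     }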
+
+/// Convert non-IEEE floating point to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T source type (builtin floating point type)
+/// \param value floating point value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 float2half_impl(T value, ...)
+{
+    uint16 hbits = static_cast<uint16>(builtin_signbit(value)) << 15;
+    if (value == T())
+        return hbits;
+    if (builtin_isnan(value))
+        return hbits | 0x7FFF;
+    if (builtin_isinf(value))
+        return hbits | 0x7C00;
+    int exp;
+    std::frexp(value, &exp);
+    if (exp > 16)
+    {
+        if (R == std::round_toward_infinity)
+            return hbits | (0x7C00 - (hbits >> 15));
+        else if (R == std::round_toward_neg_infinity)
+            return hbits | (0x7BFF + (hbits >> 15));
+        return hbits | (0x7BFF + (R != std::round_toward_zero));
+    }
+    if (exp < -13)
+        value = std::ldexp(value, 24);
+    else
+    {
+        value = std::ldexp(value, 11 - exp);
+        hbits |= ((exp + 13) << 10);
+    }
+    T ival, frac = std::modf(value, &ival);
+    hbits += static_cast<uint16>(std::abs(static_cast<int>(ival)));
+    if (R == std::round_to_nearest)
+    {
+        frac = std::abs(frac);
+#if HALF_ROUND_TIES_TO_EVEN
+        hbits += (frac > T(0.5)) | ((frac == T(0.5)) & hbits);
+#else
+        hbits += frac >= T(0.5);
+#endif
+    }
+    else if (R == std::round_toward_infinity)
+        hbits += frac > T();
+    else if (R == std::round_toward_neg_infinity)
+        hbits += frac < T();
+    return hbits;
+}
+
+/// Convert floating point to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T source type (builtin floating point type)
+/// \param value floating point value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 float2half(T value)
+{
+    return float2half_impl<R>(
+        value, bool_type<std::numeric_limits<T>::is_iec559 && sizeof(typename bits<T>::type) == sizeof(T)>());
+}
+
+/// Convert integer to half-precision floating point.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam S `true` if value negative, `false` else
+/// \tparam T type to convert (builtin integer type)
+/// \param value non-negative integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, bool S, typename T>
+uint16 int2half_impl(T value)
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+    static_assert(std::is_integral<T>::value, "int to half conversion only supports builtin integer types");
+#endif
+    if (S)
+        value = -value;
+    uint16 bits = S << 15;
+    if (value > 0xFFFF)
+    {
+        if (R == std::round_toward_infinity)
+            bits |= 0x7C00 - S;
+        else if (R == std::round_toward_neg_infinity)
+            bits |= 0x7BFF + S;
+        else
+            bits |= 0x7BFF + (R != std::round_toward_zero);
+    }
+    else if (value)
+    {
+        uint32_t m = value, exp = 24;
+        for (; m < 0x400; m <<= 1, --exp)
+            ;
+        for (; m > 0x7FF; m >>= 1, ++exp)
+            ;
+        bits |= (exp << 10) + m;
+        if (exp > 24)
+        {
+            if (R == std::round_to_nearest)
+                bits += (value >> (exp - 25)) & 1
+#if HALF_ROUND_TIES_TO_EVEN
+                    & (((((1 << (exp - 25)) - 1) & value) != 0) | bits)
+#endif
+                    ;
+            else if (R == std::round_toward_infinity)
+                bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & !S;
+            else if (R == std::round_toward_neg_infinity)
+                bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & S;
+        }
+    }
+    return bits;
+}
+
+/// Convert integer to half-precision floating point.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T type to convert (builtin integer type)
+/// \param value integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 int2half(T value)
+{
+    return (value < 0) ? int2half_impl<R, true>(value) : int2half_impl<R, false>(value);
+}
+
+/// Convert half-precision to IEEE single-precision.
+/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+/// \param value binary representation of half-precision value +/// \return single-precision value +inline float half2float_impl(uint16 value, float, true_type) +{ + typedef bits::type uint32; + /* uint32 bits = static_cast(value&0x8000) << 16; + int abs = value & 0x7FFF; + if(abs) + { + bits |= 0x38000000 << static_cast(abs>=0x7C00); + for(; abs<0x400; abs<<=1,bits-=0x800000) ; + bits += static_cast(abs) << 13; + } + */ + static const uint32 mantissa_table[2048] = {0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, + 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, + 0x35700000, 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, + 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 0x36000000, + 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, + 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 0x36400000, 0x36440000, 0x36480000, + 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, + 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, + 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, + 0x369C0000, 0x369E0000, 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, + 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, + 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 0x36E00000, 0x36E20000, + 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, + 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, + 0x370D0000, 0x370E0000, 0x370F0000, 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, + 0x371F0000, 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 0x37300000, + 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 0x37400000, 0x37410000, 0x37420000, + 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, + 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, + 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, + 0x375E0000, 0x375F0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, + 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, + 0x37790000, 0x377A0000, 0x377B0000, 
0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, + 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, + 0x378E8000, 0x378F0000, 0x378F8000, 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, + 0x37978000, 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, + 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 0x37A00000, + 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, + 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 0x37A80000, 0x37A88000, 0x37A90000, + 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, + 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, + 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, + 0x37B70000, 0x37B78000, 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, + 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, + 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 0x37C80000, 0x37C88000, + 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, + 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, + 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, + 0x37D68000, 0x37D70000, 0x37D78000, 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, + 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, + 0x37DF8000, 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, + 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 0x37E80000, + 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, + 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 0x37F00000, 0x37F08000, 0x37F10000, + 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, + 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, + 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, + 0x37FF0000, 0x37FF8000, 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, + 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, + 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 
0x38078000, 0x3807C000, 0x38080000, 0x38084000, + 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, + 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, + 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, + 0x380F4000, 0x380F8000, 0x380FC000, 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, + 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, + 0x3813C000, 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, + 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 0x38180000, + 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, + 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 0x381C0000, 0x381C4000, 0x381C8000, + 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, + 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, + 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, + 0x38238000, 0x3823C000, 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, + 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, + 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 0x382C0000, 0x382C4000, + 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, + 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 0x38300000, 0x38304000, 0x38308000, 0x3830C000, + 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, + 0x38334000, 0x38338000, 0x3833C000, 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, + 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, + 0x3837C000, 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, + 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 0x383C0000, + 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, + 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 0x38400000, 0x38404000, 0x38408000, + 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, + 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, + 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, + 0x38478000, 0x3847C000, 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, + 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, + 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 
0x38500000, 0x38504000, + 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, + 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 0x38540000, 0x38544000, 0x38548000, 0x3854C000, + 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, + 0x38574000, 0x38578000, 0x3857C000, 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, + 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, + 0x385BC000, 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, + 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 0x38600000, + 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, + 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 0x38640000, 0x38644000, 0x38648000, + 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, + 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, + 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, + 0x386B8000, 0x386BC000, 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, + 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, + 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 0x38740000, 0x38744000, + 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, + 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 0x38780000, 0x38784000, 0x38788000, 0x3878C000, + 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, + 0x387B4000, 0x387B8000, 0x387BC000, 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, + 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, + 0x387FC000, 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 0x38020000, + 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 0x38040000, 0x38042000, 0x38044000, + 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, + 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, + 0x3807C000, 0x3807E000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, + 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, + 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 0x380C0000, 0x380C2000, + 
0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, + 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, + 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, + 0x380FA000, 0x380FC000, 0x380FE000, 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, + 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, + 0x3811E000, 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 0x38140000, + 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 0x38160000, 0x38162000, 0x38164000, + 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, + 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, + 0x3819C000, 0x3819E000, 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, + 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, + 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 0x381E0000, 0x381E2000, + 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, + 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, + 0x3821A000, 0x3821C000, 0x3821E000, 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, + 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, + 0x3823E000, 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 0x38260000, + 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 0x38280000, 0x38282000, 0x38284000, + 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, + 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, + 0x382BC000, 0x382BE000, 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, + 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, + 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 
0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, + 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, + 0x3833A000, 0x3833C000, 0x3833E000, 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, + 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, + 0x3835E000, 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 0x38380000, + 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 0x383A0000, 0x383A2000, 0x383A4000, + 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, + 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, + 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, + 0x383DC000, 0x383DE000, 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, + 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, + 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, + 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, + 0x3845A000, 0x3845C000, 0x3845E000, 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, + 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, + 0x3847E000, 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 0x384A0000, + 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, + 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 0x384C0000, 0x384C2000, 0x384C4000, + 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, + 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, + 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, + 0x384FC000, 0x384FE000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, + 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, + 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 
0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, + 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, + 0x3857A000, 0x3857C000, 0x3857E000, 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, + 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, + 0x3859E000, 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, + 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 0x385C0000, + 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, + 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 0x385E0000, 0x385E2000, 0x385E4000, + 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, + 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, + 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, + 0x3861C000, 0x3861E000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, + 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, + 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, + 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, + 0x3869A000, 0x3869C000, 0x3869E000, 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, + 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, + 0x386BE000, 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, + 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 0x386E0000, + 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, + 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 0x38700000, 0x38702000, 0x38704000, + 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, + 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, + 0x3873C000, 0x3873E000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, + 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, + 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 
0x38790000, 0x38792000, 0x38794000,
+        0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000,
+        0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000,
+        0x387BA000, 0x387BC000, 0x387BE000, 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000,
+        0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000,
+        0x387DE000, 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000,
+        0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000};
+    static const uint32 exponent_table[64] = {0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000,
+        0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000,
+        0x07800000, 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000,
+        0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 0x80000000,
+        0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000,
+        0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 0x88000000, 0x88800000, 0x89000000,
+        0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000,
+        0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000};
+    static const unsigned short offset_table[64] = {0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+        1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+        1024, 1024, 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+        1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024};
+    uint32 bits = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + exponent_table[value >> 10];
+    // return *reinterpret_cast<float*>(&bits); //violating strict aliasing!
+    float out;
+    std::memcpy(&out, &bits, sizeof(float));
+    return out;
+}
+
+/// Convert half-precision to IEEE double-precision.
+/// \param value binary representation of half-precision value
+/// \return double-precision value
+inline double half2float_impl(uint16 value, double, true_type)
+{
+    typedef bits<float>::type uint32;
+    typedef bits<double>::type uint64;
+    uint32 hi = static_cast<uint32>(value & 0x8000) << 16;
+    int abs = value & 0x7FFF;
+    if (abs)
+    {
+        hi |= 0x3F000000 << static_cast<unsigned>(abs >= 0x7C00);
+        for (; abs < 0x400; abs <<= 1, hi -= 0x100000)
+            ;
+        hi += static_cast<uint32>(abs) << 10;
+    }
+    uint64 bits = static_cast<uint64>(hi) << 32;
+    // return *reinterpret_cast<double*>(&bits); //violating strict aliasing!
+    double out;
+    std::memcpy(&out, &bits, sizeof(double));
+    return out;
+}
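+
+// NOTE (worked example of the single-precision table decode above): for value = 0x3C00
+// (half 1.0), offset_table[0x3C00 >> 10] == 1024, mantissa_table[1024 + 0x000] ==
+// 0x38000000 and exponent_table[0x3C00 >> 10] == 0x07800000, so bits == 0x3F800000,
+// i.e. 1.0f.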
+
+/// Convert half-precision to non-IEEE floating point.
+/// \tparam T type to convert to (builtin floating point type)
+/// \param value binary representation of half-precision value
+/// \return floating point value
+template <typename T>
+T half2float_impl(uint16 value, T, ...)
+{
+    T out;
+    int abs = value & 0x7FFF;
+    if (abs > 0x7C00)
+        out = std::numeric_limits<T>::has_quiet_NaN ? std::numeric_limits<T>::quiet_NaN() : T();
+    else if (abs == 0x7C00)
+        out = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() : std::numeric_limits<T>::max();
+    else if (abs > 0x3FF)
+        out = std::ldexp(static_cast<T>((abs & 0x3FF) | 0x400), (abs >> 10) - 25);
+    else
+        out = std::ldexp(static_cast<T>(abs), -24);
+    return (value & 0x8000) ? -out : out;
+}
+
+/// Convert half-precision to floating point.
+/// \tparam T type to convert to (builtin floating point type)
+/// \param value binary representation of half-precision value
+/// \return floating point value
+template <typename T>
+T half2float(uint16 value)
+{
+    return half2float_impl(
+        value, T(), bool_type<std::numeric_limits<T>::is_iec559 && sizeof(typename bits<T>::type) == sizeof(T)>());
+}
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign
+/// bits) \param value binary representation of half-precision value \return integral value
+template <std::float_round_style R, bool E, typename T>
+T half2int_impl(uint16 value)
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+    static_assert(std::is_integral<T>::value, "half to int conversion only supports builtin integer types");
+#endif
+    uint32_t e = value & 0x7FFF;
+    if (e >= 0x7C00)
+        return (value & 0x8000) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max();
+    if (e < 0x3800)
+    {
+        if (R == std::round_toward_infinity)
+            return T(~(value >> 15) & (e != 0));
+        else if (R == std::round_toward_neg_infinity)
+            return -T(value > 0x8000);
+        return T();
+    }
+    uint32_t m = (value & 0x3FF) | 0x400;
+    e >>= 10;
+    if (e < 25)
+    {
+        if (R == std::round_to_nearest)
+            m += (1 << (24 - e)) - (~(m >> (25 - e)) & E);
+        else if (R == std::round_toward_infinity)
+            m += ((value >> 15) - 1) & ((1 << (25 - e)) - 1U);
+        else if (R == std::round_toward_neg_infinity)
+            m += -(value >> 15) & ((1 << (25 - e)) - 1U);
+        m >>= 25 - e;
+    }
+    else
+        m <<= e - 25;
+    return (value & 0x8000) ? -static_cast<T>(m) : static_cast<T>(m);
+}
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign
+/// bits) \param value binary representation of half-precision value \return integral value
+template <std::float_round_style R, typename T>
+T half2int(uint16 value)
+{
+    return half2int_impl<R, HALF_ROUND_TIES_TO_EVEN, T>(value);
+}
+
+/// Convert half-precision floating point to integer using round-to-nearest-away-from-zero.
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign
+/// bits) \param value binary representation of half-precision value \return integral value
+template <typename T>
+T half2int_up(uint16 value)
+{
+    return half2int_impl<std::round_to_nearest, 0, T>(value);
+}
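+
+// NOTE (illustrative): assuming the ties-away default (HALF_ROUND_TIES_TO_EVEN == 0),
+//
+//     half2int<std::round_to_nearest, int>(0x3E00);  // half 1.5  ->  2
+//     half2int<std::round_toward_zero, int>(0x3E00); // half 1.5  ->  1
+//     half2int_up<int>(0xC100);                      // half -2.5 -> -3 (ties away)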
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R, bool E>
+uint16 round_half_impl(uint16 value)
+{
+    uint32_t e = value & 0x7FFF;
+    uint16 result = value;
+    if (e < 0x3C00)
+    {
+        result &= 0x8000;
+        if (R == std::round_to_nearest)
+            result |= 0x3C00U & -(e >= (0x3800 + E));
+        else if (R == std::round_toward_infinity)
+            result |= 0x3C00U & -(~(value >> 15) & (e != 0));
+        else if (R == std::round_toward_neg_infinity)
+            result |= 0x3C00U & -(value > 0x8000);
+    }
+    else if (e < 0x6400)
+    {
+        e = 25 - (e >> 10);
+        uint32_t mask = (1 << e) - 1;
+        if (R == std::round_to_nearest)
+            result += (1 << (e - 1)) - (~(result >> e) & E);
+        else if (R == std::round_toward_infinity)
+            result += mask & ((value >> 15) - 1);
+        else if (R == std::round_toward_neg_infinity)
+            result += mask & -(value >> 15);
+        result &= ~mask;
+    }
+    return result;
+}
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R>
+uint16 round_half(uint16 value)
+{
+    return round_half_impl<R, HALF_ROUND_TIES_TO_EVEN>(value);
+}
+
+/// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero.
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+inline uint16 round_half_up(uint16 value)
+{
+    return round_half_impl<std::round_to_nearest, 0>(value);
+}
+/// \}
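+
+// NOTE (illustrative):
+//
+//     round_half<std::round_to_nearest>(0x3E00);  // half 1.5  -> 0x4000 (2.0)
+//     round_half<std::round_toward_zero>(0x3E00); // half 1.5  -> 0x3C00 (1.0)
+//     round_half_up(0xBE00);                      // half -1.5 -> 0xC000 (-2.0)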
+
+struct functions;
+template <typename>
+struct unary_specialized;
+template <typename, typename>
+struct binary_specialized;
+template <typename, typename, std::float_round_style>
+struct half_caster;
+} // namespace detail
+
+/// Half-precision floating point type.
+/// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and
+/// conversions. It is implicitly convertible to single-precision floating point, which causes arithmetic expressions
+/// and functions with mixed-type operands to be of the most precise operand type. Additionally all arithmetic
+/// operations (and many mathematical functions) are carried out in single-precision internally. All conversions from
+/// single- to half-precision are done using the library's default rounding mode, but temporary results inside chained
+/// arithmetic expressions are kept in single-precision as long as possible (while of course still maintaining a strong
+/// half-precision type).
+///
+/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and
+/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which
+/// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the
+/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not necessarily have to be
+/// of exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will
+/// most probably not involve any additional "magic" or padding beyond the simple binary representation of the
+/// underlying 16-bit IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual
+/// size of 16 bits if your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this
+/// should be the case on nearly any reasonable platform.
+///
+/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable
+/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation.
+class half
+{
+    friend struct detail::functions;
+    friend struct detail::unary_specialized<half>;
+    friend struct detail::binary_specialized<half, half>;
+    template <typename, typename, std::float_round_style>
+    friend struct detail::half_caster;
+    friend class std::numeric_limits<half>;
+#if HALF_ENABLE_CPP11_HASH
+    friend struct std::hash<half>;
+#endif
+#if HALF_ENABLE_CPP11_USER_LITERALS
+    friend half literal::operator"" _h(long double);
+#endif
+
+public:
+    /// Default constructor.
+    /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics
+    /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
+    HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
+
+    /// Copy constructor.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to copy from
+    half(detail::expr rhs)
+        : data_(detail::float2half<round_style>(static_cast<float>(rhs)))
+    {
+    }
+
+    /// Conversion constructor.
+    /// \param rhs float to convert
+    explicit half(float rhs)
+        : data_(detail::float2half<round_style>(rhs))
+    {
+    }
+
+    /// Conversion to single-precision.
+    /// \return single precision value representing expression value
+    operator float() const
+    {
+        return detail::half2float<float>(data_);
+    }
+
+    /// Assignment operator.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to copy from
+    /// \return reference to this half
+    half& operator=(detail::expr rhs)
+    {
+        return *this = static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to add
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator+=(T rhs)
+    {
+        return *this += static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to subtract
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator-=(T rhs)
+    {
+        return *this -= static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to multiply with
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator*=(T rhs)
+    {
+        return *this *= static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to divide by
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator/=(T rhs)
+    {
+        return *this /= static_cast<float>(rhs);
+    }
+
+    /// Assignment operator.
+    /// \param rhs single-precision value to copy from
+    /// \return reference to this half
+    half& operator=(float rhs)
+    {
+        data_ = detail::float2half<round_style>(rhs);
+        return *this;
+    }
+
+    /// Arithmetic assignment.
+    /// \param rhs single-precision value to add
+    /// \return reference to this half
+    half& operator+=(float rhs)
+    {
+        data_ = detail::float2half<round_style>(detail::half2float<float>(data_) + rhs);
+        return *this;
+    }
+ /// \param rhs single-precision value to subtract + /// \return reference to this half + half& operator-=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) - rhs); + return *this; + } + + /// Arithmetic assignment. + /// \param rhs single-precision value to multiply with + /// \return reference to this half + half& operator*=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) * rhs); + return *this; + } + + /// Arithmetic assignment. + /// \param rhs single-precision value to divide by + /// \return reference to this half + half& operator/=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) / rhs); + return *this; + } + + /// Prefix increment. + /// \return incremented half value + half& operator++() + { + return *this += 1.0f; + } + + /// Prefix decrement. + /// \return decremented half value + half& operator--() + { + return *this -= 1.0f; + } + + /// Postfix increment. + /// \return non-incremented half value + half operator++(int) + { + half out(*this); + ++*this; + return out; + } + + /// Postfix decrement. + /// \return non-decremented half value + half operator--(int) + { + half out(*this); + --*this; + return out; + } + +private: + /// Rounding mode to use + static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE); + + /// Constructor. + /// \param bits binary representation to set half to + HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) HALF_NOEXCEPT : data_(bits) {} + + /// Internal binary representation + detail::uint16 data_; +}; + +#if HALF_ENABLE_CPP11_USER_LITERALS +namespace literal +{ +/// Half literal. +/// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due +/// to rather involved conversions. +/// \param value literal value +/// \return half with given value (if representable) +inline half operator"" _h(long double value) +{ + return half(detail::binary, detail::float2half(value)); +} +} // namespace literal +#endif + +namespace detail +{ +/// Wrapper implementing unspecialized half-precision functions. +struct functions +{ + /// Addition implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision sum stored in single-precision + static expr plus(float x, float y) + { + return expr(x + y); + } + + /// Subtraction implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision difference stored in single-precision + static expr minus(float x, float y) + { + return expr(x - y); + } + + /// Multiplication implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision product stored in single-precision + static expr multiplies(float x, float y) + { + return expr(x * y); + } + + /// Division implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision quotient stored in single-precision + static expr divides(float x, float y) + { + return expr(x / y); + } + + /// Output implementation. + /// \param out stream to write to + /// \param arg value to write + /// \return reference to stream + template + static std::basic_ostream& write(std::basic_ostream& out, float arg) + { + return out << arg; + } + + /// Input implementation. 
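+
+// Streaming goes through float in both directions, as the write()/read()
+// wrappers here show; a short sketch (assumes the operator<< and operator>>
+// overloads defined later in this header):
+//
+//     #include <sstream>
+//     half_float::half h;
+//     std::istringstream("3.14159") >> h;   // parsed as float, then rounded
+//     // h now holds the nearest representable half, 3.140625
+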
+ /// \param in stream to read from + /// \param arg half to read into + /// \return reference to stream + template + static std::basic_istream& read(std::basic_istream& in, half& arg) + { + float f; + if (in >> f) + arg = f; + return in; + } + + /// Modulo implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr fmod(float x, float y) + { + return expr(std::fmod(x, y)); + } + + /// Remainder implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr remainder(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::remainder(x, y)); +#else + if (builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + float ax = std::fabs(x), ay = std::fabs(y); + if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if (ay >= 65536.0f) + return expr(x); + if (ax == ay) + return expr(builtin_signbit(x) ? -0.0f : 0.0f); + ax = std::fmod(ax, ay + ay); + float y2 = 0.5f * ay; + if (ax > y2) + { + ax -= ay; + if (ax >= y2) + ax -= ay; + } + return expr(builtin_signbit(x) ? -ax : ax); +#endif + } + + /// Remainder implementation. + /// \param x first operand + /// \param y second operand + /// \param quo address to store quotient bits at + /// \return Half-precision division remainder stored in single-precision + static expr remquo(float x, float y, int* quo) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::remquo(x, y, quo)); +#else + if (builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + bool sign = builtin_signbit(x), qsign = static_cast(sign ^ builtin_signbit(y)); + float ax = std::fabs(x), ay = std::fabs(y); + if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if (ay >= 65536.0f) + return expr(x); + if (ax == ay) + return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f); + ax = std::fmod(ax, 8.0f * ay); + int cquo = 0; + if (ax >= 4.0f * ay) + { + ax -= 4.0f * ay; + cquo += 4; + } + if (ax >= 2.0f * ay) + { + ax -= 2.0f * ay; + cquo += 2; + } + float y2 = 0.5f * ay; + if (ax > y2) + { + ax -= ay; + ++cquo; + if (ax >= y2) + { + ax -= ay; + ++cquo; + } + } + return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax); +#endif + } + + /// Positive difference implementation. + /// \param x first operand + /// \param y second operand + /// \return Positive difference stored in single-precision + static expr fdim(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fdim(x, y)); +#else + return expr((x <= y) ? 0.0f : (x - y)); +#endif + } + + /// Fused multiply-add implementation. + /// \param x first operand + /// \param y second operand + /// \param z third operand + /// \return \a x * \a y + \a z stored in single-precision + static expr fma(float x, float y, float z) + { +#if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF) + return expr(std::fma(x, y, z)); +#else + return expr(x * y + z); +#endif + } + + /// Get NaN. + /// \return Half-precision quiet NaN + static half nanh() + { + return half(binary, 0x7FFF); + } + + /// Exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr exp(float arg) + { + return expr(std::exp(arg)); + } + + /// Exponential implementation. 
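+
+// Note on the fma() fallback above: the product of two half values is always
+// exactly representable in single precision (11-bit significands), so the
+// x * y + z fallback still performs only one rounding in float:
+//
+//     half_float::half x(1.5f), y(2.0f), z(0.25f);
+//     half_float::half r = half_float::fma(x, y, z);   // 3.25, exact
+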
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr expm1(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::expm1(arg)); +#else + return expr(static_cast(std::exp(static_cast(arg)) - 1.0)); +#endif + } + + /// Binary exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr exp2(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::exp2(arg)); +#else + return expr(static_cast(std::exp(arg * 0.69314718055994530941723212145818))); +#endif + } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log(float arg) + { + return expr(std::log(arg)); + } + + /// Common logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log10(float arg) + { + return expr(std::log10(arg)); + } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log1p(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::log1p(arg)); +#else + return expr(static_cast(std::log(1.0 + arg))); +#endif + } + + /// Binary logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log2(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::log2(arg)); +#else + return expr(static_cast(std::log(static_cast(arg)) * 1.4426950408889634073599246810019)); +#endif + } + + /// Square root implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sqrt(float arg) + { + return expr(std::sqrt(arg)); + } + + /// Cubic root implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cbrt(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::cbrt(arg)); +#else + if (builtin_isnan(arg) || builtin_isinf(arg)) + return expr(arg); + return expr(builtin_signbit(arg) ? -static_cast(std::pow(-static_cast(arg), 1.0 / 3.0)) + : static_cast(std::pow(static_cast(arg), 1.0 / 3.0))); +#endif + } + + /// Hypotenuse implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-preicision + static expr hypot(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::hypot(x, y)); +#else + return expr((builtin_isinf(x) || builtin_isinf(y)) + ? std::numeric_limits::infinity() + : static_cast(std::sqrt(static_cast(x) * x + static_cast(y) * y))); +#endif + } + + /// Power implementation. + /// \param base value to exponentiate + /// \param exp power to expontiate to + /// \return function value stored in single-preicision + static expr pow(float base, float exp) + { + return expr(std::pow(base, exp)); + } + + /// Sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sin(float arg) + { + return expr(std::sin(arg)); + } + + /// Cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cos(float arg) + { + return expr(std::cos(arg)); + } + + /// Tan implementation. 
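+
+// The exp2()/log2() fallbacks above rely on the identities 2^x = e^(x ln 2)
+// and log2(x) = ln(x) / ln 2; the hard-coded constants are ln 2 and 1/ln 2
+// to double precision. Quick check:
+//
+//     half_float::exp2(half_float::half(3.0f));   // 8
+//     half_float::log2(half_float::half(8.0f));   // 3
+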
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr tan(float arg) + { + return expr(std::tan(arg)); + } + + /// Arc sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr asin(float arg) + { + return expr(std::asin(arg)); + } + + /// Arc cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr acos(float arg) + { + return expr(std::acos(arg)); + } + + /// Arc tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr atan(float arg) + { + return expr(std::atan(arg)); + } + + /// Arc tangent implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-preicision + static expr atan2(float x, float y) + { + return expr(std::atan2(x, y)); + } + + /// Hyperbolic sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sinh(float arg) + { + return expr(std::sinh(arg)); + } + + /// Hyperbolic cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cosh(float arg) + { + return expr(std::cosh(arg)); + } + + /// Hyperbolic tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr tanh(float arg) + { + return expr(std::tanh(arg)); + } + + /// Hyperbolic area sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr asinh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::asinh(arg)); +#else + return expr((arg == -std::numeric_limits::infinity()) + ? arg + : static_cast(std::log(arg + std::sqrt(arg * arg + 1.0)))); +#endif + } + + /// Hyperbolic area cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr acosh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::acosh(arg)); +#else + return expr((arg < -1.0f) ? std::numeric_limits::quiet_NaN() + : static_cast(std::log(arg + std::sqrt(arg * arg - 1.0)))); +#endif + } + + /// Hyperbolic area tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr atanh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::atanh(arg)); +#else + return expr(static_cast(0.5 * std::log((1.0 + arg) / (1.0 - arg)))); +#endif + } + + /// Error function implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr erf(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::erf(arg)); +#else + return expr(static_cast(erf(static_cast(arg)))); +#endif + } + + /// Complementary implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr erfc(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::erfc(arg)); +#else + return expr(static_cast(1.0 - erf(static_cast(arg)))); +#endif + } + + /// Gamma logarithm implementation. 
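+
+// The inverse-hyperbolic fallbacks above use the closed forms
+// asinh(x) = ln(x + sqrt(x^2 + 1)), acosh(x) = ln(x + sqrt(x^2 - 1)) and
+// atanh(x) = 0.5 * ln((1 + x) / (1 - x)), evaluated through double-precision
+// std::log so the final half result loses no precision.
+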
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr lgamma(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::lgamma(arg)); +#else + if (builtin_isinf(arg)) + return expr(std::numeric_limits::infinity()); + if (arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if (f == 0.0f) + return expr(std::numeric_limits::infinity()); + return expr(static_cast(1.1447298858494001741434273513531 + - std::log(std::abs(std::sin(3.1415926535897932384626433832795 * f))) - lgamma(1.0 - arg))); + } + return expr(static_cast(lgamma(static_cast(arg)))); +#endif + } + + /// Gamma implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr tgamma(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::tgamma(arg)); +#else + if (arg == 0.0f) + return builtin_signbit(arg) ? expr(-std::numeric_limits::infinity()) + : expr(std::numeric_limits::infinity()); + if (arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if (f == 0.0f) + return expr(std::numeric_limits::quiet_NaN()); + double value = 3.1415926535897932384626433832795 + / (std::sin(3.1415926535897932384626433832795 * f) * std::exp(lgamma(1.0 - arg))); + return expr(static_cast((std::fmod(i, 2.0f) == 0.0f) ? -value : value)); + } + if (builtin_isinf(arg)) + return expr(arg); + return expr(static_cast(std::exp(lgamma(static_cast(arg))))); +#endif + } + + /// Floor implementation. + /// \param arg value to round + /// \return rounded value + static half floor(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Ceiling implementation. + /// \param arg value to round + /// \return rounded value + static half ceil(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Truncation implementation. + /// \param arg value to round + /// \return rounded value + static half trunc(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half round(half arg) + { + return half(binary, round_half_up(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lround(half arg) + { + return detail::half2int_up(arg.data_); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half rint(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lrint(half arg) + { + return detail::half2int(arg.data_); + } + +#if HALF_ENABLE_CPP11_LONG_LONG + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llround(half arg) + { + return detail::half2int_up(arg.data_); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llrint(half arg) + { + return detail::half2int(arg.data_); + } +#endif + + /// Decompression implementation. 
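+
+// The lgamma()/tgamma() fallbacks above handle negative arguments with the
+// reflection formula Gamma(x) * Gamma(1 - x) = pi / sin(pi * x); the constant
+// 1.1447298858494... is ln(pi). A double-precision sanity check:
+//
+//     double x = -2.5;
+//     double lhs = std::lgamma(x);
+//     double rhs = std::log(3.141592653589793)
+//                  - std::log(std::fabs(std::sin(3.141592653589793 * x)))
+//                  - std::lgamma(1.0 - x);
+//     // lhs and rhs agree up to rounding error
+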
+ /// \param arg number to decompress + /// \param exp address to store exponent at + /// \return normalized significant + static half frexp(half arg, int* exp) + { + int m = arg.data_ & 0x7FFF, e = -14; + if (m >= 0x7C00 || !m) + return *exp = 0, arg; + for (; m < 0x400; m <<= 1, --e) + ; + return *exp = e + (m >> 10), half(binary, (arg.data_ & 0x8000) | 0x3800 | (m & 0x3FF)); + } + + /// Decompression implementation. + /// \param arg number to decompress + /// \param iptr address to store integer part at + /// \return fractional part + static half modf(half arg, half* iptr) + { + uint32_t e = arg.data_ & 0x7FFF; + if (e >= 0x6400) + return *iptr = arg, half(binary, arg.data_ & (0x8000U | -(e > 0x7C00))); + if (e < 0x3C00) + return iptr->data_ = arg.data_ & 0x8000, arg; + e >>= 10; + uint32_t mask = (1 << (25 - e)) - 1, m = arg.data_ & mask; + iptr->data_ = arg.data_ & ~mask; + if (!m) + return half(binary, arg.data_ & 0x8000); + for (; m < 0x400; m <<= 1, --e) + ; + return half(binary, static_cast((arg.data_ & 0x8000) | (e << 10) | (m & 0x3FF))); + } + + /// Scaling implementation. + /// \param arg number to scale + /// \param exp power of two to scale by + /// \return scaled number + static half scalbln(half arg, long exp) + { + uint32_t m = arg.data_ & 0x7FFF; + if (m >= 0x7C00 || !m) + return arg; + for (; m < 0x400; m <<= 1, --exp) + ; + exp += m >> 10; + uint16 value = arg.data_ & 0x8000; + if (exp > 30) + { + if (half::round_style == std::round_toward_zero) + value |= 0x7BFF; + else if (half::round_style == std::round_toward_infinity) + value |= 0x7C00 - (value >> 15); + else if (half::round_style == std::round_toward_neg_infinity) + value |= 0x7BFF + (value >> 15); + else + value |= 0x7C00; + } + else if (exp > 0) + value |= (exp << 10) | (m & 0x3FF); + else if (exp > -11) + { + m = (m & 0x3FF) | 0x400; + if (half::round_style == std::round_to_nearest) + { + m += 1 << -exp; +#if HALF_ROUND_TIES_TO_EVEN + m -= (m >> (1 - exp)) & 1; +#endif + } + else if (half::round_style == std::round_toward_infinity) + m += ((value >> 15) - 1) & ((1 << (1 - exp)) - 1U); + else if (half::round_style == std::round_toward_neg_infinity) + m += -(value >> 15) & ((1 << (1 - exp)) - 1U); + value |= m >> (1 - exp); + } + else if (half::round_style == std::round_toward_infinity) + value -= (value >> 15) - 1; + else if (half::round_style == std::round_toward_neg_infinity) + value += value >> 15; + return half(binary, value); + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static int ilogb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if (!abs) + return FP_ILOGB0; + if (abs < 0x7C00) + { + int exp = (abs >> 10) - 15; + if (abs < 0x400) + for (; abs < 0x200; abs <<= 1, --exp) + ; + return exp; + } + if (abs > 0x7C00) + return FP_ILOGBNAN; + return INT_MAX; + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static half logb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if (!abs) + return half(binary, 0xFC00); + if (abs < 0x7C00) + { + int exp = (abs >> 10) - 15; + if (abs < 0x400) + for (; abs < 0x200; abs <<= 1, --exp) + ; + uint16 bits = (exp < 0) << 15; + if (exp) + { + uint32_t m = std::abs(exp) << 6, e = 18; + for (; m < 0x400; m <<= 1, --e) + ; + bits |= (e << 10) + m; + } + return half(binary, bits); + } + if (abs > 0x7C00) + return arg; + return half(binary, 0x7C00); + } + + /// Enumeration implementation. 
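+
+// The manipulation functions above operate directly on the IEEE 754 binary16
+// layout: 1 sign bit, 5 exponent bits (bias 15), 10 significand bits.
+// Decoding sketch:
+//
+//     uint16_t bits = 0x3C00;           // 1.0 in binary16
+//     int sign = bits >> 15;            // 0
+//     int bexp = (bits >> 10) & 0x1F;   // 15, i.e. unbiased exponent 0
+//     int mant = bits & 0x3FF;          // 0, i.e. significand 1.0
+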
+ /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nextafter(half from, half to) + { + uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; + if (fabs > 0x7C00) + return from; + if (tabs > 0x7C00 || from.data_ == to.data_ || !(fabs | tabs)) + return to; + if (!fabs) + return half(binary, (to.data_ & 0x8000) + 1); + bool lt = ((fabs == from.data_) ? static_cast(fabs) : -static_cast(fabs)) + < ((tabs == to.data_) ? static_cast(tabs) : -static_cast(tabs)); + return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lt)) << 1) - 1); + } + + /// Enumeration implementation. + /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nexttoward(half from, long double to) + { + if (isnan(from)) + return from; + long double lfrom = static_cast(from); + if (builtin_isnan(to) || lfrom == to) + return half(static_cast(to)); + if (!(from.data_ & 0x7FFF)) + return half(binary, (static_cast(builtin_signbit(to)) << 15) + 1); + return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lfrom < to)) << 1) - 1); + } + + /// Sign implementation + /// \param x first operand + /// \param y second operand + /// \return composed value + static half copysign(half x, half y) + { + return half(binary, x.data_ ^ ((x.data_ ^ y.data_) & 0x8000)); + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + static int fpclassify(half arg) + { + uint32_t abs = arg.data_ & 0x7FFF; + return abs + ? ((abs > 0x3FF) ? ((abs >= 0x7C00) ? ((abs > 0x7C00) ? FP_NAN : FP_INFINITE) : FP_NORMAL) : FP_SUBNORMAL) + : FP_ZERO; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if finite number + /// \retval false else + static bool isfinite(half arg) + { + return (arg.data_ & 0x7C00) != 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + static bool isinf(half arg) + { + return (arg.data_ & 0x7FFF) == 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if not a number + /// \retval false else + static bool isnan(half arg) + { + return (arg.data_ & 0x7FFF) > 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if normal number + /// \retval false else + static bool isnormal(half arg) + { + return ((arg.data_ & 0x7C00) != 0) & ((arg.data_ & 0x7C00) != 0x7C00); + } + + /// Sign bit implementation. + /// \param arg value to check + /// \retval true if signed + /// \retval false if unsigned + static bool signbit(half arg) + { + return (arg.data_ & 0x8000) != 0; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands equal + /// \retval false else + static bool isequal(half x, half y) + { + return (x.data_ == y.data_ || !((x.data_ | y.data_) & 0x7FFF)) && !isnan(x); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands not equal + /// \retval false else + static bool isnotequal(half x, half y) + { + return (x.data_ != y.data_ && ((x.data_ | y.data_) & 0x7FFF)) || isnan(x); + } + + /// Comparison implementation. 
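+
+// The ordered comparisons below map the sign-magnitude half encoding onto a
+// signed integer ordering: non-negative values compare by their raw bits,
+// negative values by the negated magnitude (NaNs are rejected first).
+// Standalone sketch of the key function:
+//
+//     int key(uint16_t h)
+//     {
+//         int m = h & 0x7FFF;
+//         return (h & 0x8000) ? -m : m;
+//     }
+//     // for non-NaN x, y:  x < y  iff  key(x) < key(y)
+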
+ /// \param x first operand + /// \param y second operand + /// \retval true if \a x > \a y + /// \retval false else + static bool isgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x >= \a y + /// \retval false else + static bool isgreaterequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) >= ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x < \a y + /// \retval false else + static bool isless(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x <= \a y + /// \retval false else + static bool islessequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) <= ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if either \a x > \a y nor \a x < \a y + /// \retval false else + static bool islessgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00 || yabs > 0x7C00) + return false; + int a = (xabs == x.data_) ? xabs : -xabs, b = (yabs == y.data_) ? yabs : -yabs; + return a < b || a > b; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operand unordered + /// \retval false else + static bool isunordered(half x, half y) + { + return isnan(x) || isnan(y); + } + +private: + static double erf(double arg) + { + if (builtin_isinf(arg)) + return (arg < 0.0) ? -1.0 : 1.0; + double x2 = arg * arg, ax2 = 0.147 * x2, + value = std::sqrt(1.0 - std::exp(-x2 * (1.2732395447351626861510701069801 + ax2) / (1.0 + ax2))); + return builtin_signbit(arg) ? -value : value; + } + + static double lgamma(double arg) + { + double v = 1.0; + for (; arg < 8.0; ++arg) + v *= arg; + double w = 1.0 / (arg * arg); + return (((((((-0.02955065359477124183006535947712 * w + 0.00641025641025641025641025641026) * w + + -0.00191752691752691752691752691753) + * w + + 8.4175084175084175084175084175084e-4) + * w + + -5.952380952380952380952380952381e-4) + * w + + 7.9365079365079365079365079365079e-4) + * w + + -0.00277777777777777777777777777778) + * w + + 0.08333333333333333333333333333333) + / arg + + 0.91893853320467274178032973640562 - std::log(v) - arg + (arg - 0.5) * std::log(arg); + } +}; + +/// Wrapper for unary half-precision functions needing specialization for individual argument types. +/// \tparam T argument type +template +struct unary_specialized +{ + /// Negation implementation. + /// \param arg value to negate + /// \return negated value + static HALF_CONSTEXPR half negate(half arg) + { + return half(binary, arg.data_ ^ 0x8000); + } + + /// Absolute value implementation. 
+ /// \param arg function argument + /// \return absolute value + static half fabs(half arg) + { + return half(binary, arg.data_ & 0x7FFF); + } +}; +template <> +struct unary_specialized +{ + static HALF_CONSTEXPR expr negate(float arg) + { + return expr(-arg); + } + static expr fabs(float arg) + { + return expr(std::fabs(arg)); + } +}; + +/// Wrapper for binary half-precision functions needing specialization for individual argument types. +/// \tparam T first argument type +/// \tparam U first argument type +template +struct binary_specialized +{ + /// Minimum implementation. + /// \param x first operand + /// \param y second operand + /// \return minimum value + static expr fmin(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fmin(x, y)); +#else + if (builtin_isnan(x)) + return expr(y); + if (builtin_isnan(y)) + return expr(x); + return expr(std::min(x, y)); +#endif + } + + /// Maximum implementation. + /// \param x first operand + /// \param y second operand + /// \return maximum value + static expr fmax(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fmax(x, y)); +#else + if (builtin_isnan(x)) + return expr(y); + if (builtin_isnan(y)) + return expr(x); + return expr(std::max(x, y)); +#endif + } +}; +template <> +struct binary_specialized +{ + static half fmin(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00) + return y; + if (yabs > 0x7C00) + return x; + return (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)) ? y : x; + } + static half fmax(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00) + return y; + if (yabs > 0x7C00) + return x; + return (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)) ? y : x; + } +}; + +/// Helper class for half casts. +/// This class template has to be specialized for all valid cast argument to define an appropriate static `cast` member +/// function and a corresponding `type` member denoting its return type. 
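+
+// fmin()/fmax() above follow the IEEE convention that a single NaN operand
+// is ignored and the other operand is returned:
+//
+//     half_float::half a(1.0f);
+//     half_float::half n = half_float::nanh("");
+//     half_float::fmin(a, n);   // 1.0, the non-NaN operand
+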
+/// \tparam T destination type +/// \tparam U source type +/// \tparam R rounding mode to use +template +struct half_caster +{ +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported"); +#endif + + static half cast(U arg) + { + return cast_impl(arg, is_float()); + }; + +private: + static half cast_impl(U arg, true_type) + { + return half(binary, float2half(arg)); + } + static half cast_impl(U arg, false_type) + { + return half(binary, int2half(arg)); + } +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); +#endif + + static T cast(half arg) + { + return cast_impl(arg, is_float()); + } + +private: + static T cast_impl(half arg, true_type) + { + return half2float(arg.data_); + } + static T cast_impl(half arg, false_type) + { + return half2int(arg.data_); + } +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); +#endif + + static T cast(expr arg) + { + return cast_impl(arg, is_float()); + } + +private: + static T cast_impl(float arg, true_type) + { + return static_cast(arg); + } + static T cast_impl(half arg, false_type) + { + return half2int(arg.data_); + } +}; +template +struct half_caster +{ + static half cast(half arg) + { + return arg; + } +}; +template +struct half_caster : half_caster +{ +}; + +/// \name Comparison operators +/// \{ + +/// Comparison for equality. +/// \param x first operand +/// \param y second operand +/// \retval true if operands equal +/// \retval false else +template +typename enable::type operator==(T x, U y) +{ + return functions::isequal(x, y); +} + +/// Comparison for inequality. +/// \param x first operand +/// \param y second operand +/// \retval true if operands not equal +/// \retval false else +template +typename enable::type operator!=(T x, U y) +{ + return functions::isnotequal(x, y); +} + +/// Comparison for less than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less than \a y +/// \retval false else +template +typename enable::type operator<(T x, U y) +{ + return functions::isless(x, y); +} + +/// Comparison for greater than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater than \a y +/// \retval false else +template +typename enable::type operator>(T x, U y) +{ + return functions::isgreater(x, y); +} + +/// Comparison for less equal. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less equal \a y +/// \retval false else +template +typename enable::type operator<=(T x, U y) +{ + return functions::islessequal(x, y); +} + +/// Comparison for greater equal. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater equal \a y +/// \retval false else +template +typename enable::type operator>=(T x, U y) +{ + return functions::isgreaterequal(x, y); +} + +/// \} +/// \name Arithmetic operators +/// \{ + +/// Add halfs. +/// \param x left operand +/// \param y right operand +/// \return sum of half expressions +template +typename enable::type operator+(T x, U y) +{ + return functions::plus(x, y); +} + +/// Subtract halfs. 
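+
+// Because the arithmetic operators above return a single-precision expr
+// proxy, a chained expression is rounded to half only once, on the final
+// assignment:
+//
+//     half_float::half a(0.1f), b(0.2f), c(0.3f);
+//     half_float::half r = a + b - c;   // one float-to-half rounding, not three
+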
+/// \param x left operand +/// \param y right operand +/// \return difference of half expressions +template +typename enable::type operator-(T x, U y) +{ + return functions::minus(x, y); +} + +/// Multiply halfs. +/// \param x left operand +/// \param y right operand +/// \return product of half expressions +template +typename enable::type operator*(T x, U y) +{ + return functions::multiplies(x, y); +} + +/// Divide halfs. +/// \param x left operand +/// \param y right operand +/// \return quotient of half expressions +template +typename enable::type operator/(T x, U y) +{ + return functions::divides(x, y); +} + +/// Identity. +/// \param arg operand +/// \return uncahnged operand +template +HALF_CONSTEXPR typename enable::type operator+(T arg) +{ + return arg; +} + +/// Negation. +/// \param arg operand +/// \return negated operand +template +HALF_CONSTEXPR typename enable::type operator-(T arg) +{ + return unary_specialized::negate(arg); +} + +/// \} +/// \name Input and output +/// \{ + +/// Output operator. +/// \param out output stream to write into +/// \param arg half expression to write +/// \return reference to output stream +template +typename enable&, T>::type operator<<(std::basic_ostream& out, T arg) +{ + return functions::write(out, arg); +} + +/// Input operator. +/// \param in input stream to read from +/// \param arg half to read into +/// \return reference to input stream +template +std::basic_istream& operator>>(std::basic_istream& in, half& arg) +{ + return functions::read(in, arg); +} + +/// \} +/// \name Basic mathematical operations +/// \{ + +/// Absolute value. +/// \param arg operand +/// \return absolute value of \a arg +// template typename enable::type abs(T arg) { return unary_specialized::fabs(arg); } +inline half abs(half arg) +{ + return unary_specialized::fabs(arg); +} +inline expr abs(expr arg) +{ + return unary_specialized::fabs(arg); +} + +/// Absolute value. +/// \param arg operand +/// \return absolute value of \a arg +// template typename enable::type fabs(T arg) { return unary_specialized::fabs(arg); } +inline half fabs(half arg) +{ + return unary_specialized::fabs(arg); +} +inline expr fabs(expr arg) +{ + return unary_specialized::fabs(arg); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \return remainder of floating point division. +// template typename enable::type fmod(T x, U y) { return functions::fmod(x, y); } +inline expr fmod(half x, half y) +{ + return functions::fmod(x, y); +} +inline expr fmod(half x, expr y) +{ + return functions::fmod(x, y); +} +inline expr fmod(expr x, half y) +{ + return functions::fmod(x, y); +} +inline expr fmod(expr x, expr y) +{ + return functions::fmod(x, y); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \return remainder of floating point division. +// template typename enable::type remainder(T x, U y) { return +// functions::remainder(x, y); } +inline expr remainder(half x, half y) +{ + return functions::remainder(x, y); +} +inline expr remainder(half x, expr y) +{ + return functions::remainder(x, y); +} +inline expr remainder(expr x, half y) +{ + return functions::remainder(x, y); +} +inline expr remainder(expr x, expr y) +{ + return functions::remainder(x, y); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \param quo address to store some bits of quotient at +/// \return remainder of floating point division. 
+// template typename enable::type remquo(T x, U y, int *quo) { return +// functions::remquo(x, y, quo); } +inline expr remquo(half x, half y, int* quo) +{ + return functions::remquo(x, y, quo); +} +inline expr remquo(half x, expr y, int* quo) +{ + return functions::remquo(x, y, quo); +} +inline expr remquo(expr x, half y, int* quo) +{ + return functions::remquo(x, y, quo); +} +inline expr remquo(expr x, expr y, int* quo) +{ + return functions::remquo(x, y, quo); +} + +/// Fused multiply add. +/// \param x first operand +/// \param y second operand +/// \param z third operand +/// \return ( \a x * \a y ) + \a z rounded as one operation. +// template typename enable::type fma(T x, U y, V z) { return +// functions::fma(x, y, z); } +inline expr fma(half x, half y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, half y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, expr y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, expr y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, half y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, half y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, expr y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, expr y, expr z) +{ + return functions::fma(x, y, z); +} + +/// Maximum of half expressions. +/// \param x first operand +/// \param y second operand +/// \return maximum of operands +// template typename result::type fmax(T x, U y) { return +// binary_specialized::fmax(x, y); } +inline half fmax(half x, half y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(half x, expr y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(expr x, half y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(expr x, expr y) +{ + return binary_specialized::fmax(x, y); +} + +/// Minimum of half expressions. +/// \param x first operand +/// \param y second operand +/// \return minimum of operands +// template typename result::type fmin(T x, U y) { return +// binary_specialized::fmin(x, y); } +inline half fmin(half x, half y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(half x, expr y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(expr x, half y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(expr x, expr y) +{ + return binary_specialized::fmin(x, y); +} + +/// Positive difference. +/// \param x first operand +/// \param y second operand +/// \return \a x - \a y or 0 if difference negative +// template typename enable::type fdim(T x, U y) { return functions::fdim(x, y); } +inline expr fdim(half x, half y) +{ + return functions::fdim(x, y); +} +inline expr fdim(half x, expr y) +{ + return functions::fdim(x, y); +} +inline expr fdim(expr x, half y) +{ + return functions::fdim(x, y); +} +inline expr fdim(expr x, expr y) +{ + return functions::fdim(x, y); +} + +/// Get NaN value. +/// \return quiet NaN +inline half nanh(const char*) +{ + return functions::nanh(); +} + +/// \} +/// \name Exponential functions +/// \{ + +/// Exponential function. +/// \param arg function argument +/// \return e raised to \a arg +// template typename enable::type exp(T arg) { return functions::exp(arg); } +inline expr exp(half arg) +{ + return functions::exp(arg); +} +inline expr exp(expr arg) +{ + return functions::exp(arg); +} + +/// Exponential minus one. 
+/// \param arg function argument +/// \return e raised to \a arg subtracted by 1 +// template typename enable::type expm1(T arg) { return functions::expm1(arg); } +inline expr expm1(half arg) +{ + return functions::expm1(arg); +} +inline expr expm1(expr arg) +{ + return functions::expm1(arg); +} + +/// Binary exponential. +/// \param arg function argument +/// \return 2 raised to \a arg +// template typename enable::type exp2(T arg) { return functions::exp2(arg); } +inline expr exp2(half arg) +{ + return functions::exp2(arg); +} +inline expr exp2(expr arg) +{ + return functions::exp2(arg); +} + +/// Natural logorithm. +/// \param arg function argument +/// \return logarithm of \a arg to base e +// template typename enable::type log(T arg) { return functions::log(arg); } +inline expr log(half arg) +{ + return functions::log(arg); +} +inline expr log(expr arg) +{ + return functions::log(arg); +} + +/// Common logorithm. +/// \param arg function argument +/// \return logarithm of \a arg to base 10 +// template typename enable::type log10(T arg) { return functions::log10(arg); } +inline expr log10(half arg) +{ + return functions::log10(arg); +} +inline expr log10(expr arg) +{ + return functions::log10(arg); +} + +/// Natural logorithm. +/// \param arg function argument +/// \return logarithm of \a arg plus 1 to base e +// template typename enable::type log1p(T arg) { return functions::log1p(arg); } +inline expr log1p(half arg) +{ + return functions::log1p(arg); +} +inline expr log1p(expr arg) +{ + return functions::log1p(arg); +} + +/// Binary logorithm. +/// \param arg function argument +/// \return logarithm of \a arg to base 2 +// template typename enable::type log2(T arg) { return functions::log2(arg); } +inline expr log2(half arg) +{ + return functions::log2(arg); +} +inline expr log2(expr arg) +{ + return functions::log2(arg); +} + +/// \} +/// \name Power functions +/// \{ + +/// Square root. +/// \param arg function argument +/// \return square root of \a arg +// template typename enable::type sqrt(T arg) { return functions::sqrt(arg); } +inline expr sqrt(half arg) +{ + return functions::sqrt(arg); +} +inline expr sqrt(expr arg) +{ + return functions::sqrt(arg); +} + +/// Cubic root. +/// \param arg function argument +/// \return cubic root of \a arg +// template typename enable::type cbrt(T arg) { return functions::cbrt(arg); } +inline expr cbrt(half arg) +{ + return functions::cbrt(arg); +} +inline expr cbrt(expr arg) +{ + return functions::cbrt(arg); +} + +/// Hypotenuse function. +/// \param x first argument +/// \param y second argument +/// \return square root of sum of squares without internal over- or underflows +// template typename enable::type hypot(T x, U y) { return functions::hypot(x, y); +//} +inline expr hypot(half x, half y) +{ + return functions::hypot(x, y); +} +inline expr hypot(half x, expr y) +{ + return functions::hypot(x, y); +} +inline expr hypot(expr x, half y) +{ + return functions::hypot(x, y); +} +inline expr hypot(expr x, expr y) +{ + return functions::hypot(x, y); +} + +/// Power function. 
+/// \param base first argument +/// \param exp second argument +/// \return \a base raised to \a exp +// template typename enable::type pow(T base, U exp) { return functions::pow(base, +// exp); } +inline expr pow(half base, half exp) +{ + return functions::pow(base, exp); +} +inline expr pow(half base, expr exp) +{ + return functions::pow(base, exp); +} +inline expr pow(expr base, half exp) +{ + return functions::pow(base, exp); +} +inline expr pow(expr base, expr exp) +{ + return functions::pow(base, exp); +} + +/// \} +/// \name Trigonometric functions +/// \{ + +/// Sine function. +/// \param arg function argument +/// \return sine value of \a arg +// template typename enable::type sin(T arg) { return functions::sin(arg); } +inline expr sin(half arg) +{ + return functions::sin(arg); +} +inline expr sin(expr arg) +{ + return functions::sin(arg); +} + +/// Cosine function. +/// \param arg function argument +/// \return cosine value of \a arg +// template typename enable::type cos(T arg) { return functions::cos(arg); } +inline expr cos(half arg) +{ + return functions::cos(arg); +} +inline expr cos(expr arg) +{ + return functions::cos(arg); +} + +/// Tangent function. +/// \param arg function argument +/// \return tangent value of \a arg +// template typename enable::type tan(T arg) { return functions::tan(arg); } +inline expr tan(half arg) +{ + return functions::tan(arg); +} +inline expr tan(expr arg) +{ + return functions::tan(arg); +} + +/// Arc sine. +/// \param arg function argument +/// \return arc sine value of \a arg +// template typename enable::type asin(T arg) { return functions::asin(arg); } +inline expr asin(half arg) +{ + return functions::asin(arg); +} +inline expr asin(expr arg) +{ + return functions::asin(arg); +} + +/// Arc cosine function. +/// \param arg function argument +/// \return arc cosine value of \a arg +// template typename enable::type acos(T arg) { return functions::acos(arg); } +inline expr acos(half arg) +{ + return functions::acos(arg); +} +inline expr acos(expr arg) +{ + return functions::acos(arg); +} + +/// Arc tangent function. +/// \param arg function argument +/// \return arc tangent value of \a arg +// template typename enable::type atan(T arg) { return functions::atan(arg); } +inline expr atan(half arg) +{ + return functions::atan(arg); +} +inline expr atan(expr arg) +{ + return functions::atan(arg); +} + +/// Arc tangent function. +/// \param x first argument +/// \param y second argument +/// \return arc tangent value +// template typename enable::type atan2(T x, U y) { return functions::atan2(x, y); +//} +inline expr atan2(half x, half y) +{ + return functions::atan2(x, y); +} +inline expr atan2(half x, expr y) +{ + return functions::atan2(x, y); +} +inline expr atan2(expr x, half y) +{ + return functions::atan2(x, y); +} +inline expr atan2(expr x, expr y) +{ + return functions::atan2(x, y); +} + +/// \} +/// \name Hyperbolic functions +/// \{ + +/// Hyperbolic sine. +/// \param arg function argument +/// \return hyperbolic sine value of \a arg +// template typename enable::type sinh(T arg) { return functions::sinh(arg); } +inline expr sinh(half arg) +{ + return functions::sinh(arg); +} +inline expr sinh(expr arg) +{ + return functions::sinh(arg); +} + +/// Hyperbolic cosine. 
+/// \param arg function argument +/// \return hyperbolic cosine value of \a arg +// template typename enable::type cosh(T arg) { return functions::cosh(arg); } +inline expr cosh(half arg) +{ + return functions::cosh(arg); +} +inline expr cosh(expr arg) +{ + return functions::cosh(arg); +} + +/// Hyperbolic tangent. +/// \param arg function argument +/// \return hyperbolic tangent value of \a arg +// template typename enable::type tanh(T arg) { return functions::tanh(arg); } +inline expr tanh(half arg) +{ + return functions::tanh(arg); +} +inline expr tanh(expr arg) +{ + return functions::tanh(arg); +} + +/// Hyperbolic area sine. +/// \param arg function argument +/// \return area sine value of \a arg +// template typename enable::type asinh(T arg) { return functions::asinh(arg); } +inline expr asinh(half arg) +{ + return functions::asinh(arg); +} +inline expr asinh(expr arg) +{ + return functions::asinh(arg); +} + +/// Hyperbolic area cosine. +/// \param arg function argument +/// \return area cosine value of \a arg +// template typename enable::type acosh(T arg) { return functions::acosh(arg); } +inline expr acosh(half arg) +{ + return functions::acosh(arg); +} +inline expr acosh(expr arg) +{ + return functions::acosh(arg); +} + +/// Hyperbolic area tangent. +/// \param arg function argument +/// \return area tangent value of \a arg +// template typename enable::type atanh(T arg) { return functions::atanh(arg); } +inline expr atanh(half arg) +{ + return functions::atanh(arg); +} +inline expr atanh(expr arg) +{ + return functions::atanh(arg); +} + +/// \} +/// \name Error and gamma functions +/// \{ + +/// Error function. +/// \param arg function argument +/// \return error function value of \a arg +// template typename enable::type erf(T arg) { return functions::erf(arg); } +inline expr erf(half arg) +{ + return functions::erf(arg); +} +inline expr erf(expr arg) +{ + return functions::erf(arg); +} + +/// Complementary error function. +/// \param arg function argument +/// \return 1 minus error function value of \a arg +// template typename enable::type erfc(T arg) { return functions::erfc(arg); } +inline expr erfc(half arg) +{ + return functions::erfc(arg); +} +inline expr erfc(expr arg) +{ + return functions::erfc(arg); +} + +/// Natural logarithm of gamma function. +/// \param arg function argument +/// \return natural logarith of gamma function for \a arg +// template typename enable::type lgamma(T arg) { return functions::lgamma(arg); } +inline expr lgamma(half arg) +{ + return functions::lgamma(arg); +} +inline expr lgamma(expr arg) +{ + return functions::lgamma(arg); +} + +/// Gamma function. +/// \param arg function argument +/// \return gamma function value of \a arg +// template typename enable::type tgamma(T arg) { return functions::tgamma(arg); } +inline expr tgamma(half arg) +{ + return functions::tgamma(arg); +} +inline expr tgamma(expr arg) +{ + return functions::tgamma(arg); +} + +/// \} +/// \name Rounding +/// \{ + +/// Nearest integer not less than half value. +/// \param arg half to round +/// \return nearest integer not less than \a arg +// template typename enable::type ceil(T arg) { return functions::ceil(arg); } +inline half ceil(half arg) +{ + return functions::ceil(arg); +} +inline half ceil(expr arg) +{ + return functions::ceil(arg); +} + +/// Nearest integer not greater than half value. 
+/// \param arg half to round +/// \return nearest integer not greater than \a arg +// template typename enable::type floor(T arg) { return functions::floor(arg); } +inline half floor(half arg) +{ + return functions::floor(arg); +} +inline half floor(expr arg) +{ + return functions::floor(arg); +} + +/// Nearest integer not greater in magnitude than half value. +/// \param arg half to round +/// \return nearest integer not greater in magnitude than \a arg +// template typename enable::type trunc(T arg) { return functions::trunc(arg); } +inline half trunc(half arg) +{ + return functions::trunc(arg); +} +inline half trunc(expr arg) +{ + return functions::trunc(arg); +} + +/// Nearest integer. +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type round(T arg) { return functions::round(arg); } +inline half round(half arg) +{ + return functions::round(arg); +} +inline half round(expr arg) +{ + return functions::round(arg); +} + +/// Nearest integer. +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type lround(T arg) { return functions::lround(arg); } +inline long lround(half arg) +{ + return functions::lround(arg); +} +inline long lround(expr arg) +{ + return functions::lround(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type nearbyint(T arg) { return functions::nearbyint(arg); } +inline half nearbyint(half arg) +{ + return functions::rint(arg); +} +inline half nearbyint(expr arg) +{ + return functions::rint(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type rint(T arg) { return functions::rint(arg); } +inline half rint(half arg) +{ + return functions::rint(arg); +} +inline half rint(expr arg) +{ + return functions::rint(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type lrint(T arg) { return functions::lrint(arg); } +inline long lrint(half arg) +{ + return functions::lrint(arg); +} +inline long lrint(expr arg) +{ + return functions::lrint(arg); +} +#if HALF_ENABLE_CPP11_LONG_LONG +/// Nearest integer. +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type llround(T arg) { return functions::llround(arg); } +inline long long llround(half arg) +{ + return functions::llround(arg); +} +inline long long llround(expr arg) +{ + return functions::llround(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type llrint(T arg) { return functions::llrint(arg); } +inline long long llrint(half arg) +{ + return functions::llrint(arg); +} +inline long long llrint(expr arg) +{ + return functions::llrint(arg); +} +#endif + +/// \} +/// \name Floating point manipulation +/// \{ + +/// Decompress floating point number. 
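+
+// Of the rounding wrappers above, round()/lround() always tie away from
+// zero, while rint()/nearbyint()/lrint() honor the rounding mode selected
+// via HALF_ROUND_STYLE:
+//
+//     half_float::half x(2.5f);
+//     half_float::round(x);   // 3.0 (ties away from zero)
+//     half_float::trunc(x);   // 2.0
+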
+/// \param arg number to decompress +/// \param exp address to store exponent at +/// \return significant in range [0.5, 1) +// template typename enable::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); } +inline half frexp(half arg, int* exp) +{ + return functions::frexp(arg, exp); +} +inline half frexp(expr arg, int* exp) +{ + return functions::frexp(arg, exp); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); +//} +inline half ldexp(half arg, int exp) +{ + return functions::scalbln(arg, exp); +} +inline half ldexp(expr arg, int exp) +{ + return functions::scalbln(arg, exp); +} + +/// Extract integer and fractional parts. +/// \param arg number to decompress +/// \param iptr address to store integer part at +/// \return fractional part +// template typename enable::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); +//} +inline half modf(half arg, half* iptr) +{ + return functions::modf(arg, iptr); +} +inline half modf(expr arg, half* iptr) +{ + return functions::modf(arg, iptr); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); +//} +inline half scalbn(half arg, int exp) +{ + return functions::scalbln(arg, exp); +} +inline half scalbn(expr arg, int exp) +{ + return functions::scalbln(arg, exp); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbln(T arg, long exp) { return functions::scalbln(arg, +// exp); +//} +inline half scalbln(half arg, long exp) +{ + return functions::scalbln(arg, exp); +} +inline half scalbln(expr arg, long exp) +{ + return functions::scalbln(arg, exp); +} + +/// Extract exponent. +/// \param arg number to query +/// \return floating point exponent +/// \retval FP_ILOGB0 for zero +/// \retval FP_ILOGBNAN for NaN +/// \retval MAX_INT for infinity +// template typename enable::type ilogb(T arg) { return functions::ilogb(arg); } +inline int ilogb(half arg) +{ + return functions::ilogb(arg); +} +inline int ilogb(expr arg) +{ + return functions::ilogb(arg); +} + +/// Extract exponent. +/// \param arg number to query +/// \return floating point exponent +// template typename enable::type logb(T arg) { return functions::logb(arg); } +inline half logb(half arg) +{ + return functions::logb(arg); +} +inline half logb(expr arg) +{ + return functions::logb(arg); +} + +/// Next representable value. 
+/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +// template typename enable::type nextafter(T from, U to) { return +// functions::nextafter(from, to); } +inline half nextafter(half from, half to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(half from, expr to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(expr from, half to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(expr from, expr to) +{ + return functions::nextafter(from, to); +} + +/// Next representable value. +/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +// template typename enable::type nexttoward(T from, long double to) { return +// functions::nexttoward(from, to); } +inline half nexttoward(half from, long double to) +{ + return functions::nexttoward(from, to); +} +inline half nexttoward(expr from, long double to) +{ + return functions::nexttoward(from, to); +} + +/// Take sign. +/// \param x value to change sign for +/// \param y value to take sign from +/// \return value equal to \a x in magnitude and to \a y in sign +// template typename enable::type copysign(T x, U y) { return +// functions::copysign(x, y); } +inline half copysign(half x, half y) +{ + return functions::copysign(x, y); +} +inline half copysign(half x, expr y) +{ + return functions::copysign(x, y); +} +inline half copysign(expr x, half y) +{ + return functions::copysign(x, y); +} +inline half copysign(expr x, expr y) +{ + return functions::copysign(x, y); +} + +/// \} +/// \name Floating point classification +/// \{ + +/// Classify floating point value. +/// \param arg number to classify +/// \retval FP_ZERO for positive and negative zero +/// \retval FP_SUBNORMAL for subnormal numbers +/// \retval FP_INFINITY for positive and negative infinity +/// \retval FP_NAN for NaNs +/// \retval FP_NORMAL for all other (normal) values +// template typename enable::type fpclassify(T arg) { return functions::fpclassify(arg); } +inline int fpclassify(half arg) +{ + return functions::fpclassify(arg); +} +inline int fpclassify(expr arg) +{ + return functions::fpclassify(arg); +} + +/// Check if finite number. +/// \param arg number to check +/// \retval true if neither infinity nor NaN +/// \retval false else +// template typename enable::type isfinite(T arg) { return functions::isfinite(arg); } +inline bool isfinite(half arg) +{ + return functions::isfinite(arg); +} +inline bool isfinite(expr arg) +{ + return functions::isfinite(arg); +} + +/// Check for infinity. +/// \param arg number to check +/// \retval true for positive or negative infinity +/// \retval false else +// template typename enable::type isinf(T arg) { return functions::isinf(arg); } +inline bool isinf(half arg) +{ + return functions::isinf(arg); +} +inline bool isinf(expr arg) +{ + return functions::isinf(arg); +} + +/// Check for NaN. +/// \param arg number to check +/// \retval true for NaNs +/// \retval false else +// template typename enable::type isnan(T arg) { return functions::isnan(arg); } +inline bool isnan(half arg) +{ + return functions::isnan(arg); +} +inline bool isnan(expr arg) +{ + return functions::isnan(arg); +} + +/// Check if normal number. 
+/// \param arg number to check
+/// \retval true if normal number
+/// \retval false if either subnormal, zero, infinity or NaN
+// template<typename T> typename enable<bool, T>::type isnormal(T arg) { return functions::isnormal(arg); }
+inline bool isnormal(half arg)
+{
+    return functions::isnormal(arg);
+}
+inline bool isnormal(expr arg)
+{
+    return functions::isnormal(arg);
+}
+
+/// Check sign.
+/// \param arg number to check
+/// \retval true for negative number
+/// \retval false for positive number
+// template<typename T> typename enable<bool, T>::type signbit(T arg) { return functions::signbit(arg); }
+inline bool signbit(half arg)
+{
+    return functions::signbit(arg);
+}
+inline bool signbit(expr arg)
+{
+    return functions::signbit(arg);
+}
+
+/// \}
+/// \name Comparison
+/// \{
+
+/// Comparison for greater than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater than \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isgreater(T x, U y) { return
+// functions::isgreater(x, y); }
+inline bool isgreater(half x, half y)
+{
+    return functions::isgreater(x, y);
+}
+inline bool isgreater(half x, expr y)
+{
+    return functions::isgreater(x, y);
+}
+inline bool isgreater(expr x, half y)
+{
+    return functions::isgreater(x, y);
+}
+inline bool isgreater(expr x, expr y)
+{
+    return functions::isgreater(x, y);
+}
+
+/// Comparison for greater equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater equal \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isgreaterequal(T x, U y) { return
+// functions::isgreaterequal(x, y); }
+inline bool isgreaterequal(half x, half y)
+{
+    return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(half x, expr y)
+{
+    return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(expr x, half y)
+{
+    return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(expr x, expr y)
+{
+    return functions::isgreaterequal(x, y);
+}
+
+/// Comparison for less than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less than \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isless(T x, U y) { return functions::isless(x,
+// y);
+//}
+inline bool isless(half x, half y)
+{
+    return functions::isless(x, y);
+}
+inline bool isless(half x, expr y)
+{
+    return functions::isless(x, y);
+}
+inline bool isless(expr x, half y)
+{
+    return functions::isless(x, y);
+}
+inline bool isless(expr x, expr y)
+{
+    return functions::isless(x, y);
+}
+
+/// Comparison for less equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less equal \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type islessequal(T x, U y) { return
+// functions::islessequal(x, y); }
+inline bool islessequal(half x, half y)
+{
+    return functions::islessequal(x, y);
+}
+inline bool islessequal(half x, expr y)
+{
+    return functions::islessequal(x, y);
+}
+inline bool islessequal(expr x, half y)
+{
+    return functions::islessequal(x, y);
+}
+inline bool islessequal(expr x, expr y)
+{
+    return functions::islessequal(x, y);
+}
+
+/// Comparison for less or greater.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if either less or greater
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type islessgreater(T x, U y) { return
+// functions::islessgreater(x, y); }
+inline bool islessgreater(half x, half y)
+{
+    return functions::islessgreater(x, y);
+}
+inline bool islessgreater(half x, expr y)
+{
+    return functions::islessgreater(x, y);
+}
+inline bool islessgreater(expr x, half y)
+{
+    return functions::islessgreater(x, y);
+}
+inline bool islessgreater(expr x, expr y)
+{
+    return functions::islessgreater(x, y);
+}
+
+/// Check if unordered.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if unordered (one or two NaN operands)
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isunordered(T x, U y) { return
+// functions::isunordered(x, y); }
+inline bool isunordered(half x, half y)
+{
+    return functions::isunordered(x, y);
+}
+inline bool isunordered(half x, expr y)
+{
+    return functions::isunordered(x, y);
+}
+inline bool isunordered(expr x, half y)
+{
+    return functions::isunordered(x, y);
+}
+inline bool isunordered(expr x, expr y)
+{
+    return functions::isunordered(x, y);
+}
+
+/// \}
+/// \name Casting
+/// \{
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
+/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+/// It uses the default rounding mode.
+///
+/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
+/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
+/// error and casting between [half](\ref half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, typename U>
+T half_cast(U arg)
+{
+    return half_caster<T, U>::cast(arg);
+}
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
+/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+///
+/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
+/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
+/// error and casting between [half](\ref half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam R rounding mode to use.
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, std::float_round_style R, typename U>
+T half_cast(U arg)
+{
+    return half_caster<T, U, R>::cast(arg);
+}
+/// \}
+} // namespace detail
+
+using detail::operator==;
+using detail::operator!=;
+using detail::operator<;
+using detail::operator>;
+using detail::operator<=;
+using detail::operator>=;
+using detail::operator+;
+using detail::operator-;
+using detail::operator*;
+using detail::operator/;
+using detail::operator<<;
+using detail::operator>>;
+
+using detail::abs;
+using detail::acos;
+using detail::acosh;
+using detail::asin;
+using detail::asinh;
+using detail::atan;
+using detail::atan2;
+using detail::atanh;
+using detail::cbrt;
+using detail::ceil;
+using detail::cos;
+using detail::cosh;
+using detail::erf;
+using detail::erfc;
+using detail::exp;
+using detail::exp2;
+using detail::expm1;
+using detail::fabs;
+using detail::fdim;
+using detail::floor;
+using detail::fma;
+using detail::fmax;
+using detail::fmin;
+using detail::fmod;
+using detail::hypot;
+using detail::lgamma;
+using detail::log;
+using detail::log10;
+using detail::log1p;
+using detail::log2;
+using detail::lrint;
+using detail::lround;
+using detail::nanh;
+using detail::nearbyint;
+using detail::pow;
+using detail::remainder;
+using detail::remquo;
+using detail::rint;
+using detail::round;
+using detail::sin;
+using detail::sinh;
+using detail::sqrt;
+using detail::tan;
+using detail::tanh;
+using detail::tgamma;
+using detail::trunc;
+#if HALF_ENABLE_CPP11_LONG_LONG
+using detail::llrint;
+using detail::llround;
+#endif
+using detail::copysign;
+using detail::fpclassify;
+using detail::frexp;
+using detail::ilogb;
+using detail::isfinite;
+using detail::isgreater;
+using detail::isgreaterequal;
+using detail::isinf;
+using detail::isless;
+using detail::islessequal;
+using detail::islessgreater;
+using detail::isnan;
+using detail::isnormal;
+using detail::isunordered;
+using detail::ldexp;
+using detail::logb;
+using detail::modf;
+using detail::nextafter;
+using detail::nexttoward;
+using detail::scalbln;
+using detail::scalbn;
+using detail::signbit;
+
+using detail::half_cast;
+} // namespace half_float
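+
+// Usage sketch (editorial addition, not part of the original header): half_cast
+// converts directly, optionally with an explicit rounding mode:
+//
+//   half_float::half h = half_float::half_cast<half_float::half>(3.14159);
+//   int i = half_float::half_cast<int, std::round_toward_zero>(h);   // i == 3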
+
+/// Extensions to the C++ standard library.
+namespace std
+{
+/// Numeric limits for half-precision floats.
+/// Because of the underlying single-precision implementation of many operations, it inherits some properties from
+/// `std::numeric_limits<float>`.
+template <>
+class numeric_limits<half_float::half> : public numeric_limits<float>
+{
+public:
+    /// Supports signed values.
+    static HALF_CONSTEXPR_CONST bool is_signed = true;
+
+    /// Is not exact.
+    static HALF_CONSTEXPR_CONST bool is_exact = false;
+
+    /// Doesn't provide modulo arithmetic.
+    static HALF_CONSTEXPR_CONST bool is_modulo = false;
+
+    /// IEEE conformant.
+    static HALF_CONSTEXPR_CONST bool is_iec559 = true;
+
+    /// Supports infinity.
+    static HALF_CONSTEXPR_CONST bool has_infinity = true;
+
+    /// Supports quiet NaNs.
+    static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
+
+    /// Supports subnormal values.
+    static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
+
+    /// Rounding mode.
+    /// Due to the mix of internal single-precision computations (using the rounding mode of the underlying
+    /// single-precision implementation) with the rounding mode of the single-to-half conversions, the actual rounding
+    /// mode might be `std::round_indeterminate` if the default half-precision rounding mode doesn't match the
+    /// single-precision rounding mode.
+    static HALF_CONSTEXPR_CONST float_round_style round_style
+        = (std::numeric_limits<float>::round_style == half_float::half::round_style) ? half_float::half::round_style
+                                                                                     : round_indeterminate;
+
+    /// Significant digits.
+    static HALF_CONSTEXPR_CONST int digits = 11;
+
+    /// Significant decimal digits.
+    static HALF_CONSTEXPR_CONST int digits10 = 3;
+
+    /// Required decimal digits to represent all possible values.
+    static HALF_CONSTEXPR_CONST int max_digits10 = 5;
+
+    /// Number base.
+    static HALF_CONSTEXPR_CONST int radix = 2;
+
+    /// One more than smallest exponent.
+    static HALF_CONSTEXPR_CONST int min_exponent = -13;
+
+    /// Smallest normalized representable power of 10.
+    static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
+
+    /// One more than largest exponent.
+    static HALF_CONSTEXPR_CONST int max_exponent = 16;
+
+    /// Largest finitely representable power of 10.
+    static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
+
+    /// Smallest positive normal value.
+    static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x0400);
+    }
+
+    /// Smallest finite value.
+    static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0xFBFF);
+    }
+
+    /// Largest finite value.
+    static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7BFF);
+    }
+
+    /// Difference between one and next representable value.
+    static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x1400);
+    }
+
+    /// Maximum rounding error.
+    static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, (round_style == std::round_to_nearest) ? 0x3800 : 0x3C00);
+    }
+
+    /// Positive infinity.
+    static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7C00);
+    }
+
+    /// Quiet NaN.
+    static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7FFF);
+    }
+
+    /// Signalling NaN.
+    static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7DFF);
+    }
+
+    /// Smallest positive subnormal value.
+    static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x0001);
+    }
+};
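+
+// Usage sketch (editorial addition, not part of the original header): the
+// specialization above makes the standard queries work for half, e.g.
+//
+//   std::numeric_limits<half_float::half>::epsilon()   // 2^-10
+//   std::numeric_limits<half_float::half>::max()       // 65504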
+
+#if HALF_ENABLE_CPP11_HASH
+/// Hash function for half-precision floats.
+/// This is only defined if C++11 `std::hash` is supported and enabled.
+template <>
+struct hash<half_float::half> //: unary_function<half_float::half, size_t>
+{
+    /// Type of function argument.
+    typedef half_float::half argument_type;
+
+    /// Function return type.
+    typedef size_t result_type;
+
+    /// Compute hash function.
+    /// \param arg half to hash
+    /// \return hash value
+    result_type operator()(argument_type arg) const
+    {
+        return hash<half_float::detail::uint16>()(static_cast<unsigned int>(arg.data_) & -(arg.data_ != 0x8000));
+    }
+};
+#endif
+} // namespace std
+
+#undef HALF_CONSTEXPR
+#undef HALF_CONSTEXPR_CONST
+#undef HALF_NOEXCEPT
+#undef HALF_NOTHROW
+#ifdef HALF_POP_WARNINGS
+#pragma warning(pop)
+#undef HALF_POP_WARNINGS
+#endif
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp b/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp
new file mode 100644
index 00000000..03c64398
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "logger.h"
+#include "ErrorRecorder.h"
+#include "logging.h"
+
+SampleErrorRecorder gRecorder;
+namespace sample
+{
+Logger gLogger{Logger::Severity::kINFO};
+LogStreamConsumer gLogVerbose{LOG_VERBOSE(gLogger)};
+LogStreamConsumer gLogInfo{LOG_INFO(gLogger)};
+LogStreamConsumer gLogWarning{LOG_WARN(gLogger)};
+LogStreamConsumer gLogError{LOG_ERROR(gLogger)};
+LogStreamConsumer gLogFatal{LOG_FATAL(gLogger)};
+
+void setReportableSeverity(Logger::Severity severity)
+{
+    gLogger.setReportableSeverity(severity);
+    gLogVerbose.setReportableSeverity(severity);
+    gLogInfo.setReportableSeverity(severity);
+    gLogWarning.setReportableSeverity(severity);
+    gLogError.setReportableSeverity(severity);
+    gLogFatal.setReportableSeverity(severity);
+}
+} // namespace sample
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logger.h b/src/Detector/tensorrt_yolo/common_deprecated/logger.h
new file mode 100644
index 00000000..3069e8e9
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/logger.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LOGGER_H
+#define LOGGER_H
+
+#include "logging.h"
+
+class SampleErrorRecorder;
+extern SampleErrorRecorder gRecorder;
+namespace sample
+{
+extern Logger gLogger;
+extern LogStreamConsumer gLogVerbose;
+extern LogStreamConsumer gLogInfo;
+extern LogStreamConsumer gLogWarning;
+extern LogStreamConsumer gLogError;
+extern LogStreamConsumer gLogFatal;
+
+void setReportableSeverity(Logger::Severity severity);
+} // namespace sample
+
+#endif // LOGGER_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logging.h b/src/Detector/tensorrt_yolo/common_deprecated/logging.h
new file mode 100644
index 00000000..78732c10
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/logging.h
@@ -0,0 +1,578 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_LOGGING_H
+#define TENSORRT_LOGGING_H
+
+#include "NvInferRuntimeCommon.h"
+#include "sampleOptions.h"
+#include <cassert>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <mutex>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+namespace sample
+{
+
+using Severity = nvinfer1::ILogger::Severity;
+
+class LogStreamConsumerBuffer : public std::stringbuf
+{
+public:
+    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
+        : mOutput(stream)
+        , mPrefix(prefix)
+        , mShouldLog(shouldLog)
+    {
+    }
+
+    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
+        : mOutput(other.mOutput)
+        , mPrefix(other.mPrefix)
+        , mShouldLog(other.mShouldLog)
+    {
+    }
+    LogStreamConsumerBuffer(const LogStreamConsumerBuffer& other) = delete;
+    LogStreamConsumerBuffer() = delete;
+    LogStreamConsumerBuffer& operator=(const LogStreamConsumerBuffer&) = delete;
+    LogStreamConsumerBuffer& operator=(LogStreamConsumerBuffer&&) = delete;
+
+    ~LogStreamConsumerBuffer() override
+    {
+        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
+        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
+        // if the pointer to the beginning is not equal to the pointer to the current position,
+        // call putOutput() to log the output to the stream
+        if (pbase() != pptr())
+        {
+            putOutput();
+        }
+    }
+
+    //!
+    //! synchronizes the stream buffer and returns 0 on success
+    //! synchronizing the stream buffer consists of inserting the buffer contents into the stream,
+    //! resetting the buffer and flushing the stream
+    //!
+    int32_t sync() override
+    {
+        putOutput();
+        return 0;
+    }
+
+    void putOutput()
+    {
+        if (mShouldLog)
+        {
+            // prepend timestamp
+            std::time_t timestamp = std::time(nullptr);
+            tm* tm_local = std::localtime(&timestamp);
+            mOutput << "[";
+            mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
+            mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
+            // std::stringbuf::str() gets the string contents of the buffer
+            // insert the buffer contents prepended by the appropriate prefix into the stream
+            mOutput << mPrefix << str();
+        }
+        // set the buffer to empty
+        str("");
+        // flush the stream
+        mOutput.flush();
+    }
+
+    void setShouldLog(bool shouldLog)
+    {
+        mShouldLog = shouldLog;
+    }
+
+private:
+    std::ostream& mOutput;
+    std::string mPrefix;
+    bool mShouldLog{};
+}; // class LogStreamConsumerBuffer
+
+//!
+//! \class LogStreamConsumerBase
+//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
+//!
+class LogStreamConsumerBase
+{
+public:
+    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
+        : mBuffer(stream, prefix, shouldLog)
+    {
+    }
+
+protected:
+    std::mutex mLogMutex;
+    LogStreamConsumerBuffer mBuffer;
+}; // class LogStreamConsumerBase
+
+//!
+//! \class LogStreamConsumer
+//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
+//! Order of base classes is LogStreamConsumerBase and then std::ostream.
+//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
+//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
+//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
+//! Please do not change the order of the parent classes.
+//!
+class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
+{
+public:
+    //!
+    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
+    //! Reportable severity determines if the messages are severe enough to be logged.
+    //!
+    LogStreamConsumer(nvinfer1::ILogger::Severity reportableSeverity, nvinfer1::ILogger::Severity severity)
+        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
+        , std::ostream(&mBuffer) // links the stream buffer with the stream
+        , mShouldLog(severity <= reportableSeverity)
+        , mSeverity(severity)
+    {
+    }
+
+    LogStreamConsumer(LogStreamConsumer&& other) noexcept
+        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
+        , std::ostream(&mBuffer) // links the stream buffer with the stream
+        , mShouldLog(other.mShouldLog)
+        , mSeverity(other.mSeverity)
+    {
+    }
+    LogStreamConsumer(const LogStreamConsumer& other) = delete;
+    LogStreamConsumer() = delete;
+    ~LogStreamConsumer() = default;
+    LogStreamConsumer& operator=(const LogStreamConsumer&) = delete;
+    LogStreamConsumer& operator=(LogStreamConsumer&&) = delete;
+
+    void setReportableSeverity(Severity reportableSeverity)
+    {
+        mShouldLog = mSeverity <= reportableSeverity;
+        mBuffer.setShouldLog(mShouldLog);
+    }
+
+    std::mutex& getMutex()
+    {
+        return mLogMutex;
+    }
+
+    bool getShouldLog() const
+    {
+        return mShouldLog;
+    }
+
+private:
+    static std::ostream& severityOstream(Severity severity)
+    {
+        return severity >= Severity::kINFO ? std::cout : std::cerr;
+    }
+
+    static std::string severityPrefix(Severity severity)
+    {
+        switch (severity)
+        {
+        case Severity::kINTERNAL_ERROR: return "[F] ";
+        case Severity::kERROR: return "[E] ";
+        case Severity::kWARNING: return "[W] ";
+        case Severity::kINFO: return "[I] ";
+        case Severity::kVERBOSE: return "[V] ";
+        default: assert(0); return "";
+        }
+    }
+
+    bool mShouldLog;
+    Severity mSeverity;
+}; // class LogStreamConsumer
+
+template <typename T>
+LogStreamConsumer& operator<<(LogStreamConsumer& logger, const T& obj)
+{
+    if (logger.getShouldLog())
+    {
+        std::lock_guard<std::mutex> guard(logger.getMutex());
+        auto& os = static_cast<std::ostream&>(logger);
+        os << obj;
+    }
+    return logger;
+}
+
+//!
+//! Special handling std::endl
+//!
+inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, std::ostream& (*f)(std::ostream&) )
+{
+    if (logger.getShouldLog())
+    {
+        std::lock_guard<std::mutex> guard(logger.getMutex());
+        auto& os = static_cast<std::ostream&>(logger);
+        os << f;
+    }
+    return logger;
+}
+
+inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, const nvinfer1::Dims& dims)
+{
+    if (logger.getShouldLog())
+    {
+        std::lock_guard<std::mutex> guard(logger.getMutex());
+        auto& os = static_cast<std::ostream&>(logger);
+        for (int32_t i = 0; i < dims.nbDims; ++i)
+        {
+            os << (i ? "x" : "") << dims.d[i];
+        }
+    }
+    return logger;
+}
+
+//!
+//! \class Logger
+//!
+//! \brief Class which manages logging of TensorRT tools and samples
+//!
+//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
+//! and supports logging two types of messages:
+//!
+//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
+//! - Test pass/fail messages
+//!
+//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
+//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
+//!
+//! In the future, this class could be extended to support dumping test results to a file in some standard format
+//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
+//!
+//!
TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger +//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT +//! library and messages coming from the sample. +//! +//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the +//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger +//! object. +//! +class Logger : public nvinfer1::ILogger +{ +public: + explicit Logger(Severity severity = Severity::kWARNING) + : mReportableSeverity(severity) + { + } + + //! + //! \enum TestResult + //! \brief Represents the state of a given test + //! + enum class TestResult + { + kRUNNING, //!< The test is running + kPASSED, //!< The test passed + kFAILED, //!< The test failed + kWAIVED //!< The test was waived + }; + + //! + //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \return The nvinfer1::ILogger associated with this Logger + //! + //! TODO Once all samples are updated to use this method to register the logger with TensorRT, + //! we can eliminate the inheritance of Logger from ILogger + //! + nvinfer1::ILogger& getTRTLogger() noexcept + { + return *this; + } + + //! + //! \brief Implementation of the nvinfer1::ILogger::log() virtual method + //! + //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the + //! inheritance from nvinfer1::ILogger + //! + void log(Severity severity, const char* msg) noexcept override + { + LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; + } + + //! + //! \brief Method for controlling the verbosity of logging output + //! + //! \param severity The logger will only emit messages that have severity of this level or higher. + //! + void setReportableSeverity(Severity severity) noexcept + { + mReportableSeverity = severity; + } + + //! + //! \brief Opaque handle that holds logging information for a particular test + //! + //! This object is an opaque handle to information used by the Logger to print test results. + //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used + //! with Logger::reportTest{Start,End}(). + //! + class TestAtom + { + public: + TestAtom(TestAtom&&) = default; + + private: + friend class Logger; + + TestAtom(bool started, const std::string& name, const std::string& cmdline) + : mStarted(started) + , mName(name) + , mCmdline(cmdline) + { + } + + bool mStarted; + std::string mName; + std::string mCmdline; + }; + + //! + //! \brief Define a test for logging + //! + //! \param[in] name The name of the test. This should be a string starting with + //! "TensorRT" and containing dot-separated strings containing + //! the characters [A-Za-z0-9_]. + //! For example, "TensorRT.sample_googlenet" + //! \param[in] cmdline The command line used to reproduce the test + // + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, const std::string& cmdline) + { + return TestAtom(false, name, cmdline); + } + + //! + //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments + //! as input + //! + //! \param[in] name The name of the test + //! \param[in] argc The number of command-line arguments + //! 
\param[in] argv The array of command-line arguments (given as C strings) + //! + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv) + { + // Append TensorRT version as info + const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]"; + auto cmdline = genCmdlineString(argc, argv); + return defineTest(vname, cmdline); + } + + //! + //! \brief Report that a test has started. + //! + //! \pre reportTestStart() has not been called yet for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has started + //! + static void reportTestStart(TestAtom& testAtom) + { + reportTestResult(testAtom, TestResult::kRUNNING); + assert(!testAtom.mStarted); + testAtom.mStarted = true; + } + + //! + //! \brief Report that a test has ended. + //! + //! \pre reportTestStart() has been called for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has ended + //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, + //! TestResult::kFAILED, TestResult::kWAIVED + //! + static void reportTestEnd(TestAtom const& testAtom, TestResult result) + { + assert(result != TestResult::kRUNNING); + assert(testAtom.mStarted); + reportTestResult(testAtom, result); + } + + static int32_t reportPass(TestAtom const& testAtom) + { + reportTestEnd(testAtom, TestResult::kPASSED); + return EXIT_SUCCESS; + } + + static int32_t reportFail(TestAtom const& testAtom) + { + reportTestEnd(testAtom, TestResult::kFAILED); + return EXIT_FAILURE; + } + + static int32_t reportWaive(TestAtom const& testAtom) + { + reportTestEnd(testAtom, TestResult::kWAIVED); + return EXIT_SUCCESS; + } + + static int32_t reportTest(TestAtom const& testAtom, bool pass) + { + return pass ? reportPass(testAtom) : reportFail(testAtom); + } + + Severity getReportableSeverity() const + { + return mReportableSeverity; + } + +private: + //! + //! \brief returns an appropriate string for prefixing a log message with the given severity + //! + static const char* severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate string for prefixing a test result message with the given result + //! + static const char* testResultString(TestResult result) + { + switch (result) + { + case TestResult::kRUNNING: return "RUNNING"; + case TestResult::kPASSED: return "PASSED"; + case TestResult::kFAILED: return "FAILED"; + case TestResult::kWAIVED: return "WAIVED"; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity + //! + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + //! + //! \brief method that implements logging test results + //! + static void reportTestResult(TestAtom const& testAtom, TestResult result) + { + severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " + << testAtom.mCmdline << std::endl; + } + + //! + //! \brief generate a command line string from the given (argc, argv) values + //! 
+ static std::string genCmdlineString(int32_t argc, char const* const* argv) + { + std::stringstream ss; + for (int32_t i = 0; i < argc; i++) + { + if (i > 0) + { + ss << " "; + } + ss << argv[i]; + } + return ss.str(); + } + + Severity mReportableSeverity; +}; // class Logger + +namespace +{ +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE +//! +//! Example usage: +//! +//! LOG_VERBOSE(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO +//! +//! Example usage: +//! +//! LOG_INFO(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_INFO(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING +//! +//! Example usage: +//! +//! LOG_WARN(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_WARN(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR +//! +//! Example usage: +//! +//! LOG_ERROR(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_ERROR(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR +//! ("fatal" severity) +//! +//! Example usage: +//! +//! LOG_FATAL(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_FATAL(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); +} +} // anonymous namespace +} // namespace sample +#endif // TENSORRT_LOGGING_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h b/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h new file mode 100644 index 00000000..c92a1420 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef PARSER_ONNX_CONFIG_H
+#define PARSER_ONNX_CONFIG_H
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+#include "NvOnnxParser.h"
+
+#define ONNX_DEBUG 1
+
+/**
+ * \class ParserOnnxConfig
+ * \brief Configuration Manager Class Concrete Implementation
+ *
+ * \note:
+ *
+ */
+
+using namespace std;
+
+class ParserOnnxConfig : public nvonnxparser::IOnnxConfig
+{
+
+protected:
+    string mModelFilename{};
+    string mTextFilename{};
+    string mFullTextFilename{};
+    nvinfer1::DataType mModelDtype;
+    nvonnxparser::IOnnxConfig::Verbosity mVerbosity;
+    bool mPrintLayercInfo;
+
+public:
+    ParserOnnxConfig()
+        : mModelDtype(nvinfer1::DataType::kFLOAT)
+        , mVerbosity(static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))
+        , mPrintLayercInfo(false)
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << " ParserOnnxConfig::ctor(): " << this << "\t" << std::endl;
+        }
+#endif
+    }
+
+protected:
+    ~ParserOnnxConfig()
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << "ParserOnnxConfig::dtor(): " << this << std::endl;
+        }
+#endif
+    }
+
+public:
+    virtual void setModelDtype(const nvinfer1::DataType modelDtype) noexcept
+    {
+        mModelDtype = modelDtype;
+    }
+
+    virtual nvinfer1::DataType getModelDtype() const noexcept
+    {
+        return mModelDtype;
+    }
+
+    virtual const char* getModelFileName() const noexcept
+    {
+        return mModelFilename.c_str();
+    }
+    virtual void setModelFileName(const char* onnxFilename) noexcept
+    {
+        mModelFilename = string(onnxFilename);
+    }
+    virtual nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept
+    {
+        return mVerbosity;
+    }
+    virtual void addVerbosity() noexcept
+    {
+        ++mVerbosity;
+    }
+    virtual void reduceVerbosity() noexcept
+    {
+        --mVerbosity;
+    }
+    virtual void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept
+    {
+        mVerbosity = verbosity;
+    }
+
+    virtual const char* getTextFileName() const noexcept
+    {
+        return mTextFilename.c_str();
+    }
+    virtual void setTextFileName(const char* textFilename) noexcept
+    {
+        mTextFilename = string(textFilename);
+    }
+    virtual const char* getFullTextFileName() const noexcept
+    {
+        return mFullTextFilename.c_str();
+    }
+    virtual void setFullTextFileName(const char* fullTextFilename) noexcept
+    {
+        mFullTextFilename = string(fullTextFilename);
+    }
+    virtual bool getPrintLayerInfo() const noexcept
+    {
+        return mPrintLayercInfo;
+    }
+    virtual void setPrintLayerInfo(bool src) noexcept
+    {
+        mPrintLayercInfo = src;
+    } //!< set the boolean variable corresponding to the Layer Info, see getPrintLayerInfo()
+
+    virtual bool isDebug() const noexcept
+    {
+#if ONNX_DEBUG
+        return (std::getenv("ONNX_DEBUG") ? true : false);
+#else
+        return false;
+#endif
+    }
+
+    virtual void destroy() noexcept
+    {
+        delete this;
+    }
+
+}; // class ParserOnnxConfig
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h b/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h
new file mode 100644
index 00000000..3d84b095
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_SAFE_COMMON_H
+#define TENSORRT_SAFE_COMMON_H
+
+#include "NvInferRuntimeCommon.h"
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+
+#define CHECK(status)                                                                                                  \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        auto ret = (status);                                                                                           \
+        if (ret != 0)                                                                                                  \
+        {                                                                                                              \
+            std::cerr << "Cuda failure: " << ret << std::endl;                                                         \
+            abort();                                                                                                   \
+        }                                                                                                              \
+    } while (0)
+
+namespace samplesCommon
+{
+template <typename T>
+inline std::shared_ptr<T> infer_object(T* obj)
+{
+    if (!obj)
+    {
+        throw std::runtime_error("Failed to create object");
+    }
+    return std::shared_ptr<T>(obj);
+}
+
+inline uint32_t elementSize(nvinfer1::DataType t)
+{
+    switch (t)
+    {
+    case nvinfer1::DataType::kINT32:
+    case nvinfer1::DataType::kFLOAT: return 4;
+    case nvinfer1::DataType::kHALF: return 2;
+    case nvinfer1::DataType::kINT8: return 1;
+    case nvinfer1::DataType::kBOOL: return 1;
+    }
+    return 0;
+}
+
+template <typename A, typename B>
+inline A divUp(A x, B n)
+{
+    return (x + n - 1) / n;
+}
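+
+// Usage sketch (editorial addition, not part of the original NVIDIA sample):
+//
+//   samplesCommon::divUp(10, 3);                            // == 4, ceil division
+//   samplesCommon::elementSize(nvinfer1::DataType::kHALF);  // == 2 bytes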
+
+} // namespace samplesCommon
+
+#endif // TENSORRT_SAFE_COMMON_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h
new file mode 100644
index 00000000..53a78331
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SampleConfig_H
+#define SampleConfig_H
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+class SampleConfig : public nvonnxparser::IOnnxConfig
+{
+public:
+    enum class InputDataFormat : int
+    {
+        kASCII = 0,
+        kPPM = 1
+    };
+
+private:
+    std::string mModelFilename;
+    std::string mEngineFilename;
+    std::string mTextFilename;
+    std::string mFullTextFilename;
+    std::string mImageFilename;
+    std::string mReferenceFilename;
+    std::string mOutputFilename;
+    std::string mCalibrationFilename;
+    std::string mTimingCacheFilename;
+    int64_t mLabel{-1};
+    int64_t mMaxBatchSize{32};
+    int64_t mCalibBatchSize{0};
+    int64_t mMaxNCalibBatch{0};
+    int64_t mFirstCalibBatch{0};
+    int64_t mUseDLACore{-1};
+    nvinfer1::DataType mModelDtype{nvinfer1::DataType::kFLOAT};
+    bool mTF32{true};
+    Verbosity mVerbosity{static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)};
+    bool mPrintLayercInfo{false};
+    bool mDebugBuilder{false};
+    InputDataFormat mInputDataFormat{InputDataFormat::kASCII};
+    uint64_t mTopK{0};
+    float mFailurePercentage{-1.0f};
+    float mTolerance{0.0f};
+    float mAbsTolerance{1e-5f};
+
+public:
+    SampleConfig()
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << " SampleConfig::ctor(): " << this << "\t" << std::endl;
+        }
+#endif
+    }
+
+protected:
+    ~SampleConfig()
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << "SampleConfig::dtor(): " << this << std::endl;
+        }
+#endif
+    }
+
+public:
+    void setModelDtype(const nvinfer1::DataType mdt) noexcept
+    {
+        mModelDtype = mdt;
+    }
+
+    nvinfer1::DataType getModelDtype() const noexcept
+    {
+        return mModelDtype;
+    }
+
+    bool getTF32() const noexcept
+    {
+        return mTF32;
+    }
+
+    void setTF32(bool enabled) noexcept
+    {
+        mTF32 = enabled;
+    }
+
+    const char* getModelFileName() const noexcept
+    {
+        return mModelFilename.c_str();
+    }
+
+    void setModelFileName(const char* onnxFilename) noexcept
+    {
+        mModelFilename = std::string(onnxFilename);
+    }
+    Verbosity getVerbosityLevel() const noexcept
+    {
+        return mVerbosity;
+    }
+    void addVerbosity() noexcept
+    {
+        ++mVerbosity;
+    }
+    void reduceVerbosity() noexcept
+    {
+        --mVerbosity;
+    }
+    virtual void setVerbosityLevel(Verbosity v) noexcept
+    {
+        mVerbosity = v;
+    }
+    const char* getEngineFileName() const noexcept
+    {
+        return mEngineFilename.c_str();
+    }
+    void setEngineFileName(const char* engineFilename) noexcept
+    {
+        mEngineFilename = std::string(engineFilename);
+    }
+    const char* getTextFileName() const noexcept
+    {
+        return mTextFilename.c_str();
+    }
+    void setTextFileName(const char* textFilename) noexcept
+    {
+        mTextFilename = std::string(textFilename);
+    }
+    const char* getFullTextFileName() const noexcept
+    {
+        return mFullTextFilename.c_str();
+    }
+    void setFullTextFileName(const char* fullTextFilename) noexcept
+    {
+        mFullTextFilename = std::string(fullTextFilename);
+    }
+    void setLabel(int64_t label) noexcept
+    {
+        mLabel = label;
+    } //!< set the Label
+
+    int64_t getLabel() const noexcept
+    {
+        return mLabel;
+    } //!< get the Label
+
+    bool getPrintLayerInfo() const noexcept
+    {
+        return mPrintLayercInfo;
+    }
+
+    void setPrintLayerInfo(bool b) noexcept
+    {
+        mPrintLayercInfo = b;
+    } //!< set the boolean variable corresponding to the Layer Info, see getPrintLayerInfo()
+
+    void setMaxBatchSize(int64_t maxBatchSize) noexcept
+    {
+        mMaxBatchSize = maxBatchSize;
+    } //!< set the Max Batch Size
+    int64_t getMaxBatchSize() const noexcept
+    {
+        return mMaxBatchSize;
+    } //!< get the Max Batch Size
+
+    void setCalibBatchSize(int64_t CalibBatchSize) noexcept
+    {
+        mCalibBatchSize = CalibBatchSize;
+    } //!< set the calibration batch size
+    int64_t getCalibBatchSize() const noexcept
+    {
+        return mCalibBatchSize;
+    } //!< get calibration batch size
+
+    void setMaxNCalibBatch(int64_t MaxNCalibBatch) noexcept
+    {
+        mMaxNCalibBatch = MaxNCalibBatch;
+    } //!< set Max Number of Calibration Batches
+    int64_t getMaxNCalibBatch() const noexcept
+    {
+        return mMaxNCalibBatch;
+    } //!< get the Max Number of Calibration Batches
+
+    void setFirstCalibBatch(int64_t FirstCalibBatch) noexcept
+    {
+        mFirstCalibBatch = FirstCalibBatch;
+    } //!< set the first calibration batch
+    int64_t getFirstCalibBatch() const noexcept
+    {
+        return mFirstCalibBatch;
+    } //!< get the first calibration batch
+
+    void setUseDLACore(int64_t UseDLACore) noexcept
+    {
+        mUseDLACore = UseDLACore;
+    } //!< set the DLA core to use
+    int64_t getUseDLACore() const noexcept
+    {
+        return mUseDLACore;
+    } //!< get the DLA core to use
+
+    void setDebugBuilder() noexcept
+    {
+        mDebugBuilder = true;
+    } //!< enable the Debug info, while building the engine.
+    bool getDebugBuilder() const noexcept
+    {
+        return mDebugBuilder;
+    } //!< get the boolean variable, corresponding to the debug builder
+
+    const char* getImageFileName() const noexcept //!< get the Image file name (PPM or ASCII)
+    {
+        return mImageFilename.c_str();
+    }
+    void setImageFileName(const char* imageFilename) noexcept //!< set the Image file name
+    {
+        mImageFilename = std::string(imageFilename);
+    }
+    const char* getReferenceFileName() const noexcept
+    {
+        return mReferenceFilename.c_str();
+    }
+    void setReferenceFileName(const char* referenceFilename) noexcept //!< set the reference file name
+    {
+        mReferenceFilename = std::string(referenceFilename);
+    }
+
+    void setInputDataFormat(InputDataFormat idt) noexcept
+    {
+        mInputDataFormat = idt;
+    } //!< specifies expected data format of the image file (PPM or ASCII)
+    InputDataFormat getInputDataFormat() const noexcept
+    {
+        return mInputDataFormat;
+    } //!< returns the expected data format of the image file.
+
+    const char* getOutputFileName() const noexcept //!< get the file name used to save the results
+    {
+        return mOutputFilename.c_str();
+    }
+    void setOutputFileName(const char* outputFilename) noexcept //!< set the output file name
+    {
+        mOutputFilename = std::string(outputFilename);
+    }
+
+    const char* getCalibrationFileName() const noexcept
+    {
+        return mCalibrationFilename.c_str();
+    } //!< get the file containing the list of image files for int8 calibration
+    void setCalibrationFileName(const char* calibrationFilename) noexcept //!< set the int8 calibration list file name
+    {
+        mCalibrationFilename = std::string(calibrationFilename);
+    }
+
+    uint64_t getTopK() const noexcept
+    {
+        return mTopK;
+    }
+    void setTopK(uint64_t topK) noexcept
+    {
+        mTopK = topK;
+    } //!< If this option is specified, return the K top probabilities.
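+
+    // Usage sketch (editorial addition, not part of the original NVIDIA sample):
+    // the calibration-related knobs above are plain setters, e.g.
+    //
+    //   SampleConfig* cfg = new SampleConfig;   // destructor is protected
+    //   cfg->setModelFileName("model.onnx");    // hypothetical file name
+    //   cfg->setCalibBatchSize(8);
+    //   cfg->setMaxNCalibBatch(100);
+    //   cfg->setTopK(5);
+    //   ...
+    //   cfg->destroy();                         // IOnnxConfig-style destruction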
+
+    float getFailurePercentage() const noexcept
+    {
+        return mFailurePercentage;
+    }
+
+    void setFailurePercentage(float f) noexcept
+    {
+        mFailurePercentage = f;
+    }
+
+    float getAbsoluteTolerance() const noexcept
+    {
+        return mAbsTolerance;
+    }
+
+    void setAbsoluteTolerance(float a) noexcept
+    {
+        mAbsTolerance = a;
+    }
+
+    float getTolerance() const noexcept
+    {
+        return mTolerance;
+    }
+
+    void setTolerance(float t) noexcept
+    {
+        mTolerance = t;
+    }
+
+    const char* getTimingCacheFilename() const noexcept
+    {
+        return mTimingCacheFilename.c_str();
+    }
+
+    void setTimingCacheFileName(const char* timingCacheFilename) noexcept
+    {
+        mTimingCacheFilename = std::string(timingCacheFilename);
+    }
+
+    bool isDebug() const noexcept
+    {
+#if ONNX_DEBUG
+        return (std::getenv("ONNX_DEBUG") ? true : false);
+#else
+        return false;
+#endif
+    }
+
+    void destroy() noexcept
+    {
+        delete this;
+    }
+
+}; // class SampleConfig
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h
new file mode 100644
index 00000000..2053ac7c
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_DEVICE_H
+#define TRT_SAMPLE_DEVICE_H
+
+#include <cassert>
+#include <chrono>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <thread>
+
+namespace sample
+{
+
+inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr)
+{
+    if (ret != cudaSuccess)
+    {
+        err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl;
+        abort();
+    }
+}
+
+class TrtCudaEvent;
+
+namespace
+{
+
+void cudaSleep(void* sleep)
+{
+    std::this_thread::sleep_for(std::chrono::duration<float, std::milli>(*static_cast<float*>(sleep)));
+}
+
+} // namespace
+
+//!
+//! \class TrtCudaStream
+//! \brief Managed CUDA stream
+//!
+class TrtCudaStream
+{
+public:
+    TrtCudaStream()
+    {
+        cudaCheck(cudaStreamCreate(&mStream));
+    }
+
+    TrtCudaStream(const TrtCudaStream&) = delete;
+
+    TrtCudaStream& operator=(const TrtCudaStream&) = delete;
+
+    TrtCudaStream(TrtCudaStream&&) = delete;
+
+    TrtCudaStream& operator=(TrtCudaStream&&) = delete;
+
+    ~TrtCudaStream()
+    {
+        cudaCheck(cudaStreamDestroy(mStream));
+    }
+
+    cudaStream_t get() const
+    {
+        return mStream;
+    }
+
+    void synchronize()
+    {
+        cudaCheck(cudaStreamSynchronize(mStream));
+    }
+
+    void wait(TrtCudaEvent& event);
+
+    void sleep(float* ms)
+    {
+        cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms));
+    }
+
+private:
+    cudaStream_t mStream{};
+};
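+
+// Usage sketch (editorial addition, not part of the original NVIDIA sample):
+// two TrtCudaEvent objects (defined below) can time async work on a stream:
+//
+//   TrtCudaStream stream;
+//   TrtCudaEvent start, stop;
+//   start.record(stream);
+//   ...enqueue asynchronous work on stream.get()...
+//   stop.record(stream);
+//   stop.synchronize();
+//   float ms = stop - start;   // operator- returns elapsed milliseconds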
+
+//!
+//! \class TrtCudaEvent
+//! \brief Managed CUDA event
+//!
+class TrtCudaEvent
+{
+public:
+    explicit TrtCudaEvent(bool blocking = true)
+    {
+        const uint32_t flags = blocking ? cudaEventBlockingSync : cudaEventDefault;
+        cudaCheck(cudaEventCreateWithFlags(&mEvent, flags));
+    }
+
+    TrtCudaEvent(const TrtCudaEvent&) = delete;
+
+    TrtCudaEvent& operator=(const TrtCudaEvent&) = delete;
+
+    TrtCudaEvent(TrtCudaEvent&&) = delete;
+
+    TrtCudaEvent& operator=(TrtCudaEvent&&) = delete;
+
+    ~TrtCudaEvent()
+    {
+        cudaCheck(cudaEventDestroy(mEvent));
+    }
+
+    cudaEvent_t get() const
+    {
+        return mEvent;
+    }
+
+    void record(const TrtCudaStream& stream)
+    {
+        cudaCheck(cudaEventRecord(mEvent, stream.get()));
+    }
+
+    void synchronize()
+    {
+        cudaCheck(cudaEventSynchronize(mEvent));
+    }
+
+    // Returns elapsed time in milliseconds
+    float operator-(const TrtCudaEvent& e) const
+    {
+        float time{0};
+        cudaCheck(cudaEventElapsedTime(&time, e.get(), get()));
+        return time;
+    }
+
+private:
+    cudaEvent_t mEvent{};
+};
+
+inline void TrtCudaStream::wait(TrtCudaEvent& event)
+{
+    cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0));
+}
+
+//!
+//! \class TrtCudaGraph
+//! \brief Managed CUDA graph
+//!
+class TrtCudaGraph
+{
+public:
+    explicit TrtCudaGraph() = default;
+
+    TrtCudaGraph(const TrtCudaGraph&) = delete;
+
+    TrtCudaGraph& operator=(const TrtCudaGraph&) = delete;
+
+    TrtCudaGraph(TrtCudaGraph&&) = delete;
+
+    TrtCudaGraph& operator=(TrtCudaGraph&&) = delete;
+
+    ~TrtCudaGraph()
+    {
+        if (mGraphExec)
+        {
+            cudaGraphExecDestroy(mGraphExec);
+        }
+    }
+
+    void beginCapture(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal));
+    }
+
+    bool launch(TrtCudaStream& stream)
+    {
+        return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess;
+    }
+
+    void endCapture(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph));
+        cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0));
+        cudaCheck(cudaGraphDestroy(mGraph));
+    }
+
+    void endCaptureOnError(TrtCudaStream& stream)
+    {
+        // There are two possibilities why stream capture would fail:
+        // (1) stream is in cudaErrorStreamCaptureInvalidated state.
+        // (2) TRT reports a failure.
+        // In case (1), the returning mGraph should be nullptr.
+        // In case (2), the returning mGraph is not nullptr, but it should not be used.
+        const auto ret = cudaStreamEndCapture(stream.get(), &mGraph);
+        if (ret == cudaErrorStreamCaptureInvalidated)
+        {
+            assert(mGraph == nullptr);
+        }
+        else
+        {
+            assert(ret == cudaSuccess);
+            assert(mGraph != nullptr);
+            cudaCheck(cudaGraphDestroy(mGraph));
+            mGraph = nullptr;
+        }
+        // Clean up any CUDA error.
+        cudaGetLastError();
+        sample::gLogWarning << "The CUDA graph capture on the stream has failed." << std::endl;
+    }
+
+private:
+    cudaGraph_t mGraph{};
+    cudaGraphExec_t mGraphExec{};
+};
+
+//!
+//! \class TrtCudaBuffer
+//! \brief Managed buffer for host and device
+//!
+template <typename A, typename D>
+class TrtCudaBuffer
+{
+public:
+    TrtCudaBuffer() = default;
+
+    TrtCudaBuffer(const TrtCudaBuffer&) = delete;
+
+    TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete;
+
+    TrtCudaBuffer(TrtCudaBuffer&& rhs)
+    {
+        reset(rhs.mPtr);
+        rhs.mPtr = nullptr;
+    }
+
+    TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs)
+    {
+        if (this != &rhs)
+        {
+            reset(rhs.mPtr);
+            rhs.mPtr = nullptr;
+        }
+        return *this;
+    }
+
+    ~TrtCudaBuffer()
+    {
+        reset();
+    }
+
+    TrtCudaBuffer(size_t size)
+    {
+        A()(&mPtr, size);
+    }
+
+    void allocate(size_t size)
+    {
+        reset();
+        A()(&mPtr, size);
+    }
+
+    void reset(void* ptr = nullptr)
+    {
+        if (mPtr)
+        {
+            D()(mPtr);
+        }
+        mPtr = ptr;
+    }
+
+    void* get() const
+    {
+        return mPtr;
+    }
+
+private:
+    void* mPtr{nullptr};
+};
+
+struct DeviceAllocator
+{
+    void operator()(void** ptr, size_t size)
+    {
+        cudaCheck(cudaMalloc(ptr, size));
+    }
+};
+
+struct DeviceDeallocator
+{
+    void operator()(void* ptr)
+    {
+        cudaCheck(cudaFree(ptr));
+    }
+};
+
+struct ManagedAllocator
+{
+    void operator()(void** ptr, size_t size)
+    {
+        cudaCheck(cudaMallocManaged(ptr, size));
+    }
+};
+
+struct HostAllocator
+{
+    void operator()(void** ptr, size_t size)
+    {
+        cudaCheck(cudaMallocHost(ptr, size));
+    }
+};
+
+struct HostDeallocator
+{
+    void operator()(void* ptr)
+    {
+        cudaCheck(cudaFreeHost(ptr));
+    }
+};
+
+using TrtDeviceBuffer = TrtCudaBuffer<DeviceAllocator, DeviceDeallocator>;
+using TrtManagedBuffer = TrtCudaBuffer<ManagedAllocator, DeviceDeallocator>;
+
+using TrtHostBuffer = TrtCudaBuffer<HostAllocator, HostDeallocator>;
+
+//!
+//! \class IMirroredBuffer
+//! \brief Coupled host and device buffers
+//!
+class IMirroredBuffer
+{
+public:
+    //!
+    //! Allocate memory for the mirrored buffer given the size
+    //! of the allocation.
+    //!
+    virtual void allocate(size_t size) = 0;
+
+    //!
+    //! Get the pointer to the device side buffer.
+    //!
+    //! \return pointer to device memory or nullptr if uninitialized.
+    //!
+    virtual void* getDeviceBuffer() const = 0;
+
+    //!
+    //! Get the pointer to the host side buffer.
+    //!
+    //! \return pointer to host memory or nullptr if uninitialized.
+    //!
+    virtual void* getHostBuffer() const = 0;
+
+    //!
+    //! Copy the memory from host to device.
+    //!
+    virtual void hostToDevice(TrtCudaStream& stream) = 0;
+
+    //!
+    //! Copy the memory from device to host.
+    //!
+    virtual void deviceToHost(TrtCudaStream& stream) = 0;
+
+    //!
+    //! Interface to get the size of the memory
+    //!
+    //! \return the size of memory allocated.
+    //!
+    virtual size_t getSize() const = 0;
+
+    //!
+    //! Virtual destructor declaration
+    //!
+    virtual ~IMirroredBuffer() = default;
+
+}; // class IMirroredBuffer
+
+//!
+//! Class to have a separate memory buffer for discrete device and host allocations.
+//!
+class DiscreteMirroredBuffer : public IMirroredBuffer
+{
+public:
+    void allocate(size_t size)
+    {
+        mSize = size;
+        mHostBuffer.allocate(size);
+        mDeviceBuffer.allocate(size);
+    }
+
+    void* getDeviceBuffer() const
+    {
+        return mDeviceBuffer.get();
+    }
+
+    void* getHostBuffer() const
+    {
+        return mHostBuffer.get();
+    }
+
+    void hostToDevice(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get()));
+    }
+
+    void deviceToHost(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get()));
+    }
+
+    size_t getSize() const
+    {
+        return mSize;
+    }
+
+private:
+    size_t mSize{0};
+    TrtHostBuffer mHostBuffer;
+    TrtDeviceBuffer mDeviceBuffer;
+}; // class DiscreteMirroredBuffer
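+
+// Usage sketch (editorial addition, not part of the original NVIDIA sample):
+//
+//   DiscreteMirroredBuffer buffer;
+//   buffer.allocate(byteSize);
+//   ...fill buffer.getHostBuffer() on the CPU...
+//   buffer.hostToDevice(stream);   // async copy; synchronize before reuse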
+
+//!
+//! Class to have a unified memory buffer for embedded devices.
+//!
+class UnifiedMirroredBuffer : public IMirroredBuffer
+{
+public:
+    void allocate(size_t size)
+    {
+        mSize = size;
+        mBuffer.allocate(size);
+    }
+
+    void* getDeviceBuffer() const
+    {
+        return mBuffer.get();
+    }
+
+    void* getHostBuffer() const
+    {
+        return mBuffer.get();
+    }
+
+    void hostToDevice(TrtCudaStream& /*stream*/)
+    {
+        // Does nothing since we are using unified memory.
+    }
+
+    void deviceToHost(TrtCudaStream& /*stream*/)
+    {
+        // Does nothing since we are using unified memory.
+    }
+
+    size_t getSize() const
+    {
+        return mSize;
+    }
+
+private:
+    size_t mSize{0};
+    TrtManagedBuffer mBuffer;
+}; // class UnifiedMirroredBuffer
+
+inline void setCudaDevice(int device, std::ostream& os)
+{
+    cudaCheck(cudaSetDevice(device));
+
+    cudaDeviceProp properties;
+    cudaCheck(cudaGetDeviceProperties(&properties, device));
+
+// clang-format off
+    os << "=== Device Information ===" << std::endl;
+    os << "Selected Device: "      << properties.name                                               << std::endl;
+    os << "Compute Capability: "   << properties.major << "." << properties.minor                   << std::endl;
+    os << "SMs: "                  << properties.multiProcessorCount                                << std::endl;
+    os << "Compute Clock Rate: "   << properties.clockRate / 1000000.0F << " GHz"                   << std::endl;
+    os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB"                   << std::endl;
+    os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB"       << std::endl;
+    os << "Memory Bus Width: "     << properties.memoryBusWidth << " bits"
+       << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")"                  << std::endl;
+    os << "Memory Clock Rate: "    << properties.memoryClockRate / 1000000.0F << " GHz"             << std::endl;
+    // clang-format on
+}
+
+} // namespace sample
+
+#endif // TRT_SAMPLE_DEVICE_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp
similarity index 100%
rename from src/Detector/tensorrt_yolo/common/sampleEngines.cpp
rename to src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h
new file mode 100644
index 00000000..620b51a1
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_ENGINES_H
+#define TRT_SAMPLE_ENGINES_H
+
+#include <iostream>
+#include <vector>
+
+#include "NvInfer.h"
+
+#if (NV_TENSORRT_MAJOR > 7)
+
+#include "NvInferConsistency.h"
+#include "NvInferSafeRuntime.h"
+
+#endif
+
+#include "NvOnnxParser.h"
+#include "sampleOptions.h"
+#include "sampleUtils.h"
+
+namespace sample
+{
+
+struct Parser
+{
+    TrtUniquePtr<nvonnxparser::IParser> onnxParser;
+
+    operator bool() const
+    {
+        return onnxParser.operator bool();
+    }
+};
+
+struct BuildEnvironment
+{
+    TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+struct BuildEnvironment
+{
+    TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+    //! Parser that creates the network. Must be declared *after* network, so that when
+    //! ~BuildEnvironment() executes, the parser is destroyed before the network is destroyed.
+    Parser parser;
+    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
+    std::vector<uint8_t> engineBlob;
+};
+
+//!
+//! \brief Generate a network definition for a given model
+//!
+//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid
+//! parser (the returned parser converts to false if tested)
+//!
+//! Constant input dimensions in the model must not be changed in the corresponding
+//! network definition, because its correctness may rely on the constants.
+//!
+//! \see Parser::operator bool()
+//!
+Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err);
+
+//!
+//! \brief Set up network and config
+//!
+//! \return boolean Return true if network and config were successfully set
+//!
+bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
+    nvinfer1::INetworkDefinition& network, nvinfer1::IBuilderConfig& config, std::ostream& err,
+    std::vector<std::unique_ptr<nvinfer1::IHostMemory>>& sparseWeights);
+
+//!
+//! \brief Log refittable layers and weights of a refittable engine
+//!
+void dumpRefittable(nvinfer1::ICudaEngine& engine);
+
+//!
+//! \brief Load a serialized engine
+//!
+//! \return Pointer to the engine loaded or nullptr if the operation failed
+//!
+nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err);
+
+//!
+//! \brief Save an engine into a file
+//!
+//! \return boolean Return true if the engine was successfully saved
+//!
+bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err);
+
+//!
+//! \brief Create an engine from model or serialized file, and optionally save engine
+//!
+//! \return boolean Return true if the engine build environment was successfully created
+//!
+bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys,
+    BuildEnvironment& env, std::ostream& err);
+
+//!
+//! \brief Create an engine from model or serialized file, and optionally save engine
+//!
+//! \return Pointer to the engine created or nullptr if the creation failed
+//!
+inline TrtUniquePtr<nvinfer1::ICudaEngine> getEngine(
+    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err)
+{
+    BuildEnvironment env;
+    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    if (getEngineBuildEnv(model, build, sys, env, err))
+    {
+        engine.swap(env.engine);
+    }
+    return engine;
+}
+
+//!
+//! \brief Create a serialized network
+//!
+//! \return Pointer to a host memory for a serialized network
+//!
+nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
+    nvinfer1::INetworkDefinition& network, std::ostream& err);
+
+//!
+//! \brief Transfer model to a serialized network
+//!
+//! \return Pointer to a host memory for a serialized network
+//!
+nvinfer1::IHostMemory* modelToSerialized(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
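
Taken together, the declarations above support a short build path; a minimal sketch (illustrative only, not part of the patch; `buildFromOptions` is a hypothetical helper and option parsing is omitted):

```cpp
#include <iostream>

sample::TrtUniquePtr<nvinfer1::ICudaEngine> buildFromOptions(
    const sample::ModelOptions& model, const sample::BuildOptions& build, const sample::SystemOptions& sys)
{
    // getEngine() builds a BuildEnvironment internally and hands back only the engine.
    auto engine = sample::getEngine(model, build, sys, std::cerr);
    if (!engine)
    {
        std::cerr << "Engine creation failed" << std::endl;
    }
    return engine;
}
```
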
+//!
+//! \brief Serialize network and save it into a file
+//!
+//! \return boolean Return true if the network was successfully serialized and saved
+//!
+bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
+
+bool timeRefit(const nvinfer1::INetworkDefinition& network, nvinfer1::ICudaEngine& engine, bool multiThreading);
+
+//!
+//! \brief Set tensor scales from a calibration table
+//!
+void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector<IOFormat>& inputFormats,
+    const std::vector<IOFormat>& outputFormats, const std::string& calibrationFile);
+
+//!
+//! \brief Check if safe runtime is loaded.
+//!
+bool hasSafeRuntime();
+
+//!
+//! \brief Create a safe runtime object if the dynamic library is loaded.
+//!
+nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept;
+
+//!
+//! \brief Check if consistency checker is loaded.
+//!
+bool hasConsistencyChecker();
+
+//!
+//! \brief Create a consistency checker object if the dynamic library is loaded.
+//!
+nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker(
+    nvinfer1::ILogger& logger, nvinfer1::IHostMemory const* engine) noexcept;
+
+//!
+//! \brief Run consistency check on serialized engine.
+//!
+bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize);
+} // namespace sample
+
+#endif // TRT_SAMPLE_ENGINES_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp
similarity index 100%
rename from src/Detector/tensorrt_yolo/common/sampleInference.cpp
rename to src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h
new file mode 100644
index 00000000..1c21f592
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_INFERENCE_H
+#define TRT_SAMPLE_INFERENCE_H
+
+#include "sampleReporting.h"
+#include "sampleUtils.h"
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "NvInfer.h"
+
+#if (NV_TENSORRT_MAJOR > 7)
+
+#include "NvInferSafeRuntime.h"
+
+namespace sample
+{
+
+struct InferenceEnvironment
+{
+    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    std::unique_ptr<Profiler> profiler;
+    std::vector<TrtUniquePtr<nvinfer1::IExecutionContext>> context;
+    std::vector<std::unique_ptr<Bindings>> bindings;
+    bool error{false};
+
+    std::vector<uint8_t> engineBlob;
+
+    bool safe{false};
+    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
+    std::vector<std::unique_ptr<nvinfer1::safe::IExecutionContext>> safeContext;
+
+    template <class ContextType>
+    inline ContextType* getContext(int32_t streamIdx);
+};
+
+template <>
+inline nvinfer1::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
+{
+    return context[streamIdx].get();
+}
+
+template <>
+inline nvinfer1::safe::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
+{
+    return safeContext[streamIdx].get();
+}
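
The two `getContext<>()` specializations let per-stream code be written once and instantiated for either the standard or the safety runtime; a minimal sketch (illustrative only, not part of the patch; `contextsValid` is a hypothetical helper):

```cpp
template <class ContextType>
bool contextsValid(sample::InferenceEnvironment& iEnv, int32_t nbStreams)
{
    // ContextType is nvinfer1::IExecutionContext or nvinfer1::safe::IExecutionContext.
    for (int32_t s = 0; s < nbStreams; ++s)
    {
        if (iEnv.getContext<ContextType>(s) == nullptr)
        {
            return false; // context creation failed for this stream
        }
    }
    return true;
}
```

+//!
+//! \brief Set up contexts and bindings for inference
+//!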
+bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference); + +//! +//! \brief Deserialize the engine and time how long it takes. +//! +bool timeDeserialize(InferenceEnvironment& iEnv); + +//! +//! \brief Run inference and collect timing, return false if any error hit during inference +//! +bool runInference( + const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace); + +//! +//! \brief Get layer information of the engine. +//! +std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format); + +} // namespace sample + +#endif + +#endif // TRT_SAMPLE_INFERENCE_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp new file mode 100644 index 00000000..0afd163f --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp @@ -0,0 +1,1778 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" + +#include "logger.h" +#include "sampleOptions.h" + +namespace sample +{ + +namespace +{ + +std::vector splitToStringVec(const std::string& option, char separator) +{ + std::vector options; + + for (size_t start = 0; start < option.length();) + { + size_t separatorIndex = option.find(separator, start); + if (separatorIndex == std::string::npos) + { + separatorIndex = option.length(); + } + options.emplace_back(option.substr(start, separatorIndex - start)); + start = separatorIndex + 1; + } + + return options; +} + +template +T stringToValue(const std::string& option) +{ + return T{option}; +} + +template <> +int32_t stringToValue(const std::string& option) +{ + return std::stoi(option); +} + +template <> +float stringToValue(const std::string& option) +{ + return std::stof(option); +} + +template <> +double stringToValue(const std::string& option) +{ + return std::stod(option); +} + +template <> +bool stringToValue(const std::string& option) +{ + return true; +} + +template <> +std::vector stringToValue>(const std::string& option) +{ + std::vector shape; + std::vector dimsStrings = splitToStringVec(option, 'x'); + for (const auto& d : dimsStrings) + { + shape.push_back(stringToValue(d)); + } + return shape; +} + +template <> +nvinfer1::DataType stringToValue(const std::string& option) +{ + const std::unordered_map strToDT{{"fp32", nvinfer1::DataType::kFLOAT}, + {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8}, + {"int32", nvinfer1::DataType::kINT32}}; + const auto& dt = strToDT.find(option); + if (dt == strToDT.end()) + { + throw std::invalid_argument("Invalid DataType " + option); + } + return dt->second; +} + +template <> +nvinfer1::TensorFormats stringToValue(const std::string& option) +{ + std::vector optionStrings = splitToStringVec(option, '+'); + const std::unordered_map 
strToFmt{{"chw", nvinfer1::TensorFormat::kLINEAR}, + {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4}, + {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16}, + {"chw32", nvinfer1::TensorFormat::kCHW32}, {"dhwc8", nvinfer1::TensorFormat::kDHWC8}, + {"hwc", nvinfer1::TensorFormat::kHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR}, + {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}}; + nvinfer1::TensorFormats formats{}; + for (auto f : optionStrings) + { + const auto& tf = strToFmt.find(f); + if (tf == strToFmt.end()) + { + throw std::invalid_argument(std::string("Invalid TensorFormat ") + f); + } + formats |= 1U << static_cast(tf->second); + } + + return formats; +} + +template <> +IOFormat stringToValue(const std::string& option) +{ + IOFormat ioFormat{}; + const size_t colon = option.find(':'); + + if (colon == std::string::npos) + { + throw std::invalid_argument(std::string("Invalid IOFormat ") + option); + } + + ioFormat.first = stringToValue(option.substr(0, colon)); + ioFormat.second = stringToValue(option.substr(colon + 1)); + + return ioFormat; +} + +template +std::pair splitNameAndValue(const std::string& s) +{ + std::string tensorName; + std::string valueString; + // Split on the last : + std::vector nameRange{splitToStringVec(s, ':')}; + // Everything before the last : is the name + tensorName = nameRange[0]; + for (size_t i = 1; i < nameRange.size() - 1; i++) + { + tensorName += ":" + nameRange[i]; + } + // Value is the string element after the last : + valueString = nameRange[nameRange.size() - 1]; + return std::pair(tensorName, stringToValue(valueString)); +} + +template +void splitInsertKeyValue(const std::vector& kvList, T& map) +{ + for (const auto& kv : kvList) + { + map.insert(splitNameAndValue(kv)); + } +} + +const char* boolToEnabled(bool enable) +{ + return enable ? "Enabled" : "Disabled"; +} + +//! Check if input option exists in input arguments. +//! If it does: return its value, erase the argument and return true. +//! If it does not: return false. +template +bool getAndDelOption(Arguments& arguments, const std::string& option, T& value) +{ + const auto match = arguments.find(option); + if (match != arguments.end()) + { + value = stringToValue(match->second); + arguments.erase(match); + return true; + } + + return false; +} + +//! Check if input option exists in input arguments. +//! If it does: return false in value, erase the argument and return true. +//! If it does not: return false. +bool getAndDelNegOption(Arguments& arguments, const std::string& option, bool& value) +{ + bool dummy; + if (getAndDelOption(arguments, option, dummy)) + { + value = false; + return true; + } + return false; +} + +//! Check if input option exists in input arguments. +//! If it does: add all the matched arg values to values vector, erase the argument and return true. +//! If it does not: return false. 
+template +bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, std::vector& values) +{ + const auto match = arguments.equal_range(option); + if (match.first == match.second) + { + return false; + } + + auto addToValues = [&values](Arguments::value_type& argValue) {values.emplace_back(stringToValue(argValue.second));}; + std::for_each(match.first, match.second, addToValues); + arguments.erase(match.first, match.second); + + return true; +} + +void insertShapesBuild(std::unordered_map& shapes, nvinfer1::OptProfileSelector selector, const std::string& name, const std::vector& dims) +{ + shapes[name][static_cast(selector)] = dims; +} + +void insertShapesInference(std::unordered_map>& shapes, const std::string& name, const std::vector& dims) +{ + shapes[name] = dims; +} + +std::string removeSingleQuotationMarks(std::string& str) +{ + std::vector strList{splitToStringVec(str, '\'')}; + // Remove all the escaped single quotation marks + std::string retVal = ""; + // Do not really care about unterminated sequences + for (size_t i = 0; i < strList.size(); i++) + { + retVal += strList[i]; + } + return retVal; +} + +void getLayerPrecisions(Arguments& arguments, char const* argument, LayerPrecisions& layerPrecisions) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerPrecisions flag contains comma-separated layerName:precision pairs. + std::vector precisionList{splitToStringVec(list, ',')}; + for (auto const& s : precisionList) + { + auto namePrecisionPair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); + layerPrecisions[layerName] = namePrecisionPair.second; + } +} + +void getLayerOutputTypes(Arguments& arguments, char const* argument, LayerOutputTypes& layerOutputTypes) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerOutputTypes flag contains comma-separated layerName:types pairs. 
+ std::vector precisionList{splitToStringVec(list, ',')}; + for (auto const& s : precisionList) + { + auto namePrecisionPair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); + auto const typeStrings = splitToStringVec(namePrecisionPair.second, '+'); + std::vector typeVec(typeStrings.size(), nvinfer1::DataType::kFLOAT); + std::transform(typeStrings.begin(), typeStrings.end(), typeVec.begin(), stringToValue); + layerOutputTypes[layerName] = typeVec; + } +} + +bool getShapesBuild(Arguments& arguments, std::unordered_map& shapes, char const* argument, + nvinfer1::OptProfileSelector selector) +{ + std::string list; + bool retVal = getAndDelOption(arguments, argument, list); + std::vector shapeList{splitToStringVec(list, ',')}; + for (const auto& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesBuild(shapes, selector, tensorName, dims); + } + return retVal; +} + +bool getShapesInference(Arguments& arguments, std::unordered_map>& shapes, const char* argument) +{ + std::string list; + bool retVal = getAndDelOption(arguments, argument, list); + std::vector shapeList{splitToStringVec(list, ',')}; + for (const auto& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesInference(shapes, tensorName, dims); + } + return retVal; +} + +void processShapes(std::unordered_map& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +{ + // Only accept optShapes only or all three of minShapes, optShapes, maxShapes + if ( ((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes + || (minShapes && !maxShapes && optShapes) // both minShapes and optShapes + || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes + { + if (calib) + { + throw std::invalid_argument("Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); + } + else + { + throw std::invalid_argument("Must specify only --optShapes or all of --minShapes, --optShapes, --maxShapes"); + } + } + + // If optShapes only, expand optShapes to minShapes and maxShapes + if (optShapes && !minShapes && !maxShapes) + { + std::unordered_map newShapes; + for (auto& s : shapes) + { + insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + } + shapes = newShapes; + } +} + +template +void printShapes(std::ostream& os, const char* phase, const T& shapes) +{ + if (shapes.empty()) + { + os << "Input " << phase << " shapes: model" << std::endl; + } + else + { + for (const auto& s : shapes) + { + os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl; + } + } +} + +std::ostream& printBatch(std::ostream& os, int32_t maxBatch) +{ + if (maxBatch != maxBatchNotProvided) + { + os << maxBatch; + } + else + { + os << "explicit batch"; + } + return os; +} + +std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources 
disabledSources) +{ + if (!enabledSources && !disabledSources) + { + os << "Using default tactic sources"; + } + else + { + auto const addSource = [&](uint32_t source, std::string const& name) { + if (enabledSources & source) + { + os << name << " [ON], "; + } + else if (disabledSources & source) + { + os << name << " [OFF], "; + } + }; + + addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS), "cublas"); + addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT), "cublasLt"); +#if (NV_TENSORRT_MAJOR > 7) + addSource(1U << static_cast(nvinfer1::TacticSource::kCUDNN), "cudnn"); +#endif + } + return os; +} + +std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) +{ + os << "FP32"; + if (options.fp16) + { + os << "+FP16"; + } + if (options.int8) + { + os << "+INT8"; + } + if (options.precisionConstraints == PrecisionConstraints::kOBEY) + { + os << " (obey precision constraints)"; + } + if (options.precisionConstraints == PrecisionConstraints::kPREFER) + { + os << " (prefer precision constraints)"; + } + return os; +} + +std::ostream& printTimingCache(std::ostream& os, BuildOptions const& options) +{ + switch (options.timingCacheMode) + { + case TimingCacheMode::kGLOBAL: os << "global"; break; + case TimingCacheMode::kLOCAL: os << "local"; break; + case TimingCacheMode::kDISABLE: os << "disable"; break; + } + return os; +} + +std::ostream& printSparsity(std::ostream& os, BuildOptions const& options) +{ + switch (options.sparsity) + { + case SparsityFlag::kDISABLE: os << "Disabled"; break; + case SparsityFlag::kENABLE: os << "Enabled"; break; + case SparsityFlag::kFORCE: os << "Forced"; break; + } + + return os; +} + +std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options) +{ + auto const printValueOrDefault = [&os](double const val) { + if (val >= 0) + { + os << val << " MiB"; + } + else + { + os << "default"; + } + }; + os << "workspace: "; printValueOrDefault(options.workspace); os << ", "; + os << "dlaSRAM: "; printValueOrDefault(options.dlaSRAM); os << ", "; + os << "dlaLocalDRAM: "; printValueOrDefault(options.dlaLocalDRAM); os << ", "; + os << "dlaGlobalDRAM: "; printValueOrDefault(options.dlaGlobalDRAM); + return os; +} + +} // namespace + +Arguments argsToArgumentsMap(int32_t argc, char* argv[]) +{ + Arguments arguments; + for (int32_t i = 1; i < argc; ++i) + { + auto valuePtr = strchr(argv[i], '='); + if (valuePtr) + { + std::string value{valuePtr + 1}; + arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); + } + else + { + arguments.emplace(argv[i], ""); + } + } + return arguments; +} + +void BaseModelOptions::parse(Arguments& arguments) +{ + if (getAndDelOption(arguments, "--onnx", model)) + { + format = ModelFormat::kONNX; + } + else if (getAndDelOption(arguments, "--uff", model)) + { + format = ModelFormat::kUFF; + } + else if (getAndDelOption(arguments, "--model", model)) + { + format = ModelFormat::kCAFFE; + } +} + +void UffInput::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--uffNHWC", NHWC); + std::vector args; + if (getAndDelRepeatedOption(arguments, "--uffInput", args)) + { + for (const auto& i : args) + { + std::vector values{splitToStringVec(i, ',')}; + if (values.size() == 4) + { + nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])}; + inputs.emplace_back(values[0], dims); + } + else + { + throw std::invalid_argument(std::string("Invalid uffInput ") + i); + } + } + } +} + +void ModelOptions::parse(Arguments& arguments) +{ + 
baseModel.parse(arguments); + + switch (baseModel.format) + { + case ModelFormat::kCAFFE: + { + getAndDelOption(arguments, "--deploy", prototxt); + break; + } + case ModelFormat::kUFF: + { + uffInputs.parse(arguments); + if (uffInputs.inputs.empty()) + { + throw std::invalid_argument("Uff models require at least one input"); + } + break; + } + case ModelFormat::kONNX: + break; + case ModelFormat::kANY: + { + if (getAndDelOption(arguments, "--deploy", prototxt)) + { + baseModel.format = ModelFormat::kCAFFE; + } + break; + } + } + + // The --output flag should only be used with Caffe and UFF. It has no effect on ONNX. + std::vector outArgs; + if (getAndDelRepeatedOption(arguments, "--output", outArgs)) + { + for (const auto& o : outArgs) + { + for (auto& v : splitToStringVec(o, ',')) + { + outputs.emplace_back(std::move(v)); + } + } + } + if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF) + { + if (outputs.empty()) + { + throw std::invalid_argument("Caffe and Uff models require at least one output"); + } + } + else if (baseModel.format == ModelFormat::kONNX) + { + if (!outputs.empty()) + { + throw std::invalid_argument("The --output flag should not be used with ONNX models."); + } + } +} + +void BuildOptions::parse(Arguments& arguments) +{ + auto getFormats = [&arguments](std::vector& formatsVector, const char* argument) { + std::string list; + getAndDelOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) + { + formatsVector.push_back(stringToValue(f)); + } + }; + + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + + bool addedExplicitBatchFlag{false}; + getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag); + if (addedExplicitBatchFlag) + { + sample::gLogWarning << "--explicitBatch flag has been deprecated and has no effect!" << std::endl; + sample::gLogWarning << "Explicit batch dim is automatically enabled if input model is ONNX or if dynamic " + << "shapes are provided when the engine is built." << std::endl; + } + + bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + processShapes(shapes, minShapes, optShapes, maxShapes, false); + bool minShapesCalib + = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); + bool optShapesCalib + = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); + bool maxShapesCalib + = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); + processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); + + bool addedExplicitPrecisionFlag{false}; + getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag); + if (addedExplicitPrecisionFlag) + { + sample::gLogWarning << "--explicitPrecision flag has been deprecated and has no effect!" << std::endl; + } + + if (getAndDelOption(arguments, "--workspace", workspace)) + { + sample::gLogWarning << "--workspace flag has been deprecated by --memPoolSize flag." 
<< std::endl; + } + + std::string memPoolSizes; + getAndDelOption(arguments, "--memPoolSize", memPoolSizes); + std::vector memPoolSpecs{splitToStringVec(memPoolSizes, ',')}; + for (auto const& memPoolSpec : memPoolSpecs) + { + std::string memPoolName; + double memPoolSize; + std::tie(memPoolName, memPoolSize) = splitNameAndValue(memPoolSpec); + if (memPoolSize < 0) + { + throw std::invalid_argument(std::string("Negative memory pool size: ") + std::to_string(memPoolSize)); + } + if (memPoolName == "workspace") + { + workspace = memPoolSize; + } + else if (memPoolName == "dlaSRAM") + { + dlaSRAM = memPoolSize; + } + else if (memPoolName == "dlaLocalDRAM") + { + dlaLocalDRAM = memPoolSize; + } + else if (memPoolName == "dlaGlobalDRAM") + { + dlaGlobalDRAM = memPoolSize; + } + else if (!memPoolName.empty()) + { + throw std::invalid_argument(std::string("Unknown memory pool: ") + memPoolName); + } + } + + getAndDelOption(arguments, "--maxBatch", maxBatch); + getAndDelOption(arguments, "--minTiming", minTiming); + getAndDelOption(arguments, "--avgTiming", avgTiming); + + bool best{false}; + getAndDelOption(arguments, "--best", best); + if (best) + { + int8 = true; + fp16 = true; + } + + getAndDelOption(arguments, "--refit", refittable); + getAndDelNegOption(arguments, "--noTF32", tf32); + getAndDelOption(arguments, "--fp16", fp16); + getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--safe", safe); + getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--restricted", restricted); + + getAndDelOption(arguments, "--directIO", directIO); + + std::string precisionConstraintsString; + getAndDelOption(arguments, "--precisionConstraints", precisionConstraintsString); + if (!precisionConstraintsString.empty()) + { + const std::unordered_map precisionConstraintsMap + = {{"obey", PrecisionConstraints::kOBEY}, {"prefer", PrecisionConstraints::kPREFER}, + {"none", PrecisionConstraints::kNONE}}; + auto it = precisionConstraintsMap.find(precisionConstraintsString); + if (it == precisionConstraintsMap.end()) + { + throw std::invalid_argument(std::string("Unknown precision constraints: ") + precisionConstraintsString); + } + precisionConstraints = it->second; + } + else + { + precisionConstraints = PrecisionConstraints::kNONE; + } + + getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions); + getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes); + + if (layerPrecisions.empty() && layerOutputTypes.empty() && precisionConstraints != PrecisionConstraints::kNONE) + { + sample::gLogWarning << "When --precisionConstraints flag is set to \"obey\" or \"prefer\", please add " + << "--layerPrecision/--layerOutputTypes flags to set layer-wise precisions and output " + << "types." << std::endl; + } + else if ((!layerPrecisions.empty() || !layerOutputTypes.empty()) + && precisionConstraints == PrecisionConstraints::kNONE) + { + sample::gLogWarning << "--layerPrecision/--layerOutputTypes flags have no effect when --precisionConstraints " + << "flag is set to \"none\"." 
<< std::endl; + } + + std::string sparsityString; + getAndDelOption(arguments, "--sparsity", sparsityString); + if (sparsityString == "disable") + { + sparsity = SparsityFlag::kDISABLE; + } + else if (sparsityString == "enable") + { + sparsity = SparsityFlag::kENABLE; + } + else if (sparsityString == "force") + { + sparsity = SparsityFlag::kFORCE; + } + else if (!sparsityString.empty()) + { + throw std::invalid_argument(std::string("Unknown sparsity mode: ") + sparsityString); + } + + bool calibCheck = getAndDelOption(arguments, "--calib", calibration); + if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty()) + { + shapesCalib = shapes; + } + + std::string profilingVerbosityString; + if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString)) + { + sample::gLogWarning << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." << std::endl; + } + + getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString); + if (profilingVerbosityString == "layer_names_only") + { +#if (NV_TENSORRT_MAJOR > 7) + profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; +#else + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; +#endif + } + else if (profilingVerbosityString == "none") + { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE; + } +#if (NV_TENSORRT_MAJOR > 7) + else if (profilingVerbosityString == "detailed") + { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; + } +#endif + else if (profilingVerbosityString == "default") + { +#if (NV_TENSORRT_MAJOR > 7) + sample::gLogWarning << "--profilingVerbosity=default has been deprecated by " + "--profilingVerbosity=layer_names_only." + << std::endl; + profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; +#else + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; +#endif + } + else if (profilingVerbosityString == "verbose") + { +#if (NV_TENSORRT_MAJOR > 7) + sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated by --profilingVerbosity=detailed." 
+ << std::endl; + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; +#else + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; +#endif + } + else if (!profilingVerbosityString.empty()) + { + throw std::invalid_argument(std::string("Unknown profilingVerbosity: ") + profilingVerbosityString); + } + + if (getAndDelOption(arguments, "--loadEngine", engine)) + { + load = true; + } + if (getAndDelOption(arguments, "--saveEngine", engine)) + { + save = true; + } + if (load && save) + { + throw std::invalid_argument("Incompatible load and save engine options selected"); + } + + std::string tacticSourceArgs; + if (getAndDelOption(arguments, "--tacticSources", tacticSourceArgs)) + { + std::vector tacticList = splitToStringVec(tacticSourceArgs, ','); + for (auto& t : tacticList) + { + bool enable{false}; + if (t.front() == '+') + { + enable = true; + } + else if (t.front() != '-') + { + throw std::invalid_argument( + "Tactic source must be prefixed with + or -, indicating whether it should be enabled or disabled " + "respectively."); + } + t.erase(0, 1); + + const auto toUpper = [](std::string& sourceName) { + std::transform( + sourceName.begin(), sourceName.end(), sourceName.begin(), [](char c) { return std::toupper(c); }); + return sourceName; + }; + + nvinfer1::TacticSource source{}; + t = toUpper(t); + if (t == "CUBLAS") + { + source = nvinfer1::TacticSource::kCUBLAS; + } + else if (t == "CUBLASLT" || t == "CUBLAS_LT") + { + source = nvinfer1::TacticSource::kCUBLAS_LT; + } +#if (NV_TENSORRT_MAJOR > 7) + else if (t == "CUDNN") + { + source = nvinfer1::TacticSource::kCUDNN; + } +#endif + else + { + throw std::invalid_argument(std::string("Unknown tactic source: ") + t); + } + + uint32_t sourceBit = 1U << static_cast(source); + + if (enable) + { + enabledTactics |= sourceBit; + } + else + { + disabledTactics |= sourceBit; + } + + if (enabledTactics & disabledTactics) + { + throw std::invalid_argument(std::string("Cannot enable and disable ") + t); + } + } + } + + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } +} + +void SystemOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--device", device); + getAndDelOption(arguments, "--useDLACore", DLACore); + getAndDelOption(arguments, "--allowGPUFallback", fallback); + std::string pluginName; + while (getAndDelOption(arguments, "--plugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +} + +void InferenceOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--streams", streams); + getAndDelOption(arguments, "--iterations", iterations); + getAndDelOption(arguments, "--duration", duration); + getAndDelOption(arguments, "--warmUp", warmup); + getAndDelOption(arguments, "--sleepTime", sleep); + getAndDelOption(arguments, "--idleTime", idle); + bool exposeDMA{false}; + if (getAndDelOption(arguments, "--exposeDMA", exposeDMA)) + { + overlap = !exposeDMA; + } + getAndDelOption(arguments, "--noDataTransfers", skipTransfers); + getAndDelOption(arguments, "--useManagedMemory", useManaged); + getAndDelOption(arguments, "--useSpinWait", spin); + getAndDelOption(arguments, "--threads", threads); + getAndDelOption(arguments, "--useCudaGraph", graph); + 
getAndDelOption(arguments, "--separateProfileRun", rerun); + getAndDelOption(arguments, "--buildOnly", skip); + getAndDelOption(arguments, "--timeDeserialize", timeDeserialize); + getAndDelOption(arguments, "--timeRefit", timeRefit); + + std::string list; + getAndDelOption(arguments, "--loadInputs", list); + std::vector inputsList{splitToStringVec(list, ',')}; + splitInsertKeyValue(inputsList, inputs); + + getShapesInference(arguments, shapes, "--shapes"); + getAndDelOption(arguments, "--batch", batch); +} + +void ReportingOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--percentile", percentile); + getAndDelOption(arguments, "--avgRuns", avgs); + getAndDelOption(arguments, "--verbose", verbose); + getAndDelOption(arguments, "--dumpRefit", refit); + getAndDelOption(arguments, "--dumpOutput", output); + getAndDelOption(arguments, "--dumpProfile", profile); + getAndDelOption(arguments, "--dumpLayerInfo", layerInfo); + getAndDelOption(arguments, "--exportTimes", exportTimes); + getAndDelOption(arguments, "--exportOutput", exportOutput); + getAndDelOption(arguments, "--exportProfile", exportProfile); + getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo); + if (percentile < 0 || percentile > 100) + { + throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + } +} + +bool parseHelp(Arguments& arguments) +{ + bool helpLong{false}; + bool helpShort{false}; + getAndDelOption(arguments, "--help", helpLong); + getAndDelOption(arguments, "-h", helpShort); + return helpLong || helpShort; +} + +void AllOptions::parse(Arguments& arguments) +{ + model.parse(arguments); + build.parse(arguments); + system.parse(arguments); + inference.parse(arguments); + + // Use explicitBatch when input model is ONNX or when dynamic shapes are used. + const bool isOnnx{model.baseModel.format == ModelFormat::kONNX}; + const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()}; + const bool detectedExplicitBatch = isOnnx || hasDynamicShapes; + + // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim. + const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided}; + const bool batchWasSet{inference.batch != batchNotProvided}; + if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet)) + { + throw std::invalid_argument( + "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes " + "are provided. Please use --optShapes and --shapes to set input shapes instead."); + } + + // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values. + if (!detectedExplicitBatch) + { + // If batch is not set, set it to default value. + if (!batchWasSet) + { + inference.batch = defaultBatch; + } + // If maxBatch is not set, set it to be equal to batch. + if (!maxBatchWasSet) + { + build.maxBatch = inference.batch; + } + // MaxBatch should not be less than batch. + if (build.maxBatch < inference.batch) + { + throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch) + + " is less than inference batch " + std::to_string(inference.batch)); + } + } + + if (build.shapes.empty() && !inference.shapes.empty()) + { + // If --shapes are provided but --optShapes are not, assume that optShapes is the same as shapes. 
+ for (auto& s : inference.shapes) + { + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second); + } + } + else if (!build.shapes.empty() && inference.shapes.empty()) + { + // If --optShapes are provided but --shapes are not, assume that shapes is the same as optShapes. + for (auto& s : build.shapes) + { + insertShapesInference( + inference.shapes, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + } + } + + reporting.parse(arguments); + helps = parseHelp(arguments); + + if (!helps) + { + if (!build.load && model.baseModel.format == ModelFormat::kANY) + { + throw std::invalid_argument("Model missing or format not recognized"); + } + if (build.safe && system.DLACore >= 0) + { + auto checkSafeDLAFormats = [](std::vector const& fmt) { + return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [](IOFormat const& pair) { + bool supported{false}; + bool const isLINEAR{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kLINEAR)}; + bool const isCHW4{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW4)}; + bool const isCHW32{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW32)}; + bool const isCHW16{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW16)}; + supported |= pair.first == nvinfer1::DataType::kINT8 && (isLINEAR || isCHW4 || isCHW32); + supported |= pair.first == nvinfer1::DataType::kHALF && (isLINEAR || isCHW4 || isCHW16); + return supported; + }); + }; + if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.outputFormats)) + { + throw std::invalid_argument( + "I/O formats for safe DLA capability are restricted to fp16/int8:linear, fp16:chw16 or int8:chw32"); + } + if (system.fallback) + { + throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability"); + } + } + } +} + +void SafeBuilderOptions::parse(Arguments& arguments) +{ + auto getFormats = [&arguments](std::vector& formatsVector, const char* argument) { + std::string list; + getAndDelOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) + { + formatsVector.push_back(stringToValue(f)); + } + }; + + getAndDelOption(arguments, "--serialized", serialized); + getAndDelOption(arguments, "--onnx", onnxModelFile); + getAndDelOption(arguments, "--help", help); + getAndDelOption(arguments, "-h", help); + getAndDelOption(arguments, "--verbose", verbose); + getAndDelOption(arguments, "-v", verbose); + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--calib", calibFile); + getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--std", standard); + std::string pluginName; + while (getAndDelOption(arguments, "--plugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +} + +std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) +{ + os << "=== Model Options ===" << std::endl; + + os << "Format: "; + switch (options.format) + { + case ModelFormat::kCAFFE: + { + os << "Caffe"; + break; + } + case ModelFormat::kONNX: + { + os << "ONNX"; + break; + } + case ModelFormat::kUFF: + { + os << "UFF"; + break; + } + case ModelFormat::kANY: + os << "*"; + break; + } + os << 
std::endl << "Model: " << options.model << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, const UffInput& input) +{ + os << "Uff Inputs Layout: " << (input.NHWC ? "NHWC" : "NCHW") << std::endl; + for (const auto& i : input.inputs) + { + os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ModelOptions& options) +{ + os << options.baseModel; + switch (options.baseModel.format) + { + case ModelFormat::kCAFFE: + { + os << "Prototxt: " << options.prototxt << std::endl; + break; + } + case ModelFormat::kUFF: + { + os << options.uffInputs; + break; + } + case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case + case ModelFormat::kANY: + break; + } + + os << "Output:"; + for (const auto& o : options.outputs) + { + os << " " << o; + } + os << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) +{ + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + { + os << "fp32"; + break; + } + case nvinfer1::DataType::kHALF: + { + os << "fp16"; + break; + } + case nvinfer1::DataType::kINT8: + { + os << "int8"; + break; + } + case nvinfer1::DataType::kINT32: + { + os << "int32"; + break; + } + case nvinfer1::DataType::kBOOL: + { + os << "bool"; + break; + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, IOFormat const& format) +{ + os << format.first << ":"; + + for (int32_t f = 0; f < nvinfer1::EnumMax(); ++f) + { + if ((1U << f) & format.second) + { + if (f) + { + os << "+"; + } + switch (nvinfer1::TensorFormat(f)) + { + case nvinfer1::TensorFormat::kLINEAR: + { + os << "chw"; + break; + } + case nvinfer1::TensorFormat::kCHW2: + { + os << "chw2"; + break; + } + case nvinfer1::TensorFormat::kHWC8: + { + os << "hwc8"; + break; + } +#if (NV_TENSORRT_MAJOR > 7) + case nvinfer1::TensorFormat::kHWC16: + { + os << "hwc16"; + break; + } +#endif + case nvinfer1::TensorFormat::kCHW4: + { + os << "chw4"; + break; + } + case nvinfer1::TensorFormat::kCHW16: + { + os << "chw16"; + break; + } + case nvinfer1::TensorFormat::kCHW32: + { + os << "chw32"; + break; + } + case nvinfer1::TensorFormat::kDHWC8: + { + os << "dhwc8"; + break; + } + case nvinfer1::TensorFormat::kCDHW32: + { + os << "cdhw32"; + break; + } + case nvinfer1::TensorFormat::kHWC: + { + os << "hwc"; + break; + } + case nvinfer1::TensorFormat::kDLA_LINEAR: + { + os << "dla_linear"; + break; + } + case nvinfer1::TensorFormat::kDLA_HWC4: + { + os << "dla_hwc4"; + break; + } + } + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) +{ + int32_t i = 0; + for (const auto& d : dims) + { + if (!d.size()) + { + break; + } + os << (i ? "+" : "") << d; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, LayerPrecisions const& layerPrecisions) +{ + int32_t i = 0; + for (auto const& layerPrecision : layerPrecisions) + { + os << (i ? 
"," : "") << layerPrecision.first << ":" << layerPrecision.second; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const BuildOptions& options) +{ + // clang-format off + os << "=== Build Options ===" << std::endl << + + "Max batch: "; printBatch(os, options.maxBatch) << std::endl << + "Memory Pools: "; printMemoryPools(os, options) << std::endl << + "minTiming: " << options.minTiming << std::endl << + "avgTiming: " << options.avgTiming << std::endl << + "Precision: "; printPrecision(os, options) << std::endl << + "LayerPrecisions: " << options.layerPrecisions << std::endl << + "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << + "Refit: " << boolToEnabled(options.refittable) << std::endl << + "Sparsity: "; printSparsity(os, options) << std::endl << + "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << + "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << + "Save engine: " << (options.save ? options.engine : "") << std::endl << + "Load engine: " << (options.load ? options.engine : "") << std::endl << + "Profiling verbosity: " << static_cast(options.profilingVerbosity) << std::endl << + "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << + "timingCacheMode: "; printTimingCache(os, options) << std::endl << + "timingCacheFile: " << options.timingCacheFile << std::endl; + // clang-format on + + auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { + if (formats.empty()) + { + os << direction << "s format: fp32:CHW" << std::endl; + } + else + { + for(const auto& f : formats) + { + os << direction << ": " << f << std::endl; + } + } + }; + + printIOFormats(os, "Input(s)", options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + printShapes(os, "build", options.shapes); + printShapes(os, "calibration", options.shapesCalib); + + return os; +} + +std::ostream& operator<<(std::ostream& os, const SystemOptions& options) +{ + // clang-format off + os << "=== System Options ===" << std::endl << + + "Device: " << options.device << std::endl << + "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << + (options.DLACore != -1 && options.fallback ? 
"(With GPU fallback)" : "") << std::endl; + os << "Plugins:"; + + for (const auto& p : options.plugins) + { + os << " " << p; + } + os << std::endl; + + return os; + // clang-format on +} + +std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) +{ +// clang-format off + os << "=== Inference Options ===" << std::endl << + + "Batch: "; + if (options.batch && options.shapes.empty()) + { + os << options.batch << std::endl; + } + else + { + os << "Explicit" << std::endl; + } + printShapes(os, "inference", options.shapes); + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Idle time: " << options.idle << "ms" << std::endl << + "Streams: " << options.streams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << + "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << + "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << + "Skip inference: " << boolToEnabled(options.skip) << std::endl; + +// clang-format on + os << "Inputs:" << std::endl; + for (const auto& input : options.inputs) + { + os << input.first << "<-" << input.second << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) +{ +// clang-format off + os << "=== Reporting Options ===" << std::endl << + + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentile: " << options.percentile << std::endl << + "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export output to JSON file: " << options.exportOutput << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; +// clang-format on + + return os; +} + +std::ostream& operator<<(std::ostream& os, const AllOptions& options) +{ + os << options.model << options.build << options.system << options.inference << options.reporting << std::endl; + return os; +} + +std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) +{ + auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { + if (formats.empty()) + { + os << direction << "s format: fp32:CHW" << std::endl; + } + else + { + for(const auto& f : formats) + { + os << direction << ": " << f << std::endl; + } + } + }; + + os << "=== Build Options ===" << std::endl; + os << "Model ONNX: " << options.onnxModelFile << std::endl; + + os << "Precision: FP16"; + if (options.int8) + { + os << " + INT8"; + } + os << std::endl; + os << "Calibration file: " << options.calibFile << std::endl; + os << "Serialized Network: " << options.serialized << std::endl; + + printIOFormats(os, "Input(s)", options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + + os << "Plugins:"; + for (const auto& 
p : options.plugins) + { + os << " " << p; + } + os << std::endl; + return os; +} + +void BaseModelOptions::help(std::ostream& os) +{ +// clang-format off + os << " --uff= UFF model" << std::endl << + " --onnx= ONNX model" << std::endl << + " --model= Caffe model (default = no model, random weights used)" << std::endl; +// clang-format on +} + +void UffInput::help(std::ostream& os) +{ +// clang-format off + os << " --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " + "multiple times; at least one is required for UFF models" << std::endl << + " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << + "X,Y,Z=H,W,C order in --uffInput)" << std::endl; +// clang-format on +} + +void ModelOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== Model Options ===" << std::endl; + BaseModelOptions::help(os); + os << " --deploy= Caffe prototxt file" << std::endl << + " --output=[,]* Output names (it can be specified multiple times); at least one output " + "is required for UFF and Caffe" << std::endl; + UffInput::help(os); +// clang-format on +} + +void BuildOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== Build Options ===" "\n" + " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" + " This option should not be used when the input model is ONNX or when dynamic shapes are provided." "\n" + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" + " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" + " Note: All three of min, opt and max shapes must be supplied." "\n" + " However, if only opt shapes is supplied then it will be expanded so" "\n" + " that min shapes and max shapes are set to the same values as opt shapes." "\n" + " Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." "\n" + " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" + " Each input shape is supplied as a key-value pair where key is the input name and" "\n" + " value is the dimensions (including the batch dimension) to be used for that input." "\n" + " Each key-value pair has the key and value separated using a colon (:)." "\n" + " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" + " See --outputIOFormats help for the grammar of type and format list." "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " inputs following the same order as network inputs ID (even if only one input" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." 
"\n" + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " outputs following the same order as network outputs ID (even if only one output" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + " IO Formats: spec ::= IOfmt[\",\"spec]" "\n" + " IOfmt ::= type:fmt" "\n" + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" + " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" "\n" + " --workspace=N Set workspace size in MiB." "\n" + " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" + " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" + " Pool constraint: poolspec ::= poolfmt[\",\"poolspec]" "\n" + " poolfmt ::= pool:sizeInMiB" "\n" + " pool ::= \"workspace\"|\"dlaSRAM\"|\"dlaLocalDRAM\"|\"dlaGlobalDRAM\"" "\n" + " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" + " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " + << defaultMinTiming << ")" "\n" + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" "\n" + " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" + " and weights within the engine." "\n" + " --sparsity=spec Control sparsity (default = disabled). " "\n" + " Sparsity: spec ::= \"disable\", \"enable\", \"force\"" "\n" + " Note: Description about each of these options is as below" "\n" + " disable = do not enable sparse tactics in the builder (this is the default)" "\n" + " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" + " considered if the weights have the right sparsity pattern)" "\n" + " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" + " a sparsity pattern (even if you loaded a model yourself)" "\n" + " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" + " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" + " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" + " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" + " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" + " Precision Constaints: spec ::= \"none\" | \"obey\" | \"prefer\"" "\n" + " none = no constraints" "\n" + " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" + " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" + " otherwise" "\n" + " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" + " \"obey\" or \"prefer\". (default = none)" "\n" + " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" + " layerName to specify the default precision for all the unspecified layers." 
"\n" + " Per-layer precision spec ::= layerPrecision[\",\"spec]" "\n" + " layerPrecision ::= layerName\":\"precision" "\n" + " precision ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" + " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" + " \"obey\" or \"prefer\". (default = none)" "\n" + " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" + " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" + " one output, then multiple types separated by \"+\" can be provided for this layer." "\n" + " Per-layer output type spec ::= layerOutputTypes[\",\"spec]" "\n" + " layerOutputTypes ::= layerName\":\"type" "\n" + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"[\"+\"type]" "\n" + " --calib= Read INT8 calibration cache file" "\n" + " --safe Enable build safety certified engine" "\n" + " --consistency Perform consistency checking on safety certified engine" "\n" + " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" + " --saveEngine= Save the serialized engine" "\n" + " --loadEngine= Load a serialized engine" "\n" + " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" + " tactic sources (default = all available tactics)." "\n" + " Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics." "\n" + " Tactic Sources: tactics ::= [\",\"tactic]" "\n" + " tactic ::= (+|-)lib" "\n" + " lib ::= \"CUBLAS\"|\"CUBLAS_LT\"|\"CUDNN\"" "\n" + " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" + " --timingCacheFile= Save/load the serialized global timing cache" "\n" + ; +// clang-format on + os << std::flush; +} + +void SystemOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== System Options ===" << std::endl << + " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << + " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " + "(default = disabled)" << std::endl; + os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; +// clang-format on +} + +void InferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Inference Options ===" << std::endl << + " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << + " shapes are provided when the engine is built." << std::endl << + " --shapes=spec Set input shapes for dynamic shapes inference inputs." << std::endl << + " Note: Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." << std::endl << + " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << + " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << + " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << + " Each key-value pair has the key and value separated using a colon (:)." 
<< std::endl <<
+          "                              Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl <<
+          "  --loadInputs=spec           Load input values from files (default = generate random inputs). Input names can be "
+          "wrapped with single quotes (ex: 'Input:0')" << std::endl <<
+          "                              Input values spec ::= Ival[\",\"spec]" << std::endl <<
+          "                                           Ival  ::= name\":\"file" << std::endl <<
+          "  --iterations=N              Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl <<
+          "  --warmUp=N                  Run for N milliseconds to warm up before measuring performance (default = "
+          << defaultWarmUp << ")" << std::endl <<
+          "  --duration=N                Run performance measurements for at least N seconds wallclock time (default = "
+          << defaultDuration << ")" << std::endl <<
+          "  --sleepTime=N               Delay inference start with a gap of N milliseconds between launch and compute "
+          "(default = " << defaultSleep << ")" << std::endl <<
+          "  --idleTime=N                Sleep N milliseconds between two continuous iterations "
+          "(default = " << defaultIdle << ")" << std::endl <<
+          "  --streams=N                 Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl <<
+          "  --exposeDMA                 Serialize DMA transfers to and from device (default = disabled)." << std::endl <<
+          "  --noDataTransfers           Disable DMA transfers to and from device (default = enabled)." << std::endl <<
+          "  --useManagedMemory          Use managed memory instead of separate host and device allocations (default = disabled)." << std::endl <<
+          "  --useSpinWait               Actively synchronize on GPU events. This option may decrease synchronization time but "
+          "increase CPU usage and power (default = disabled)" << std::endl <<
+          "  --threads                   Enable multithreading to drive engines with independent threads"
+          " or speed up refitting (default = disabled) " << std::endl <<
+          "  --useCudaGraph              Use CUDA graph to capture engine execution and then launch inference (default = disabled)." << std::endl <<
+          "                              This flag may be ignored if the graph capture fails." << std::endl <<
+          "  --timeDeserialize           Time the amount of time it takes to deserialize the network and exit." << std::endl <<
+          "  --timeRefit                 Time the amount of time it takes to refit the engine before inference."
<< std::endl <<
+          "  --separateProfileRun        Do not attach the profiler in the benchmark run; if profiling is enabled, a second "
+          "profile run will be executed (default = disabled)" << std::endl <<
+          "  --buildOnly                 Skip inference perf measurement (default = disabled)" << std::endl;
+    // clang-format on
+}
+
+void ReportingOptions::help(std::ostream& os)
+{
+// clang-format off
+    os << "=== Reporting Options ===" << std::endl <<
+          "  --verbose                   Use verbose logging (default = false)" << std::endl <<
+          "  --avgRuns=N                 Report performance measurements averaged over N consecutive "
+          "iterations (default = " << defaultAvgRuns << ")" << std::endl <<
+          "  --percentile=P              Report performance for the P percentage (0<=P<=100, 0 "
+          "representing max perf, and 100 representing min perf) (default"
+          " = " << defaultPercentile << "%)" << std::endl <<
+          "  --dumpRefit                 Print the refittable layers and weights from a refittable "
+          "engine" << std::endl <<
+          "  --dumpOutput                Print the output tensor(s) of the last inference iteration "
+          "(default = disabled)" << std::endl <<
+          "  --dumpProfile               Print profile information per layer (default = disabled)" << std::endl <<
+          "  --dumpLayerInfo             Print layer information of the engine to console "
+          "(default = disabled)" << std::endl <<
+          "  --exportTimes=<file>        Write the timing results in a json file (default = disabled)" << std::endl <<
+          "  --exportOutput=<file>       Write the output tensors to a json file (default = disabled)" << std::endl <<
+          "  --exportProfile=<file>      Write the profile information per layer in a json file "
+          "(default = disabled)" << std::endl <<
+          "  --exportLayerInfo=<file>    Write the layer information of the engine in a json file "
+          "(default = disabled)" << std::endl;
+// clang-format on
+}
+
+void helpHelp(std::ostream& os)
+{
+// clang-format off
+    os << "=== Help ===" << std::endl <<
+          "  --help, -h                  Print this message" << std::endl;
+// clang-format on
+}
+
+void AllOptions::help(std::ostream& os)
+{
+    ModelOptions::help(os);
+    os << std::endl;
+    BuildOptions::help(os);
+    os << std::endl;
+    InferenceOptions::help(os);
+    os << std::endl;
+// clang-format off
+    os << "=== Build and Inference Batch Options ===" << std::endl <<
+          "                              When using implicit batch, the max batch size of the engine, if not given, " << std::endl <<
+          "                              is set to the inference batch size;" << std::endl <<
+          "                              when using explicit batch, if shapes are specified only for inference, they " << std::endl <<
+          "                              will be used also as min/opt/max in the build profile; if shapes are " << std::endl <<
+          "                              specified only for the build, the opt shapes will be used also for inference;" << std::endl <<
+          "                              if both are specified, they must be compatible; and if explicit batch is " << std::endl <<
+          "                              enabled but neither is specified, the model must provide complete static" << std::endl <<
+          "                              dimensions, including batch size, for all inputs" << std::endl <<
+          "                              Using ONNX models automatically forces explicit batch." << std::endl <<
+          std::endl;
+    // clang-format on
+    ReportingOptions::help(os);
+    os << std::endl;
+    SystemOptions::help(os);
+    os << std::endl;
+    helpHelp(os);
+}
+
+void SafeBuilderOptions::printHelp(std::ostream& os)
+{
+// clang-format off
+    os << "=== Mandatory ===" << std::endl <<
+          "  --onnx=<file>               ONNX model" << std::endl <<
+          " " << std::endl <<
+          "=== Optional ===" << std::endl <<
+          "  --inputIOFormats=spec       Type and format of each of the input tensors (default = all inputs in fp32:chw)" << std::endl <<
+          "                              See --outputIOFormats help for the grammar of type and format list."
<< std::endl <<
+          "                              Note: If this option is specified, please set comma-separated types and formats for all" << std::endl <<
+          "                                    inputs following the same order as network inputs ID (even if only one input" << std::endl <<
+          "                                    needs specifying IO format) or set the type and format once for broadcasting." << std::endl <<
+          "  --outputIOFormats=spec      Type and format of each of the output tensors (default = all outputs in fp32:chw)" << std::endl <<
+          "                              Note: If this option is specified, please set comma-separated types and formats for all" << std::endl <<
+          "                                    outputs following the same order as network outputs ID (even if only one output" << std::endl <<
+          "                                    needs specifying IO format) or set the type and format once for broadcasting." << std::endl <<
+          "                              IO Formats: spec  ::= IOfmt[\",\"spec]" << std::endl <<
+          "                                          IOfmt ::= type:fmt" << std::endl <<
+          "                                          type  ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl <<
+          "                                          fmt   ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" << std::endl <<
+          "  --int8                      Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl <<
+          "  --consistency               Enable consistency check for serialized engine, (default = disabled)" << std::endl <<
+          "  --std                       Build standard serialized engine, (default = disabled)" << std::endl <<
+          "  --calib=<file>              Read INT8 calibration cache file" << std::endl <<
+          "  --serialized=<file>         Save the serialized network" << std::endl <<
+          "  --plugins                   Plugin library (.so) to load (can be specified multiple times)" << std::endl <<
+          "  --verbose or -v             Use verbose logging (default = false)" << std::endl <<
+          "  --help or -h                Print this message" << std::endl <<
+          " " << std::endl;
+// clang-format on
+}
+
+} // namespace sample
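For illustration only, a hypothetical invocation of a tool built on these options, combining the dynamic-shape and IO-format specs documented above (the binary and file names are placeholders, not part of this patch):

    ./trt_sample --onnx=model.onnx \
                 --minShapes=input0:1x3x256x256 --optShapes=input0:8x3x256x256 --maxShapes=input0:16x3x256x256 \
                 --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16 --saveEngine=model.engine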
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h
new file mode 100644
index 00000000..8975e1ea
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_OPTIONS_H
+#define TRT_SAMPLE_OPTIONS_H
+
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "NvInfer.h"
+
+namespace sample
+{
+
+// Build default params
+constexpr int32_t maxBatchNotProvided{0};
+constexpr int32_t defaultMinTiming{1};
+constexpr int32_t defaultAvgTiming{8};
+
+// System default params
+constexpr int32_t defaultDevice{0};
+
+// Inference default params
+constexpr int32_t defaultBatch{1};
+constexpr int32_t batchNotProvided{0};
+constexpr int32_t defaultStreams{1};
+constexpr int32_t defaultIterations{10};
+constexpr float defaultWarmUp{200.F};
+constexpr float defaultDuration{3.F};
+constexpr float defaultSleep{};
+constexpr float defaultIdle{};
+
+// Reporting default params
+constexpr int32_t defaultAvgRuns{10};
+constexpr float defaultPercentile{99};
+
+enum class PrecisionConstraints
+{
+    kNONE,
+    kOBEY,
+    kPREFER
+};
+
+enum class ModelFormat
+{
+    kANY,
+    kCAFFE,
+    kONNX,
+    kUFF
+};
+
+enum class SparsityFlag
+{
+    kDISABLE,
+    kENABLE,
+    kFORCE
+};
+
+enum class TimingCacheMode
+{
+    kDISABLE,
+    kLOCAL,
+    kGLOBAL
+};
+
+using Arguments = std::unordered_multimap<std::string, std::string>;
+
+using IOFormat = std::pair<nvinfer1::DataType, nvinfer1::TensorFormats>;
+
+using ShapeRange = std::array<std::vector<int32_t>, nvinfer1::EnumMax<nvinfer1::OptProfileSelector>()>;
+
+using LayerPrecisions = std::unordered_map<std::string, nvinfer1::DataType>;
+using LayerOutputTypes = std::unordered_map<std::string, std::vector<nvinfer1::DataType>>;
+
+struct Options
+{
+    virtual void parse(Arguments& arguments) = 0;
+};
+
+struct BaseModelOptions : public Options
+{
+    ModelFormat format{ModelFormat::kANY};
+    std::string model;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct UffInput : public Options
+{
+    std::vector<std::pair<std::string, nvinfer1::Dims3>> inputs;
+    bool NHWC{false};
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct ModelOptions : public Options
+{
+    BaseModelOptions baseModel;
+    std::string prototxt;
+    std::vector<std::string> outputs;
+    UffInput uffInputs;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct BuildOptions : public Options
+{
+    int32_t maxBatch{maxBatchNotProvided};
+    double workspace{-1.0};
+    double dlaSRAM{-1.0};
+    double dlaLocalDRAM{-1.0};
+    double dlaGlobalDRAM{-1.0};
+    int32_t minTiming{defaultMinTiming};
+    int32_t avgTiming{defaultAvgTiming};
+    bool tf32{true};
+    bool fp16{false};
+    bool int8{false};
+    bool directIO{false};
+    PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE};
+    LayerPrecisions layerPrecisions;
+    LayerOutputTypes layerOutputTypes;
+    bool safe{false};
+    bool consistency{false};
+    bool restricted{false};
+    bool save{false};
+    bool load{false};
+    bool refittable{false};
+    SparsityFlag sparsity{SparsityFlag::kDISABLE};
+#if (NV_TENSORRT_MAJOR > 7)
+    nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY};
+#else
+    nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kDEFAULT};
+#endif
+    std::string engine;
+    std::string calibration;
+    std::unordered_map<std::string, ShapeRange> shapes;
+    std::unordered_map<std::string, ShapeRange> shapesCalib;
+    std::vector<IOFormat> inputFormats;
+    std::vector<IOFormat> outputFormats;
+    nvinfer1::TacticSources enabledTactics{0};
+    nvinfer1::TacticSources disabledTactics{0};
+    TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL};
+    std::string timingCacheFile{};
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct SystemOptions : public Options
+{
+    int32_t device{defaultDevice};
+    int32_t DLACore{-1};
+    bool fallback{false};
+    std::vector<std::string> plugins;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct InferenceOptions : public Options
+{
+    int32_t batch{batchNotProvided};
+    int32_t iterations{defaultIterations};
+    int32_t streams{defaultStreams};
+    float warmup{defaultWarmUp};
+    float duration{defaultDuration};
+    float sleep{defaultSleep};
+    float idle{defaultIdle};
+    bool overlap{true};
+    bool skipTransfers{false};
+    bool useManaged{false};
+    bool spin{false};
+    bool threads{false};
+    bool graph{false};
+    bool skip{false};
+    bool rerun{false};
+    bool timeDeserialize{false};
+    bool timeRefit{false};
+    std::unordered_map<std::string, std::string> inputs;
+    std::unordered_map<std::string, std::vector<int32_t>> shapes;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct ReportingOptions : public Options
+{
+    bool verbose{false};
+    int32_t avgs{defaultAvgRuns};
+    float percentile{defaultPercentile};
+    bool refit{false};
+    bool output{false};
+    bool profile{false};
+    bool layerInfo{false};
+    std::string exportTimes;
+    std::string exportOutput;
+    std::string exportProfile;
+    std::string exportLayerInfo;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct SafeBuilderOptions : public Options
+{
+    std::string serialized{};
+    std::string onnxModelFile{};
+    bool help{false};
+    bool verbose{false};
+    std::vector<IOFormat> inputFormats;
+    std::vector<IOFormat> outputFormats;
+    bool int8{false};
+    std::string calibFile{};
+    std::vector<std::string> plugins;
+    bool consistency{false};
+    bool standard{false};
+
+    void parse(Arguments& arguments) override;
+
+    static void printHelp(std::ostream& out);
+};
+
+struct AllOptions : public Options
+{
+    ModelOptions model;
+    BuildOptions build;
+    SystemOptions system;
+    InferenceOptions inference;
+    ReportingOptions reporting;
+    bool helps{false};
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+Arguments argsToArgumentsMap(int32_t argc, char* argv[]);
+
+bool parseHelp(Arguments& arguments);
+
+void helpHelp(std::ostream& out);
+
+// Functions to print options
+
+std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const UffInput& input);
+
+std::ostream& operator<<(std::ostream& os, const IOFormat& format);
+
+std::ostream& operator<<(std::ostream& os, const ShapeRange& dims);
+
+std::ostream& operator<<(std::ostream& os, const ModelOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const BuildOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const SystemOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const InferenceOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const ReportingOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const AllOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options);
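A minimal sketch of how these declarations are meant to compose (a hypothetical driver, not part of the patch; it assumes the definitions from sampleOptions.cpp above):

    #include <iostream>
    #include "sampleOptions.h"

    int main(int argc, char* argv[])
    {
        sample::Arguments args = sample::argsToArgumentsMap(argc, argv);
        sample::AllOptions options;
        if (sample::parseHelp(args))
        {
            sample::AllOptions::help(std::cout);
            return 0;
        }
        options.parse(args);  // consumes recognized flags from the multimap
        std::cout << options; // operator<< above prints the resolved options
        return 0;
    }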
"x" : "") << dims.d[i]; + } + return os; +} +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole role) +{ + switch (role) + { + case nvinfer1::WeightsRole::kKERNEL: + { + os << "Kernel"; + break; + } + case nvinfer1::WeightsRole::kBIAS: + { + os << "Bias"; + break; + } + case nvinfer1::WeightsRole::kSHIFT: + { + os << "Shift"; + break; + } + case nvinfer1::WeightsRole::kSCALE: + { + os << "Scale"; + break; + } + case nvinfer1::WeightsRole::kCONSTANT: + { + os << "Constant"; + break; + } +#if (NV_TENSORRT_MAJOR > 7) + case nvinfer1::WeightsRole::kANY: + { + os << "Any"; + break; + } +#endif + } + + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const std::vector& vec) +{ + for (int32_t i = 0, e = static_cast(vec.size()); i < e; ++i) + { + os << (i ? "x" : "") << vec[i]; + } + return os; +} + +} // namespace sample + +#endif // TRT_SAMPLES_OPTIONS_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp new file mode 100644 index 00000000..a92938c5 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp @@ -0,0 +1,445 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sampleInference.h" +#include "sampleOptions.h" +#include "sampleReporting.h" + +namespace sample +{ + +namespace +{ + +//! +//! \brief Find percentile in an ascending sequence of timings +//! \note percentile must be in [0, 100]. Otherwise, an exception is thrown. +//! +template +float findPercentile(float percentile, std::vector const& timings, T const& toFloat) +{ + int32_t const all = static_cast(timings.size()); + int32_t const exclude = static_cast((1 - percentile / 100) * all); + if (timings.empty()) + { + return std::numeric_limits::infinity(); + } + if (percentile < 0.0f || percentile > 100.0f) + { + throw std::runtime_error("percentile is not in [0, 100]!"); + } + return toFloat(timings[std::max(all - 1 - exclude, 0)]); +} + +//! +//! \brief Find median in a sorted sequence of timings +//! +template +float findMedian(std::vector const& timings, T const& toFloat) +{ + if (timings.empty()) + { + return std::numeric_limits::infinity(); + } + + int32_t const m = timings.size() / 2; + if (timings.size() % 2) + { + return toFloat(timings[m]); + } + + return (toFloat(timings[m - 1]) + toFloat(timings[m])) / 2; +} + +//! +//! \brief Find coefficient of variance (which is std / mean) in a sorted sequence of timings given the mean +//! 
+//!
+//! \brief Find coefficient of variance (which is std / mean) in a sorted sequence of timings given the mean
+//!
+template <typename T>
+float findCoeffOfVariance(std::vector<InferenceTime> const& timings, T const& toFloat, float mean)
+{
+    if (timings.empty())
+    {
+        return 0;
+    }
+
+    if (mean == 0.F)
+    {
+        return std::numeric_limits<float>::infinity();
+    }
+
+    auto const metricAccumulator = [toFloat, mean](float acc, InferenceTime const& a) {
+        float const diff = toFloat(a) - mean;
+        return acc + diff * diff;
+    };
+    float const variance = std::accumulate(timings.begin(), timings.end(), 0.F, metricAccumulator) / timings.size();
+
+    return std::sqrt(variance) / mean * 100.F;
+}
+
+inline InferenceTime traceToTiming(const InferenceTrace& a)
+{
+    return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart),
+        (a.d2hEnd - a.d2hStart), (a.d2hEnd - a.h2dStart));
+}
+
+} // namespace
+
+void printProlog(int32_t warmups, int32_t timings, float warmupMs, float benchTimeMs, std::ostream& os)
+{
+    os << "Warmup completed " << warmups << " queries over " << warmupMs << " ms" << std::endl;
+    os << "Timing trace has " << timings << " queries over " << benchTimeMs / 1000 << " s" << std::endl;
+}
+
+void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os)
+{
+    int32_t count = 0;
+    InferenceTime sum;
+
+    os << std::endl;
+    os << "=== Trace details ===" << std::endl;
+    os << "Trace averages of " << runsPerAvg << " runs:" << std::endl;
+    for (auto const& t : timings)
+    {
+        sum += t;
+
+        if (++count == runsPerAvg)
+        {
+            // clang-format off
+            os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg
+               << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " << sum.e2e / runsPerAvg
+               << " ms, enqueue " << sum.enq / runsPerAvg << " ms)" << std::endl;
+            // clang-format on
+            count = 0;
+            sum.enq = 0;
+            sum.h2d = 0;
+            sum.compute = 0;
+            sum.d2h = 0;
+            sum.e2e = 0;
+        }
+    }
+}
+
+void printMetricExplanations(std::ostream& os)
+{
+    os << std::endl;
+    os << "=== Explanations of the performance metrics ===" << std::endl;
+    os << "Total Host Walltime: the host walltime from when the first query (after warmups) is enqueued to when the "
+          "last query is completed."
+       << std::endl;
+    os << "GPU Compute Time: the GPU latency to execute the kernels for a query." << std::endl;
+    os << "Total GPU Compute Time: the summation of the GPU Compute Time of all the queries. If this is significantly "
+          "shorter than Total Host Walltime, the GPU may be under-utilized because of host-side overheads or data "
+          "transfers."
+       << std::endl;
+    os << "Throughput: the observed throughput computed by dividing the number of queries by the Total Host Walltime. "
+          "If this is significantly lower than the reciprocal of GPU Compute Time, the GPU may be under-utilized "
+          "because of host-side overheads or data transfers."
+       << std::endl;
+    os << "Enqueue Time: the host latency to enqueue a query. If this is longer than GPU Compute Time, the GPU may be "
+          "under-utilized."
+       << std::endl;
+    os << "H2D Latency: the latency for host-to-device data transfers for input tensors of a single query."
+       << std::endl;
+    os << "D2H Latency: the latency for device-to-host data transfers for output tensors of a single query."
+       << std::endl;
+    os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a "
+          "single query."
+       << std::endl;
+    os << "End-to-End Host Latency: the duration from when the H2D of a query is called to when the D2H of the same "
+          "query is completed, which includes the latency to wait for the completion of the previous query. This is "
+          "the latency of a query if multiple queries are enqueued consecutively."
+       << std::endl;
+}
+
+PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings,
+    std::function<float(InferenceTime const&)> metricGetter, float percentile)
+{
+    auto const metricComparator
+        = [metricGetter](InferenceTime const& a, InferenceTime const& b) { return metricGetter(a) < metricGetter(b); };
+    auto const metricAccumulator = [metricGetter](float acc, InferenceTime const& a) { return acc + metricGetter(a); };
+    std::vector<InferenceTime> newTimings = timings;
+    std::sort(newTimings.begin(), newTimings.end(), metricComparator);
+    PerformanceResult result;
+    result.min = metricGetter(newTimings.front());
+    result.max = metricGetter(newTimings.back());
+    result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, metricAccumulator) / newTimings.size();
+    result.median = findMedian(newTimings, metricGetter);
+    result.percentile = findPercentile(percentile, newTimings, metricGetter);
+    result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean);
+    return result;
+}
+
+void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize,
+    std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose)
+{
+    float const throughput = batchSize * timings.size() / walltimeMs * 1000;
+
+    auto const getLatency = [](InferenceTime const& t) { return t.latency(); };
+    auto const latencyResult = getPerformanceResult(timings, getLatency, percentile);
+
+    auto const getEndToEnd = [](InferenceTime const& t) { return t.e2e; };
+    auto const e2eLatencyResult = getPerformanceResult(timings, getEndToEnd, percentile);
+
+    auto const getEnqueue = [](InferenceTime const& t) { return t.enq; };
+    auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentile);
+
+    auto const getH2d = [](InferenceTime const& t) { return t.h2d; };
+    auto const h2dResult = getPerformanceResult(timings, getH2d, percentile);
+
+    auto const getCompute = [](InferenceTime const& t) { return t.compute; };
+    auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentile);
+
+    auto const getD2h = [](InferenceTime const& t) { return t.d2h; };
+    auto const d2hResult = getPerformanceResult(timings, getD2h, percentile);
+
+    auto const toPerfString = [percentile](const PerformanceResult& r) {
+        std::stringstream s;
+        s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean << " ms, "
+          << "median = " << r.median << " ms, percentile(" << percentile << "%) = " << r.percentile << " ms";
+        return s.str();
+    };
+
+    osInfo << std::endl;
+    osInfo << "=== Performance summary ===" << std::endl;
+    osInfo << "Throughput: " << throughput << " qps" << std::endl;
+    osInfo << "Latency: " << toPerfString(latencyResult) << std::endl;
+    osInfo << "End-to-End Host Latency: " << toPerfString(e2eLatencyResult) << std::endl;
+    osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl;
+    osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl;
+    osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl;
+    osInfo << "D2H Latency: " << toPerfString(d2hResult) << std::endl;
+    osInfo << "Total Host Walltime: " << walltimeMs / 1000 << " s" << std::endl;
+    osInfo << "Total GPU Compute Time: " << gpuComputeResult.mean *
+        timings.size() / 1000 << " s" << std::endl;
+
+    // Report warnings if the throughput is bound by other factors than GPU Compute Time.
+    constexpr float kENQUEUE_BOUND_REPORTING_THRESHOLD{0.8F};
+    if (enqueueResult.median > kENQUEUE_BOUND_REPORTING_THRESHOLD * gpuComputeResult.median)
+    {
+        osWarning
+            << "* Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized."
+            << std::endl;
+        osWarning << "  If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the "
+                     "throughput."
+                  << std::endl;
+    }
+    if (h2dResult.median >= gpuComputeResult.median)
+    {
+        osWarning << "* Throughput may be bound by host-to-device transfers for the inputs rather than GPU Compute and "
+                     "the GPU may be under-utilized."
+                  << std::endl;
+        osWarning << "  Add --noDataTransfers flag to disable data transfers." << std::endl;
+    }
+    if (d2hResult.median >= gpuComputeResult.median)
+    {
+        osWarning << "* Throughput may be bound by device-to-host transfers for the outputs rather than GPU Compute "
+                     "and the GPU may be under-utilized."
+                  << std::endl;
+        osWarning << "  Add --noDataTransfers flag to disable data transfers." << std::endl;
+    }
+
+    // Report warnings if the GPU Compute Time is unstable.
+    constexpr float kUNSTABLE_PERF_REPORTING_THRESHOLD{1.0F};
+    if (gpuComputeResult.coeffVar > kUNSTABLE_PERF_REPORTING_THRESHOLD)
+    {
+        osWarning << "* GPU compute time is unstable, with coefficient of variance = " << gpuComputeResult.coeffVar
+                  << "%." << std::endl;
+        osWarning << "  If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the "
+                  << "stability." << std::endl;
+    }
+
+    // Explain what the metrics mean.
+    osInfo << "Explanations of the performance metrics are printed in the verbose logs." << std::endl;
+    printMetricExplanations(osVerbose);
+
+    osInfo << std::endl;
+}
+
+void printPerformanceReport(std::vector<InferenceTrace> const& trace, const ReportingOptions& reporting, float warmupMs,
+    int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose)
+{
+    auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; };
+    auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup);
+    int32_t const warmups = noWarmup - trace.begin();
+    float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart;
+    // when implicit batch used, batchSize = options.inference.batch, which is parsed through --batch
+    // when explicit batch used, batchSize = options.inference.batch = 0
+    // treat inference with explicit batch as a single query and report the throughput
+    batchSize = batchSize ? batchSize : 1;
+    printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, warmupMs, benchTime, osInfo);
+
+    std::vector<InferenceTime> timings(trace.size() - warmups);
+    std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming);
+    printTiming(timings, reporting.avgs, osInfo);
+    printEpilog(timings, benchTime, reporting.percentile, batchSize, osInfo, osWarning, osVerbose);
+
+    if (!reporting.exportTimes.empty())
+    {
+        exportJSONTrace(trace, reporting.exportTimes);
+    }
+}
+
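For reference, a single record of the exported trace might look like this (illustrative values only; the keys match exportJSONTrace below):

    [  { "startEnqMs" : 0.01, "endEnqMs" : 0.05, "startH2dMs" : 0.05, "endH2dMs" : 0.31,
         "startComputeMs" : 0.31, "endComputeMs" : 2.02, "startD2hMs" : 2.02, "endD2hMs" : 2.20,
         "h2dMs" : 0.26, "computeMs" : 1.71, "d2hMs" : 0.18, "latencyMs" : 2.15, "endToEndMs" : 2.15 } ]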
+//! Printed format:
+//! [ value, ...]
+//! value ::= { "start enq : time, "end enq" : time, "start h2d" : time, "end h2d" : time, "start compute" : time,
+//!             "end compute" : time, "start d2h" : time, "end d2h" : time, "h2d" : time, "compute" : time,
+//!             "d2h" : time, "latency" : time, "end to end" : time }
+//!
+void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName)
+{
+    std::ofstream os(fileName, std::ofstream::trunc);
+    os << "[" << std::endl;
+    char const* sep = "  ";
+    for (auto const& t : trace)
+    {
+        InferenceTime const it(traceToTiming(t));
+        os << sep << "{ ";
+        sep = ", ";
+        // clang-format off
+        os << "\"startEnqMs\" : "     << t.enqStart     << sep << "\"endEnqMs\" : "     << t.enqEnd     << sep
+           << "\"startH2dMs\" : "     << t.h2dStart     << sep << "\"endH2dMs\" : "     << t.h2dEnd     << sep
+           << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep
+           << "\"startD2hMs\" : "     << t.d2hStart     << sep << "\"endD2hMs\" : "     << t.d2hEnd     << sep
+           << "\"h2dMs\" : "          << it.h2d         << sep << "\"computeMs\" : "    << it.compute   << sep
+           << "\"d2hMs\" : "          << it.d2h         << sep << "\"latencyMs\" : "    << it.latency() << sep
+           << "\"endToEndMs\" : "     << it.e2e << " }" << std::endl;
+        // clang-format on
+    }
+    os << "]" << std::endl;
+}
+
+void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept
+{
+    if (mIterator == mLayers.end())
+    {
+        bool const first = !mLayers.empty() && mLayers.begin()->name == layerName;
+        mUpdatesCount += mLayers.empty() || first;
+        if (first)
+        {
+            mIterator = mLayers.begin();
+        }
+        else
+        {
+            mLayers.emplace_back();
+            mLayers.back().name = layerName;
+            mIterator = mLayers.end() - 1;
+        }
+    }
+
+    mIterator->timeMs += timeMs;
+    ++mIterator;
+}
+
+void Profiler::print(std::ostream& os) const noexcept
+{
+    std::string const nameHdr("Layer");
+    std::string const timeHdr("   Time (ms)");
+    std::string const avgHdr("   Avg. Time (ms)");
+    std::string const percentageHdr("   Time %");
+
+    float const totalTimeMs = getTotalTime();
+
+    auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { return a.name.size() < b.name.size(); };
+    auto const longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer);
+    auto const nameLength = std::max(longestName->name.size() + 1, nameHdr.size());
+    auto const timeLength = timeHdr.size();
+    auto const avgLength = avgHdr.size();
+    auto const percentageLength = percentageHdr.size();
+
+    os << std::endl
+       << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl
+       << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr << std::endl;
+
+    for (auto const& p : mLayers)
+    {
+        // clang-format off
+        os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs
+           << std::setw(avgLength) << std::fixed << std::setprecision(4) << p.timeMs / mUpdatesCount
+           << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100
+           << std::endl;
+    }
+    {
+        os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2)
+           << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount
+           << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl;
+        // clang-format on
+    }
+    os << std::endl;
+}
+
+void Profiler::exportJSONProfile(std::string const& fileName) const noexcept
+{
+    std::ofstream os(fileName, std::ofstream::trunc);
+    os << "[" << std::endl << "  { \"count\" : " << mUpdatesCount << " }" << std::endl;
+
+    auto const totalTimeMs = getTotalTime();
+
+    for (auto const& l : mLayers)
+    {
+        // clang-format off
+        os << ", {" << " \"name\" : \""      << l.name << "\""
+              ", \"timeMs\" : "     << l.timeMs
+           << ", \"averageMs\" : "  << l.timeMs / mUpdatesCount
+           << ", \"percentage\" : " << l.timeMs /
+              totalTimeMs * 100
+           << " }" << std::endl;
+        // clang-format on
+    }
+    os << "]" << std::endl;
+}
+
+void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os)
+{
+    os << "Input Tensors:" << std::endl;
+    bindings.dumpInputs(context, os);
+}
+
+void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os)
+{
+    os << "Output Tensors:" << std::endl;
+    bindings.dumpOutputs(context, os);
+}
+
+void exportJSONOutput(
+    nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch)
+{
+    std::ofstream os(fileName, std::ofstream::trunc);
+    std::string sep = "  ";
+    auto const output = bindings.getOutputBindings();
+    os << "[" << std::endl;
+    for (auto const& binding : output)
+    {
+        // clang-format off
+        os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl;
+        sep = ", ";
+        os << "  " << sep << "\"dimensions\" : \"";
+        bindings.dumpBindingDimensions(binding.second, context, os);
+        os << "\"" << std::endl;
+        os << "  " << sep << "\"values\" : [ ";
+        bindings.dumpBindingValues(context, binding.second, os, sep, batch);
+        os << " ]" << std::endl << "  }" << std::endl;
+        // clang-format on
+    }
+    os << "]" << std::endl;
+}
+
+} // namespace sample
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h
new file mode 100644
index 00000000..5f730987
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_REPORTING_H
+#define TRT_SAMPLE_REPORTING_H
+
+#include <functional>
+#include <iostream>
+
+#include "NvInfer.h"
+
+#include "sampleOptions.h"
+#include "sampleUtils.h"
+
+namespace sample
+{
+
+//!
+//! \struct InferenceTime
+//! \brief Measurement times in milliseconds
+//!
+struct InferenceTime
+{
+    InferenceTime(float q, float i, float c, float o, float e)
+        : enq(q)
+        , h2d(i)
+        , compute(c)
+        , d2h(o)
+        , e2e(e)
+    {
+    }
+
+    InferenceTime() = default;
+    InferenceTime(InferenceTime const&) = default;
+    InferenceTime(InferenceTime&&) = default;
+    InferenceTime& operator=(InferenceTime const&) = default;
+    InferenceTime& operator=(InferenceTime&&) = default;
+    ~InferenceTime() = default;
+
+    float enq{0};     // Enqueue
+    float h2d{0};     // Host to Device
+    float compute{0}; // Compute
+    float d2h{0};     // Device to Host
+    float e2e{0};     // end to end
+
+    // ideal latency
+    float latency() const
+    {
+        return h2d + compute + d2h;
+    }
+};
+
+//!
+//! \struct InferenceTrace
+//! \brief Measurement points in milliseconds
+//!
+struct InferenceTrace
+{
+    InferenceTrace(int32_t s, float es, float ee, float is, float ie, float cs, float ce, float os, float oe)
+        : stream(s)
+        , enqStart(es)
+        , enqEnd(ee)
+        , h2dStart(is)
+        , h2dEnd(ie)
+        , computeStart(cs)
+        , computeEnd(ce)
+        , d2hStart(os)
+        , d2hEnd(oe)
+    {
+    }
+
+    InferenceTrace() = default;
+    InferenceTrace(InferenceTrace const&) = default;
+    InferenceTrace(InferenceTrace&&) = default;
+    InferenceTrace& operator=(InferenceTrace const&) = default;
+    InferenceTrace& operator=(InferenceTrace&&) = default;
+    ~InferenceTrace() = default;
+
+    int32_t stream{0};
+    float enqStart{0};
+    float enqEnd{0};
+    float h2dStart{0};
+    float h2dEnd{0};
+    float computeStart{0};
+    float computeEnd{0};
+    float d2hStart{0};
+    float d2hEnd{0};
+};
+
+inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b)
+{
+    return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h, a.e2e + b.e2e);
+}
+
+inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b)
+{
+    return a = a + b;
+}
+
+//!
+//! \struct PerformanceResult
+//! \brief Performance result of a performance metric
+//!
+struct PerformanceResult
+{
+    float min{0};
+    float max{0};
+    float mean{0};
+    float median{0};
+    float percentile{0};
+    float coeffVar{0}; // coefficient of variation
+};
+
+//!
+//! \brief Print benchmarking time and number of traces collected
+//!
+void printProlog(int32_t warmups, int32_t timings, float warmupMs, float walltime, std::ostream& os);
+
+//!
+//! \brief Print a timing trace
+//!
+void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os);
+
+//!
+//! \brief Print the performance summary of a trace
+//!
+void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize,
+    std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose);
+
+//!
+//! \brief Get the result of a specific performance metric from a trace
+//!
+PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings,
+    std::function<float(InferenceTime const&)> metricGetter, float percentile);
+
+//!
+//! \brief Print the explanations of the performance metrics printed in printEpilog() function.
+//!
+void printMetricExplanations(std::ostream& os);
+
+//!
+//! \brief Print and summarize a timing trace
+//!
+void printPerformanceReport(std::vector<InferenceTrace> const& trace, ReportingOptions const& reporting, float warmupMs,
+    int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose);
+
+//!
+//! \brief Export a timing trace to JSON file
+//!
+void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName);
+
+//!
+//! \brief Print input tensors to stream
+//!
+void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os);
+
+//!
+//! \brief Print output tensors to stream
+//!
+void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os);
+
+//!
+//! \brief Export output tensors to JSON file
+//!
+void exportJSONOutput(
+    nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch);
+
+//!
+//! \struct LayerProfile
+//! \brief Layer profile information
+//!
+struct LayerProfile
+{
+    std::string name;
+    float timeMs{0};
+};
+
+//!
+//! \class Profiler
+//! \brief Collect per-layer profile information, assuming times are reported in the same order
+//!
+class Profiler : public nvinfer1::IProfiler
+{
+
+public:
+    void reportLayerTime(char const* layerName, float timeMs) noexcept override;
+
+    void print(std::ostream& os) const noexcept;
+
+    //!
+    //! \brief Export a profile to JSON file
+    //!
+    void exportJSONProfile(std::string const& fileName) const noexcept;
+
+private:
+    float getTotalTime() const noexcept
+    {
+        auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs; };
+        return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime);
+    }
+
+    std::vector<LayerProfile> mLayers;
+    std::vector<LayerProfile>::iterator mIterator{mLayers.begin()};
+    int32_t mUpdatesCount{0};
+};
+
+} // namespace sample
+
+#endif // TRT_SAMPLE_REPORTING_H
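A minimal usage sketch for this Profiler (it assumes an existing engine, execution context, device buffers and CUDA stream; enqueueV2 is the pre-TensorRT-10 API these deprecated commons target, and profiled enqueues run effectively synchronously):

    sample::Profiler profiler;
    context->setProfiler(&profiler);   // IExecutionContext takes a raw IProfiler*
    for (int i = 0; i < 10; ++i)
    {
        context->enqueueV2(buffers, stream, nullptr);
    }
    cudaStreamSynchronize(stream);     // layer times are reported once kernels finish
    profiler.print(std::cout);
    profiler.exportJSONProfile("profile.json");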
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h
new file mode 100644
index 00000000..1509a7fc
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h
@@ -0,0 +1,543 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_UTILS_H
+#define TRT_SAMPLE_UTILS_H
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <random>
+#include <unordered_map>
+#include <vector>
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+
+#include "NvInfer.h"
+
+#include "common.h"
+#include "logger.h"
+#include "sampleDevice.h"
+#include "sampleOptions.h"
+
+namespace sample
+{
+
+inline int dataTypeSize(nvinfer1::DataType dataType)
+{
+    switch (dataType)
+    {
+    case nvinfer1::DataType::kINT32:
+    case nvinfer1::DataType::kFLOAT: return 4;
+    case nvinfer1::DataType::kHALF: return 2;
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kINT8: return 1;
+    }
+    return 0;
+}
+
+template <typename T>
+inline T roundUp(T m, T n)
+{
+    return ((m + n - 1) / n) * n;
+}
+
+inline int volume(const nvinfer1::Dims& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int>());
+}
+
+//! comps is the number of components in a vector. Ignored if vecDim < 0.
+inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch)
+{
+    int maxNbElems = 1;
+    for (int i = 0; i < dims.nbDims; ++i)
+    {
+        // Get effective length of axis.
+        int d = dims.d[i];
+        // Any dimension is 0, it is an empty tensor.
+        if (d == 0)
+        {
+            return 0;
+        }
+        if (i == vecDim)
+        {
+            d = samplesCommon::divUp(d, comps);
+        }
+        maxNbElems = std::max(maxNbElems, d * strides.d[i]);
+    }
+    return static_cast<int64_t>(maxNbElems) * batch * (vecDim < 0 ? 1 : comps);
+}
+
+inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch)
+{
+    if (vecDim != -1)
+    {
+        dims.d[vecDim] = roundUp(dims.d[vecDim], comps);
+    }
+    return volume(dims) * std::max(batch, 1);
+}
+
+inline nvinfer1::Dims toDims(const std::vector<int32_t>& vec)
+{
+    int limit = static_cast<int>(nvinfer1::Dims::MAX_DIMS);
+    if (static_cast<int>(vec.size()) > limit)
+    {
+        sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl;
+    }
+    // Pick first nvinfer1::Dims::MAX_DIMS elements
+    nvinfer1::Dims dims{std::min(static_cast<int>(vec.size()), limit), {}};
+    std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
+    return dims;
+}
+
+template <typename T>
+inline void fillBuffer(void* buffer, int64_t volume, T min, T max)
+{
+    T* typedBuffer = static_cast<T*>(buffer);
+    std::default_random_engine engine;
+    if (std::is_integral<T>::value)
+    {
+        std::uniform_int_distribution<int32_t> distribution(min, max);
+        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
+        std::generate(typedBuffer, typedBuffer + volume, generator);
+    }
+    else
+    {
+        std::uniform_real_distribution<float> distribution(min, max);
+        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
+        std::generate(typedBuffer, typedBuffer + volume, generator);
+    }
+}
+
+// Specialization needed for custom type __half
+template <typename H>
+inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max)
+{
+    H* typedBuffer = static_cast<H*>(buffer);
+    std::default_random_engine engine;
+    std::uniform_real_distribution<float> distribution(min, max);
+    auto generator = [&engine, &distribution]() { return static_cast<H>(distribution(engine)); };
+    std::generate(typedBuffer, typedBuffer + volume, generator);
+}
+template <>
+inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max)
+{
+    fillBufferHalf(buffer, volume, min, max);
+}
+
+template <typename T>
+inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const nvinfer1::Dims& dims,
+    const nvinfer1::Dims& strides, int32_t vectorDim, int32_t spv)
+{
+    const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int64_t>());
+    const T* typedBuffer = static_cast<const T*>(buffer);
+    std::string sep;
+    for (int64_t v = 0; v < volume; ++v)
+    {
+        int64_t curV = v;
+        int32_t dataOffset = 0;
+        for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex)
+        {
+            int32_t dimVal = curV % dims.d[dimIndex];
+            if (dimIndex == vectorDim)
+            {
+                dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv;
+            }
+            else
+            {
+                dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ?
+                    1 : spv);
+            }
+            curV /= dims.d[dimIndex];
+            ASSERT(curV >= 0);
+        }
+
+        os << sep << typedBuffer[dataOffset];
+        sep = separator;
+    }
+}
+
+inline void loadFromFile(std::string const& fileName, char* dst, size_t size)
+{
+    ASSERT(dst);
+
+    std::ifstream file(fileName, std::ios::in | std::ios::binary);
+    if (file.is_open())
+    {
+        file.read(dst, size);
+        file.close();
+    }
+    else
+    {
+        std::stringstream msg;
+        msg << "Cannot open file " << fileName << "!";
+        throw std::invalid_argument(msg.str());
+    }
+}
+
+struct Binding
+{
+    bool isInput{false};
+    std::unique_ptr<IMirroredBuffer> buffer;
+    int64_t volume{0};
+    nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT};
+
+    void fill(const std::string& fileName)
+    {
+        loadFromFile(fileName, static_cast<char*>(buffer->getHostBuffer()), buffer->getSize());
+    }
+
+    void fill()
+    {
+        switch (dataType)
+        {
+        case nvinfer1::DataType::kBOOL:
+        {
+            fillBuffer<bool>(buffer->getHostBuffer(), volume, 0, 1);
+            break;
+        }
+        case nvinfer1::DataType::kINT32:
+        {
+            fillBuffer<int32_t>(buffer->getHostBuffer(), volume, -128, 127);
+            break;
+        }
+        case nvinfer1::DataType::kINT8:
+        {
+            fillBuffer<int8_t>(buffer->getHostBuffer(), volume, -128, 127);
+            break;
+        }
+        case nvinfer1::DataType::kFLOAT:
+        {
+            fillBuffer<float>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+            break;
+        }
+        case nvinfer1::DataType::kHALF:
+        {
+            fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+            break;
+        }
+        }
+    }
+
+    void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv,
+        const std::string separator = " ") const
+    {
+        switch (dataType)
+        {
+        case nvinfer1::DataType::kBOOL:
+        {
+            dumpBuffer<bool>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        case nvinfer1::DataType::kINT32:
+        {
+            dumpBuffer<int32_t>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        case nvinfer1::DataType::kINT8:
+        {
+            dumpBuffer<int8_t>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        case nvinfer1::DataType::kFLOAT:
+        {
+            dumpBuffer<float>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        case nvinfer1::DataType::kHALF:
+        {
+            dumpBuffer<__half>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        }
+    }
+};
+
+class Bindings
+{
+public:
+    Bindings() = delete;
+    explicit Bindings(bool useManaged)
+        : mUseManaged(useManaged)
+    {
+    }
+
+    void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType,
+        const std::string& fileName = "")
+    {
+        while (mBindings.size() <= static_cast<size_t>(b))
+        {
+            mBindings.emplace_back();
+            mDevicePointers.emplace_back();
+        }
+        mNames[name] = b;
+        if (mBindings[b].buffer == nullptr)
+        {
+            if (mUseManaged)
+                mBindings[b].buffer.reset(new UnifiedMirroredBuffer);
+            else
+                mBindings[b].buffer.reset(new DiscreteMirroredBuffer);
+        }
+        mBindings[b].isInput = isInput;
+        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
+        // even for empty tensors, so allocate a dummy byte.
+        if (volume == 0)
+            mBindings[b].buffer->allocate(1);
+        else
+            mBindings[b].buffer->allocate(static_cast<size_t>(volume) * static_cast<size_t>(dataTypeSize(dataType)));
+
+        mBindings[b].volume = volume;
+        mBindings[b].dataType = dataType;
+        mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer();
+        if (isInput)
+        {
+            if (fileName.empty())
+                fill(b);
+            else
+                fill(b, fileName);
+        }
+    }
+
+    void** getDeviceBuffers()
+    {
+        return mDevicePointers.data();
+    }
+
+    void transferInputToDevice(TrtCudaStream& stream)
+    {
+        for (auto& b : mNames)
+        {
+            if (mBindings[b.second].isInput)
+                mBindings[b.second].buffer->hostToDevice(stream);
+        }
+    }
+
+    void transferOutputToHost(TrtCudaStream& stream)
+    {
+        for (auto& b : mNames)
+        {
+            if (!mBindings[b.second].isInput)
+                mBindings[b.second].buffer->deviceToHost(stream);
+        }
+    }
+
+    void fill(int binding, const std::string& fileName)
+    {
+        mBindings[binding].fill(fileName);
+    }
+
+    void fill(int binding)
+    {
+        mBindings[binding].fill();
+    }
+
+    void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const
+    {
+        const auto dims = context.getBindingDimensions(binding);
+        // Do not add a newline terminator, because the caller may be outputting a JSON string.
+        os << dims;
+    }
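A sketch of how a Bindings object is typically populated and used around one inference (assumes a deserialized engine, the pre-TensorRT-10 binding-index API used throughout this file, and TrtCudaStream from sampleDevice.h):

    sample::Bindings bindings(false /*useManaged*/);
    const nvinfer1::ICudaEngine& engine = context->getEngine();
    for (int b = 0; b < engine.getNbBindings(); ++b)
    {
        const auto dims = context->getBindingDimensions(b);
        bindings.addBinding(b, engine.getBindingName(b), engine.bindingIsInput(b),
            sample::volume(dims), engine.getBindingDataType(b));
    }
    sample::TrtCudaStream stream;
    bindings.transferInputToDevice(stream);
    context->enqueueV2(bindings.getDeviceBuffers(), stream.get(), nullptr);
    bindings.transferOutputToHost(stream);
    stream.synchronize();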
+    void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os,
+        const std::string& separator = " ", int32_t batch = 1) const
+    {
+        nvinfer1::Dims dims = context.getBindingDimensions(binding);
+        nvinfer1::Dims strides = context.getStrides(binding);
+        int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding);
+        const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding);
+
+        if (context.getEngine().hasImplicitBatchDimension())
+        {
+            auto insertN = [](nvinfer1::Dims& d, int32_t bs) {
+                const int32_t nbDims = d.nbDims;
+                ASSERT(nbDims < nvinfer1::Dims::MAX_DIMS);
+                std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]);
+                d.d[0] = bs;
+                d.nbDims = nbDims + 1;
+            };
+            int32_t batchStride = 0;
+            for (int32_t i = 0; i < strides.nbDims; ++i)
+            {
+                if (strides.d[i] * dims.d[i] > batchStride)
+                {
+                    batchStride = strides.d[i] * dims.d[i];
+                }
+            }
+            insertN(dims, batch);
+            insertN(strides, batchStride);
+            vectorDim = (vectorDim == -1) ? -1 : vectorDim + 1;
+        }
+
+        mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator);
+    }
+
+    void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
+    {
+        auto isInput = [](const Binding& b) { return b.isInput; };
+        dumpBindings(context, isInput, os);
+    }
+
+    void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
+    {
+        auto isOutput = [](const Binding& b) { return !b.isInput; };
+        dumpBindings(context, isOutput, os);
+    }
+
+    void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const
+    {
+        auto all = [](const Binding& /*b*/) { return true; };
+        dumpBindings(context, all, os);
+    }
+
+    void dumpBindings(
+        const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const
+    {
+        for (const auto& n : mNames)
+        {
+            const auto binding = n.second;
+            if (predicate(mBindings[binding]))
+            {
+                os << n.first << ": (";
+                dumpBindingDimensions(binding, context, os);
+                os << ")" << std::endl;
+
+                dumpBindingValues(context, binding, os);
+                os << std::endl;
+            }
+        }
+    }
+
+    std::unordered_map<std::string, int> getInputBindings() const
+    {
+        auto isInput = [](const Binding& b) { return b.isInput; };
+        return getBindings(isInput);
+    }
+
+    std::unordered_map<std::string, int> getOutputBindings() const
+    {
+        auto isOutput = [](const Binding& b) { return !b.isInput; };
+        return getBindings(isOutput);
+    }
+
+    std::unordered_map<std::string, int> getBindings() const
+    {
+        auto all = [](const Binding& /*b*/) { return true; };
+        return getBindings(all);
+    }
+
+    std::unordered_map<std::string, int> getBindings(bool (*predicate)(const Binding& b)) const
+    {
+        std::unordered_map<std::string, int> bindings;
+        for (const auto& n : mNames)
+        {
+            const auto binding = n.second;
+            if (predicate(mBindings[binding]))
+                bindings.insert(n);
+        }
+        return bindings;
+    }
+
+private:
+    std::unordered_map<std::string, int> mNames;
+    std::vector<Binding> mBindings;
+    std::vector<void*> mDevicePointers;
+    bool mUseManaged{false};
+};
+
+template <typename T>
+struct TrtDestroyer
+{
+    void operator()(T* t)
+    {
+        //t->destroy();
+        delete t;
+    }
+};
+
+template <typename T>
+using TrtUniquePtr = std::unique_ptr<T, TrtDestroyer<T>>;
+
+inline bool broadcastIOFormats(const std::vector<IOFormat>& formats, size_t nbBindings, bool isInput = true)
+{
+    bool broadcast = formats.size() == 1;
+    bool validFormatsCount = broadcast || (formats.size() == nbBindings);
+    if (!formats.empty() && !validFormatsCount)
+    {
+        if (isInput)
+        {
+            throw std::invalid_argument(
+                "The number of inputIOFormats must match network's inputs or be one for broadcasting.");
+        }
+        else
+        {
+            throw std::invalid_argument(
+                "The number of outputIOFormats must match network's outputs or be one for broadcasting.");
+        }
+    }
+    return broadcast;
+}
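The timing-cache helpers below are designed for a round trip like the following sketch (assumes a valid nvinfer1::IBuilderConfig* config; an empty blob simply creates a fresh cache, and the file name is a placeholder):

    auto blob = sample::loadTimingCacheFile("model.timing.cache");
    std::unique_ptr<nvinfer1::ITimingCache> cache(
        config->createTimingCache(blob.data(), blob.size()));
    config->setTimingCache(*cache, false /*ignoreMismatch*/);
    // ... build the engine with this config ...
    std::unique_ptr<nvinfer1::IHostMemory> serialized(cache->serialize());
    sample::saveTimingCacheFile("model.timing.cache", serialized.get());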
+inline std::vector<char> loadTimingCacheFile(const std::string inFileName)
+{
+    std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
+    if (!iFile)
+    {
+        sample::gLogWarning << "Could not read timing cache from: " << inFileName
+                            << ". A new timing cache will be generated and written." << std::endl;
+        return std::vector<char>();
+    }
+    iFile.seekg(0, std::ifstream::end);
+    size_t fsize = iFile.tellg();
+    iFile.seekg(0, std::ifstream::beg);
+    std::vector<char> content(fsize);
+    iFile.read(content.data(), fsize);
+    iFile.close();
+    sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl;
+    return content;
+}
+
+inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob)
+{
+    std::ofstream oFile(outFileName, std::ios::out | std::ios::binary);
+    if (!oFile)
+    {
+        sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl;
+        return;
+    }
+    oFile.write((char*) blob->data(), blob->size());
+    oFile.close();
+    sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl;
+}
+
+inline int32_t getCudaDriverVersion()
+{
+    int32_t version{-1};
+    cudaCheck(cudaDriverGetVersion(&version));
+    return version;
+}
+
+inline int32_t getCudaRuntimeVersion()
+{
+    int32_t version{-1};
+    cudaCheck(cudaRuntimeGetVersion(&version));
+    return version;
+}
+
+} // namespace sample
+
+#endif // TRT_SAMPLE_UTILS_H
diff --git a/src/Detector/tensorrt_yolo/yolo.cpp b/src/Detector/tensorrt_yolo/yolo.cpp
index a60d3dc4..4ee202b6 100644
--- a/src/Detector/tensorrt_yolo/yolo.cpp
+++ b/src/Detector/tensorrt_yolo/yolo.cpp
@@ -78,7 +78,31 @@ Yolo::Yolo(const NetworkInfo& networkInfo, const InferParams& inferParams)
     assert(m_Engine != nullptr);
     m_Context = m_Engine->createExecutionContext();
     assert(m_Context != nullptr);
+
+    auto numBindings = m_Engine->getNbIOTensors();
+    //std::cout << "** Bindings: " << numBindings << " **" << std::endl;
+    for (int32_t i = 0; i < numBindings; ++i)
+    {
+        std::string bindName = m_Engine->getIOTensorName(i);
+        m_tensorNames.emplace(bindName, i);
+        nvinfer1::Dims dim = m_Engine->getTensorShape(bindName.c_str());
+
+        std::cout << i << ": name: " << bindName;
+        std::cout << ", size: ";
+        for (int j = 0; j < dim.nbDims; ++j)
+        {
+            std::cout << dim.d[j];
+            if (j < dim.nbDims - 1)
+                std::cout << "x";
+        }
+        std::cout << std::endl;
+
+        if (m_InputBlobName == bindName)
+            m_InputBindingIndex = i;
+    }
+#if (NV_TENSORRT_MAJOR < 9)
     m_InputBindingIndex = m_Engine->getBindingIndex(m_InputBlobName.c_str());
+#endif
     assert(m_InputBindingIndex != -1);
     assert(m_BatchSize <= static_cast<uint32_t>(m_Engine->getMaxBatchSize()));
     allocateBuffers();
@@ -464,7 +488,14 @@ void Yolo::createYOLOEngine(const nvinfer1::DataType dataType, Int8EntropyCalibr
 
     // Build the engine
     std::cout << "Building the TensorRT Engine..." << std::endl;
-    m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config);
+#if (NV_TENSORRT_MAJOR < 9)
+    m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config);
+#else
+    nvinfer1::IRuntime* inferRuntime = nvinfer1::createInferRuntime(m_Logger);
+    nvinfer1::IHostMemory* serialNetwork = m_Builder->buildSerializedNetwork(*m_Network, *config);
+    m_Engine = inferRuntime->deserializeCudaEngine(serialNetwork->data(), serialNetwork->size());
+    delete inferRuntime;
+#endif
     assert(m_Engine != nullptr);
     std::cout << "Building complete!" << std::endl;
@@ -942,7 +973,15 @@ void Yolo::create_engine_yolov5(const nvinfer1::DataType dataType, Int8EntropyCa
 #endif
     // Build the engine
     std::cout << "Building the TensorRT Engine..." << std::endl;
+#if (NV_TENSORRT_MAJOR < 9)
     m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config);
+#else
+    nvinfer1::IRuntime* inferRuntime = nvinfer1::createInferRuntime(m_Logger);
+    nvinfer1::IHostMemory* serialNetwork = m_Builder->buildSerializedNetwork(*m_Network, *config);
+    m_Engine = inferRuntime->deserializeCudaEngine(serialNetwork->data(), serialNetwork->size());
+    delete inferRuntime;
+#endif
+
     assert(m_Engine != nullptr);
     std::cout << "Building complete!" << std::endl;
@@ -987,7 +1026,8 @@ void Yolo::doInference(const unsigned char* input, const uint32_t batchSize)
                                    batchSize * m_InputSize * sizeof(float),
                                    cudaMemcpyHostToDevice, m_CudaStream));
-    m_Context->enqueue(batchSize, m_DeviceBuffers.data(), m_CudaStream, nullptr);
+    //m_Context->enqueueV3(batchSize, m_DeviceBuffers.data(), m_CudaStream, nullptr);
+    m_Context->enqueueV3(m_CudaStream);
     for (auto& tensor : m_OutputTensors)
     {
         NV_CUDA_CHECK(cudaMemcpyAsync(tensor.hostBuffer, m_DeviceBuffers.at(tensor.bindingIndex),
@@ -1249,8 +1289,7 @@ void Yolo::parse_cfg_blocks_v5(const std::vector
-    m_DeviceBuffers.resize(m_Engine->getNbBindings(), nullptr);
+    m_DeviceBuffers.resize(m_Engine->getNbIOTensors(), nullptr);
     assert(m_InputBindingIndex != -1 && "Invalid input binding index");
-    NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(m_InputBindingIndex),
-                             m_BatchSize * m_InputSize * sizeof(float)));
+    NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(m_InputBindingIndex), m_BatchSize * m_InputSize * sizeof(float)));
     for (auto& tensor : m_OutputTensors)
     {
+#if (NV_TENSORRT_MAJOR < 9)
         tensor.bindingIndex = m_Engine->getBindingIndex(tensor.blobName.c_str());
+#else
+        auto it = m_tensorNames.find(tensor.blobName);
+        tensor.bindingIndex = (it != std::end(m_tensorNames)) ? it->second : -1;
+#endif
         assert((tensor.bindingIndex != -1) && "Invalid output binding index");
-        NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(tensor.bindingIndex),
-                                 m_BatchSize * tensor.volume * sizeof(float)));
-        NV_CUDA_CHECK(
-            cudaMallocHost(&tensor.hostBuffer, tensor.volume * m_BatchSize * sizeof(float)));
+        NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(tensor.bindingIndex), m_BatchSize * tensor.volume * sizeof(float)));
+        NV_CUDA_CHECK(cudaMallocHost((void**)&tensor.hostBuffer, tensor.volume * m_BatchSize * sizeof(float)));
     }
 }
diff --git a/src/Detector/tensorrt_yolo/yolo.h b/src/Detector/tensorrt_yolo/yolo.h
index be347d19..4cfdba16 100644
--- a/src/Detector/tensorrt_yolo/yolo.h
+++ b/src/Detector/tensorrt_yolo/yolo.h
@@ -158,6 +158,7 @@ class Yolo
     std::vector<void*> m_DeviceBuffers;
     int m_InputBindingIndex = -1;
     cudaStream_t m_CudaStream = nullptr;
+    std::map<std::string, int32_t> m_tensorNames;
 
     virtual std::vector<BBoxInfo> decodeTensor(const int imageIdx, const int imageH, const int imageW,
                                                const TensorInfo& tensor) = 0;
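One caveat about the enqueueV3 migration above: in TensorRT 10 device buffers are bound by tensor name rather than positional index, so each address has to be registered before the call. A sketch of the expected pattern, using the m_tensorNames map introduced in this patch (its exact placement inside doInference is an assumption, since this hunk does not show where the addresses are set):

    // Hypothetical binding step before m_Context->enqueueV3(m_CudaStream):
    for (const auto& [name, index] : m_tensorNames)
    {
        m_Context->setTensorAddress(name.c_str(), m_DeviceBuffers[index]);
    }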