From c2cad0997bb68d5963f912d23f6aa747d0d0f52e Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Wed, 2 Oct 2024 08:08:47 +0300 Subject: [PATCH] First version with cuda 12.6, trt 10 --- src/Detector/tensorrt_yolo/CMakeLists.txt | 14 +- src/Detector/tensorrt_yolo/YoloONNX.cpp | 29 +- .../tensorrt_yolo/common/BatchStream.h | 47 +- .../tensorrt_yolo/common/EntropyCalibrator.h | 18 +- .../tensorrt_yolo/common/ErrorRecorder.h | 9 +- .../tensorrt_yolo/common/argsParser.h | 162 + .../tensorrt_yolo/common/bfloat16.cpp | 60 + src/Detector/tensorrt_yolo/common/bfloat16.h | 46 + src/Detector/tensorrt_yolo/common/buffers.h | 164 +- src/Detector/tensorrt_yolo/common/common.h | 330 +- .../tensorrt_yolo/common/dumpTFWts.py | 124 + .../tensorrt_yolo/common/fileLock.cpp | 100 + src/Detector/tensorrt_yolo/common/fileLock.h | 86 + .../tensorrt_yolo/common/getOptions.cpp | 248 + .../tensorrt_yolo/common/getOptions.h | 128 + src/Detector/tensorrt_yolo/common/getopt.c | 568 +++ src/Detector/tensorrt_yolo/common/getoptWin.h | 124 + src/Detector/tensorrt_yolo/common/half.h | 9 +- src/Detector/tensorrt_yolo/common/logger.cpp | 7 +- src/Detector/tensorrt_yolo/common/logger.h | 5 +- src/Detector/tensorrt_yolo/common/logging.h | 16 +- .../tensorrt_yolo/common/parserOnnxConfig.h | 56 +- .../tensorrt_yolo/common/safeCommon.h | 321 +- .../tensorrt_yolo/common/sampleConfig.h | 50 +- .../tensorrt_yolo/common/sampleDevice.cpp | 133 + .../tensorrt_yolo/common/sampleDevice.h | 142 +- .../tensorrt_yolo/common/sampleEngines.cpp_ | 1688 +++++++ .../tensorrt_yolo/common/sampleEngines.h | 296 +- .../tensorrt_yolo/common/sampleEntrypoints.h | 101 + .../tensorrt_yolo/common/sampleInference.cpp_ | 1622 +++++++ .../tensorrt_yolo/common/sampleInference.h | 226 +- .../tensorrt_yolo/common/sampleOptions.cpp | 2081 ++++++-- .../tensorrt_yolo/common/sampleOptions.h | 236 +- .../tensorrt_yolo/common/sampleReporting.cpp | 300 +- .../tensorrt_yolo/common/sampleReporting.h | 124 +- .../tensorrt_yolo/common/sampleUtils.cpp | 587 +++ .../tensorrt_yolo/common/sampleUtils.h | 528 +- .../tensorrt_yolo/common/streamReader.h | 78 + .../tensorrt_yolo/common/timingCache.cpp | 157 + .../tensorrt_yolo/common/timingCache.h | 38 + .../common_deprecated/BatchStream.h | 388 ++ .../common_deprecated/EntropyCalibrator.h | 134 + .../common_deprecated/ErrorRecorder.h | 137 + .../tensorrt_yolo/common_deprecated/buffers.h | 478 ++ .../tensorrt_yolo/common_deprecated/common.h | 963 ++++ .../tensorrt_yolo/common_deprecated/half.h | 4302 +++++++++++++++++ .../common_deprecated/logger.cpp | 40 + .../tensorrt_yolo/common_deprecated/logger.h | 36 + .../tensorrt_yolo/common_deprecated/logging.h | 578 +++ .../common_deprecated/parserOnnxConfig.h | 153 + .../common_deprecated/safeCommon.h | 71 + .../common_deprecated/sampleConfig.h | 337 ++ .../common_deprecated/sampleDevice.h | 494 ++ .../sampleEngines.cpp | 0 .../common_deprecated/sampleEngines.h | 183 + .../sampleInference.cpp | 0 .../common_deprecated/sampleInference.h | 92 + .../common_deprecated/sampleOptions.cpp | 1778 +++++++ .../common_deprecated/sampleOptions.h | 355 ++ .../common_deprecated/sampleReporting.cpp | 445 ++ .../common_deprecated/sampleReporting.h | 222 + .../common_deprecated/sampleUtils.h | 543 +++ src/Detector/tensorrt_yolo/yolo.cpp | 63 +- src/Detector/tensorrt_yolo/yolo.h | 1 + 64 files changed, 21126 insertions(+), 1725 deletions(-) create mode 100644 src/Detector/tensorrt_yolo/common/argsParser.h create mode 100644 src/Detector/tensorrt_yolo/common/bfloat16.cpp create mode 100644 
src/Detector/tensorrt_yolo/common/bfloat16.h create mode 100644 src/Detector/tensorrt_yolo/common/dumpTFWts.py create mode 100644 src/Detector/tensorrt_yolo/common/fileLock.cpp create mode 100644 src/Detector/tensorrt_yolo/common/fileLock.h create mode 100644 src/Detector/tensorrt_yolo/common/getOptions.cpp create mode 100644 src/Detector/tensorrt_yolo/common/getOptions.h create mode 100644 src/Detector/tensorrt_yolo/common/getopt.c create mode 100644 src/Detector/tensorrt_yolo/common/getoptWin.h create mode 100644 src/Detector/tensorrt_yolo/common/sampleDevice.cpp create mode 100644 src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ create mode 100644 src/Detector/tensorrt_yolo/common/sampleEntrypoints.h create mode 100644 src/Detector/tensorrt_yolo/common/sampleInference.cpp_ create mode 100644 src/Detector/tensorrt_yolo/common/sampleUtils.cpp create mode 100644 src/Detector/tensorrt_yolo/common/streamReader.h create mode 100644 src/Detector/tensorrt_yolo/common/timingCache.cpp create mode 100644 src/Detector/tensorrt_yolo/common/timingCache.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/buffers.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/common.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/half.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logger.cpp create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logger.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logging.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h rename src/Detector/tensorrt_yolo/{common => common_deprecated}/sampleEngines.cpp (100%) create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h rename src/Detector/tensorrt_yolo/{common => common_deprecated}/sampleInference.cpp (100%) create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h diff --git a/src/Detector/tensorrt_yolo/CMakeLists.txt b/src/Detector/tensorrt_yolo/CMakeLists.txt index 30509d0e..30f916bf 100644 --- a/src/Detector/tensorrt_yolo/CMakeLists.txt +++ b/src/Detector/tensorrt_yolo/CMakeLists.txt @@ -58,13 +58,20 @@ file(GLOB TENSORRT_CUDA_FILES *.cu) cuda_add_library(${libname_rt} SHARED ${TENSORRT_CUDA_FILES} ${TENSORRT_SOURCE_FILES} - ${TENSORRT_HEADER_FILES} -) + ${TENSORRT_HEADER_FILES}) #message("TensorRT OpenCV libraries:") #message("${OpenCV_LIBS}") #message(${OpenCV_DIR}) +if (MSVC) + file(GLOB TensorRT_LIBRARIES ${TensorRT_LIBRARY}) +endif() + +message("TensorRT_LIBRARY: ${TensorRT_LIBRARY}") +message("TensorRT_LIBRARIES: ${TensorRT_LIBRARIES}") + + set(TENSORRT_LIBS ${OpenCV_LIBS} 
#${CUDA_LIBRARIES} @@ -74,8 +81,7 @@ set(TENSORRT_LIBS ${CUDA_curand_LIBRARY} ${CUDNN_LIBRARY} # ${LIB_PTHREAD} - ${TensorRT_LIBRARIES} -) + ${TensorRT_LIBRARIES}) if (CMAKE_COMPILER_IS_GNUCXX) set(TENSORRT_LIBS ${TENSORRT_LIBS} stdc++fs nvinfer_plugin nvonnxparser) diff --git a/src/Detector/tensorrt_yolo/YoloONNX.cpp b/src/Detector/tensorrt_yolo/YoloONNX.cpp index b016c4b3..0b19d5cc 100644 --- a/src/Detector/tensorrt_yolo/YoloONNX.cpp +++ b/src/Detector/tensorrt_yolo/YoloONNX.cpp @@ -22,14 +22,13 @@ bool YoloONNX::Init(const SampleYoloParams& params) auto GetBindings = [&]() { - auto numBindings = m_engine->getNbBindings(); + auto numBindings = m_engine->getNbIOTensors(); std::cout << "** Bindings: " << numBindings << " **" << std::endl; for (int32_t i = 0; i < numBindings; ++i) { - nvinfer1::Dims dim = m_engine->getBindingDimensions(i); - - std::string bindName = m_engine->getBindingName(i); + std::string bindName = m_engine->getIOTensorName(i); + nvinfer1::Dims dim = m_engine->getTensorShape(bindName.c_str()); for (const auto& outName : m_params.outputTensorNames) { if (bindName == outName) @@ -77,27 +76,17 @@ bool YoloONNX::Init(const SampleYoloParams& params) delete infer; #endif - sample::gLogInfo << "TRT Engine loaded from: " << m_params.engineFileName << std::endl; - - GetBindings(); - - if (!m_engine) + if (m_engine) { - res = false; + GetBindings(); + m_inputDims = m_engine->getTensorShape(m_engine->getIOTensorName(0)); + res = true; } else { -#if 1 - m_inputDims = m_engine->getBindingDimensions(0); -#else - m_inputDims.nbDims = 4; - m_inputDims.d[0] = m_params.explicitBatchSize; - m_inputDims.d[1] = 3; - m_inputDims.d[2] = m_params.width; - m_inputDims.d[3] = m_params.height; -#endif res = true; } + sample::gLogInfo << "TRT Engine loaded from: " << m_params.engineFileName << " with res = " << res << std::endl; } else { @@ -177,7 +166,7 @@ bool YoloONNX::ConstructNetwork(YoloONNXUniquePtr& builder, size_t dlaGlobalDRAMSize = config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM); std::cout << "workspaceSize = " << workspaceSize << ", dlaManagedSRAMSize = " << dlaManagedSRAMSize << ", dlaLocalDRAMSize = " << dlaLocalDRAMSize << ", dlaGlobalDRAMSize = " << dlaGlobalDRAMSize << std::endl; - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? m_params.videoMemory : 4096_MiB); + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? m_params.videoMemory : (1 << 20)); #endif config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); diff --git a/src/Detector/tensorrt_yolo/common/BatchStream.h b/src/Detector/tensorrt_yolo/common/BatchStream.h index a8da9923..c4ab9de0 100644 --- a/src/Detector/tensorrt_yolo/common/BatchStream.h +++ b/src/Detector/tensorrt_yolo/common/BatchStream.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -119,7 +120,7 @@ class MNISTBatchStream : public IBatchStream file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); mData.resize(numElements); std::transform( - rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); + rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.F; }); } void readLabelsFile(const std::string& labelsFilePath) @@ -152,42 +153,39 @@ class MNISTBatchStream : public IBatchStream class BatchStream : public IBatchStream { public: - BatchStream( - int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector directories) + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::string const& suffix, + std::vector const& directories) : mBatchSize(batchSize) , mMaxBatches(maxBatches) , mPrefix(prefix) , mSuffix(suffix) , mDataDir(directories) { - FILE* file = fopen(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), "rb"); - ASSERT(file != nullptr); + std::ifstream file(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), std::ios::binary); + ASSERT(file.good()); int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); mDims.nbDims = 4; // The number of dimensions. mDims.d[0] = d[0]; // Batch Size mDims.d[1] = d[1]; // Channels mDims.d[2] = d[2]; // Height mDims.d[3] = d[3]; // Width ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); - fclose(file); mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; mBatch.resize(mBatchSize * mImageSize, 0); mLabels.resize(mBatchSize, 0); mFileBatch.resize(mDims.d[0] * mImageSize, 0); mFileLabels.resize(mDims.d[0], 0); - reset(0); } - BatchStream(int batchSize, int maxBatches, std::string prefix, std::vector directories) + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::vector const& directories) : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) { } - BatchStream( - int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector directories) + BatchStream(int batchSize, int maxBatches, nvinfer1::Dims const& dims, std::string const& listFile, + std::vector const& directories) : mBatchSize(batchSize) , mMaxBatches(maxBatches) , mDims(dims) @@ -199,7 +197,6 @@ class BatchStream : public IBatchStream mLabels.resize(mBatchSize, 0); mFileBatch.resize(mDims.d[0] * mImageSize, 0); mFileLabels.resize(mDims.d[0], 0); - reset(0); } // Resets data members @@ -219,7 +216,7 @@ class BatchStream : public IBatchStream return false; } - for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) + for (int64_t csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) { ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); if (mFileBatchPos == mDims.d[0] && !update()) @@ -228,7 +225,7 @@ class BatchStream : public IBatchStream } // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. 
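            // (Worked example, for illustration only: with mBatchSize == 100 and file
            // batches of mDims.d[0] == 64, the first pass copies 64 elements, update()
            // loads the next .batch file, and the second pass copies the remaining 36,
            // so a single getBatch() call may span file boundaries.)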
- csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); std::copy_n( getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); @@ -295,22 +292,16 @@ class BatchStream : public IBatchStream if (mListFile.empty()) { std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); - FILE* file = fopen(inputFileName.c_str(), "rb"); + std::ifstream file(inputFileName.c_str(), std::ios::binary); if (!file) { return false; } - int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); - size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); - ASSERT(readInputCount == size_t(mDims.d[0] * mImageSize)); - size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file); - ASSERT(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0])); - - fclose(file); + file.read(reinterpret_cast(getFileBatch()), sizeof(float) * mDims.d[0] * mImageSize); + file.read(reinterpret_cast(getFileLabels()), sizeof(float) * mDims.d[0]); } else { @@ -368,7 +359,7 @@ class BatchStream : public IBatchStream return true; } - int mBatchSize{0}; + int64_t mBatchSize{0}; int mMaxBatches{0}; int mBatchCount{0}; int mFileCount{0}; diff --git a/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h b/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h index f31789bf..67a0130e 100644 --- a/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h +++ b/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -28,8 +29,8 @@ template class EntropyCalibratorImpl { public: - EntropyCalibratorImpl( - TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true) + EntropyCalibratorImpl(TBatchStream const& stream, int firstBatch, std::string const& networkName, + const char* inputBlobName, bool readCache = true) : mStream{stream} , mCalibrationTableName("CalibrationTable" + networkName) , mInputBlobName(inputBlobName) @@ -51,11 +52,12 @@ class EntropyCalibratorImpl return mStream.getBatchSize(); } - bool getBatch(void* bindings[], const char* names[], int /*nbBindings*/) noexcept + bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept { if (!mStream.next()) + { return false; - + } CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice)); ASSERT(!strcmp(names[0], mInputBlobName)); bindings[0] = mDeviceInput; @@ -101,8 +103,8 @@ template class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: - Int8EntropyCalibrator2( - TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true) + Int8EntropyCalibrator2(TBatchStream const& stream, int32_t firstBatch, const char* networkName, + const char* inputBlobName, bool readCache = true) : mImpl(stream, firstBatch, networkName, inputBlobName, readCache) { } diff --git a/src/Detector/tensorrt_yolo/common/ErrorRecorder.h b/src/Detector/tensorrt_yolo/common/ErrorRecorder.h index 40b35fb5..bfb857c5 100644 --- a/src/Detector/tensorrt_yolo/common/ErrorRecorder.h +++ b/src/Detector/tensorrt_yolo/common/ErrorRecorder.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,7 +17,7 @@ #ifndef ERROR_RECORDER_H #define ERROR_RECORDER_H -#include "NvInferRuntimeCommon.h" +#include "NvInferRuntime.h" #include "logger.h" #include #include @@ -44,7 +45,7 @@ class SampleErrorRecorder : public IErrorRecorder public: SampleErrorRecorder() = default; - virtual ~SampleErrorRecorder() noexcept {} + ~SampleErrorRecorder() noexcept override {} int32_t getNbErrors() const noexcept final { return mErrorStack.size(); diff --git a/src/Detector/tensorrt_yolo/common/argsParser.h b/src/Detector/tensorrt_yolo/common/argsParser.h new file mode 100644 index 00000000..1f0b9025 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/argsParser.h @@ -0,0 +1,162 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_ARGS_PARSER_H +#define TENSORRT_ARGS_PARSER_H + +#ifdef _MSC_VER +#include "getOptWin.h" +#else +#include +#endif +#include +#include +#include + +namespace samplesCommon +{ + +//! +//! \brief The SampleParams structure groups the basic parameters required by +//! all sample networks. +//! +struct SampleParams +{ + int32_t batchSize{1}; //!< Number of inputs in a batch + int32_t dlaCore{-1}; //!< Specify the DLA core to run network on. + bool int8{false}; //!< Allow runnning the network in Int8 mode. + bool fp16{false}; //!< Allow running the network in FP16 mode. + bool bf16{false}; //!< Allow running the network in BF16 mode. + std::vector dataDirs; //!< Directory paths where sample data files are stored + std::vector inputTensorNames; + std::vector outputTensorNames; + std::string timingCacheFile; //!< Path to timing cache file +}; + +//! +//! \brief The OnnxSampleParams structure groups the additional parameters required by +//! networks that use ONNX +//! +struct OnnxSampleParams : public SampleParams +{ + std::string onnxFileName; //!< Filename of ONNX file of a network +}; + +//! +//! /brief Struct to maintain command-line arguments. +//! +struct Args +{ + bool runInInt8{false}; + bool runInFp16{false}; + bool runInBf16{false}; + bool help{false}; + int32_t useDLACore{-1}; + int32_t batch{1}; + std::vector dataDirs; + std::string saveEngine; + std::string loadEngine; + bool rowOrder{true}; + std::string timingCacheFile; +}; + +//! +//! \brief Populates the Args struct with the provided command-line parameters. +//! +//! \throw invalid_argument if any of the arguments are not valid +//! +//! \return boolean If return value is true, execution can continue, otherwise program should exit +//! 
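//! A minimal usage sketch (hypothetical caller; printHelpInfo() is an assumed helper,
//! not part of this header):
//!
//!     samplesCommon::Args args;
//!     if (!samplesCommon::parseArgs(args, argc, argv))
//!     {
//!         printHelpInfo();
//!         return EXIT_FAILURE;
//!     }
//!     if (args.help)
//!     {
//!         printHelpInfo();
//!         return EXIT_SUCCESS;
//!     }
//!     // ...then forward args.dataDirs, args.runInFp16, args.useDLACore into SampleParams.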
+inline bool parseArgs(Args& args, int32_t argc, char* argv[]) +{ + while (1) + { + int32_t arg; + static struct option long_options[] + = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'}, {"int8", no_argument, 0, 'i'}, + {"fp16", no_argument, 0, 'f'}, {"bf16", no_argument, 0, 'z'}, {"columnOrder", no_argument, 0, 'c'}, + {"saveEngine", required_argument, 0, 's'}, {"loadEngine", required_argument, 0, 'o'}, + {"useDLACore", required_argument, 0, 'u'}, {"batch", required_argument, 0, 'b'}, + {"timingCacheFile", required_argument, 0, 't'}, {nullptr, 0, nullptr, 0}}; + int32_t option_index = 0; + arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index); + if (arg == -1) + { + break; + } + + switch (arg) + { + case 'h': args.help = true; return true; + case 'd': + if (optarg) + { + args.dataDirs.push_back(optarg); + } + else + { + std::cerr << "ERROR: --datadir requires option argument" << std::endl; + return false; + } + break; + case 's': + if (optarg) + { + args.saveEngine = optarg; + } + break; + case 'o': + if (optarg) + { + args.loadEngine = optarg; + } + break; + case 'i': args.runInInt8 = true; break; + case 'f': args.runInFp16 = true; break; + case 'z': args.runInBf16 = true; break; + case 'c': args.rowOrder = false; break; + case 'u': + if (optarg) + { + args.useDLACore = std::stoi(optarg); + } + break; + case 'b': + if (optarg) + { + args.batch = std::stoi(optarg); + } + break; + case 't': + if (optarg) + { + args.timingCacheFile = optarg; + } + else + { + std::cerr << "ERROR: --timingCacheFile requires option argument" << std::endl; + return false; + } + break; + default: return false; + } + } + return true; +} + +} // namespace samplesCommon + +#endif // TENSORRT_ARGS_PARSER_H diff --git a/src/Detector/tensorrt_yolo/common/bfloat16.cpp b/src/Detector/tensorrt_yolo/common/bfloat16.cpp new file mode 100644 index 00000000..8222826a --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/bfloat16.cpp @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "bfloat16.h" +#include <cstring> + +namespace sample +{ + +BFloat16::operator float() const +{ + static_assert(sizeof(uint32_t) == sizeof(float), ""); + float val{0.F}; + auto bits = static_cast<uint32_t>(mRep) << 16; + std::memcpy(&val, &bits, sizeof(uint32_t)); + return val; +} + +BFloat16::BFloat16(float x) +{ + static_assert(sizeof(uint32_t) == sizeof(float), ""); + uint32_t bits{0}; + std::memcpy(&bits, &x, sizeof(float)); + + // FP32 format: 1 sign bit, 8 bit exponent, 23 bit mantissa + // BF16 format: 1 sign bit, 8 bit exponent, 7 bit mantissa + + // Mask for exponent + constexpr uint32_t exponent = 0xFFU << 23; + + // Check if exponent is all 1s (NaN or infinite) + if ((bits & exponent) != exponent) + { + // x is finite - round to even + bits += 0x7FFFU + (bits >> 16 & 1); + } + + mRep = static_cast<uint16_t>(bits >> 16); +} + +BFloat16 operator+(BFloat16 x, BFloat16 y) +{ + return BFloat16(static_cast<float>(x) + static_cast<float>(y)); +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/bfloat16.h b/src/Detector/tensorrt_yolo/common/bfloat16.h new file mode 100644 index 00000000..0d0ab922 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/bfloat16.h @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <cstdint> + +namespace sample +{ + +//! Implements "Brain Floating Point": like an IEEE FP32, +//! but the significand is only 7 bits instead of 23 bits. +class BFloat16 +{ +public: + BFloat16() + : mRep(0) + { + } + + // Rounds to even if there is a tie. + BFloat16(float x); + + operator float() const; + +private: + //! Value stored in BFloat16 representation. + uint16_t mRep; +}; +BFloat16 operator+(BFloat16 x, BFloat16 y); + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/buffers.h b/src/Detector/tensorrt_yolo/common/buffers.h index ef673b2b..e58f2f5c 100644 --- a/src/Detector/tensorrt_yolo/common/buffers.h +++ b/src/Detector/tensorrt_yolo/common/buffers.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -238,28 +239,53 @@ class BufferManager public: static const size_t kINVALID_SIZE_VALUE = ~size_t(0); + //! + //! \brief Create a BufferManager for handling buffer interactions with engine, when the I/O tensor volumes + //! are provided + //!
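//! A sketch of the intended call pattern (assumed from the signature; the tensor
//! name "output" is a placeholder, and volume() is the three-argument
//! samplesCommon::volume(dims, start, stop) defined in common.h):
//!
//!     std::vector<int64_t> volumes;
//!     for (int32_t i = 0; i < engine->getNbIOTensors(); ++i)
//!     {
//!         auto dims = context->getTensorShape(engine->getIOTensorName(i));
//!         volumes.push_back(samplesCommon::volume(dims, 0, dims.nbDims));
//!     }
//!     samplesCommon::BufferManager buffers(engine, volumes);
//!     // ...fill input host buffers, then:
//!     buffers.copyInputToDevice();
//!     context->executeV2(buffers.getDeviceBindings().data());
//!     buffers.copyOutputToHost();
//!     auto const* out = static_cast<float const*>(buffers.getHostBuffer("output"));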
+ BufferManager( + std::shared_ptr engine, std::vector const& volumes, int32_t batchSize = 0) + : mEngine(engine) + , mBatchSize(batchSize) + { + // Create host and device buffers + for (int32_t i = 0; i < mEngine->getNbIOTensors(); i++) + { + auto const name = engine->getIOTensorName(i); + mNames[name] = i; + + nvinfer1::DataType type = mEngine->getTensorDataType(name); + + std::unique_ptr manBuf{new ManagedBuffer()}; + manBuf->deviceBuffer = DeviceBuffer(volumes[i], type); + manBuf->hostBuffer = HostBuffer(volumes[i], type); + void* deviceBuffer = manBuf->deviceBuffer.data(); + mDeviceBindings.emplace_back(deviceBuffer); + mManagedBuffers.emplace_back(std::move(manBuf)); + } + } + //! //! \brief Create a BufferManager for handling buffer interactions with engine. //! - BufferManager(std::shared_ptr engine, const int batchSize, - const nvinfer1::IExecutionContext* context = nullptr) + BufferManager(std::shared_ptr engine, int32_t const batchSize = 0, + nvinfer1::IExecutionContext const* context = nullptr) : mEngine(engine) , mBatchSize(batchSize) { - // Full Dims implies no batch size. - auto impbs = engine->hasImplicitBatchDimension(); - std::cout << "hasImplicitBatchDimension: " << impbs << ", mBatchSize = " << mBatchSize << std::endl; - assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); // Create host and device buffers - for (int i = 0; i < mEngine->getNbBindings(); i++) + for (int32_t i = 0, e = mEngine->getNbIOTensors(); i < e; i++) { - auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); + auto const name = engine->getIOTensorName(i); + mNames[name] = i; + + auto dims = context ? context->getTensorShape(name) : mEngine->getTensorShape(name); size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); - nvinfer1::DataType type = mEngine->getBindingDataType(i); - int vecDim = mEngine->getBindingVectorizedDim(i); + nvinfer1::DataType type = mEngine->getTensorDataType(name); + int32_t vecDim = mEngine->getTensorVectorizedDim(name); if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector { - int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + int32_t scalarsPerVec = mEngine->getTensorComponentsPerElement(name); dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); vol *= scalarsPerVec; } @@ -267,7 +293,8 @@ class BufferManager std::unique_ptr manBuf{new ManagedBuffer()}; manBuf->deviceBuffer = DeviceBuffer(vol, type); manBuf->hostBuffer = HostBuffer(vol, type); - mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); + void* deviceBuffer = manBuf->deviceBuffer.data(); + mDeviceBindings.emplace_back(deviceBuffer); mManagedBuffers.emplace_back(std::move(manBuf)); } } @@ -284,7 +311,7 @@ class BufferManager //! //! \brief Returns a vector of device buffers. //! - const std::vector& getDeviceBindings() const + std::vector const& getDeviceBindings() const { return mDeviceBindings; } @@ -293,7 +320,7 @@ class BufferManager //! \brief Returns the device buffer corresponding to tensorName. //! Returns nullptr if no such tensor can be found. //! - void* getDeviceBuffer(const std::string& tensorName) const + void* getDeviceBuffer(std::string const& tensorName) const { return getBuffer(false, tensorName); } @@ -302,72 +329,21 @@ class BufferManager //! \brief Returns the host buffer corresponding to tensorName. //! Returns nullptr if no such tensor can be found. //! - void* getHostBuffer(const std::string& tensorName) const + void* getHostBuffer(std::string const& tensorName) const { return getBuffer(true, tensorName); } - //! 
- //! \brief Returns the host buffer corresponding to tensorName. - //! Returns nullptr if no such tensor can be found. - //! - void* getHostBuffer(int bindingIndex) const - { - return getBuffer(true, bindingIndex); - } - //! //! \brief Returns the size of the host and device buffers that correspond to tensorName. //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. //! - size_t size(const std::string& tensorName) const + size_t size(std::string const& tensorName) const { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) + auto record = mNames.find(tensorName); + if (record == mNames.end()) return kINVALID_SIZE_VALUE; - return mManagedBuffers[index]->hostBuffer.nbBytes(); - } - - //! - //! \brief Dump host buffer with specified tensorName to ostream. - //! Prints error message to std::ostream if no such tensor can be found. - //! - void dumpBuffer(std::ostream& os, const std::string& tensorName) - { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) - { - os << "Invalid tensor name" << std::endl; - return; - } - void* buf = mManagedBuffers[index]->hostBuffer.data(); - size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); - nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); - size_t rowCount = static_cast(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); - int leadDim = mBatchSize; - int* trailDims = bufDims.d; - int nbDims = bufDims.nbDims; - - // Fix explicit Dimension networks - if (!leadDim && nbDims > 0) - { - leadDim = bufDims.d[0]; - ++trailDims; - --nbDims; - } - - os << "[" << leadDim; - for (int i = 0; i < nbDims; i++) - os << ", " << trailDims[i]; - os << "]" << std::endl; - switch (mEngine->getBindingDataType(index)) - { - case nvinfer1::DataType::kINT32: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kFLOAT: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kHALF: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break; - case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break; - } + return mManagedBuffers[record->second]->hostBuffer.nbBytes(); } //! @@ -382,7 +358,7 @@ class BufferManager assert(bufSize % sizeof(T) == 0); T* typedBuf = static_cast(buf); size_t numItems = bufSize / sizeof(T); - for (int i = 0; i < static_cast(numItems); i++) + for (int32_t i = 0; i < static_cast(numItems); i++) { // Handle rowCount == 1 case if (rowCount == 1 && i != static_cast(numItems) - 1) @@ -404,7 +380,7 @@ class BufferManager //! void copyInputToDevice() { - memcpyBuffers(true, false, false, 0); + memcpyBuffers(true, false, false); } //! @@ -412,13 +388,13 @@ class BufferManager //! void copyOutputToHost() { - memcpyBuffers(false, true, false, 0); + memcpyBuffers(false, true, false); } //! //! \brief Copy the contents of input host buffers to input device buffers asynchronously. //! - void copyInputToDeviceAsync(const cudaStream_t& stream) + void copyInputToDeviceAsync(cudaStream_t const& stream = 0) { memcpyBuffers(true, false, true, stream); } @@ -426,7 +402,7 @@ class BufferManager //! //! \brief Copy the contents of output device buffers to output host buffers asynchronously. //! 
- void copyOutputToHostAsync(const cudaStream_t& stream) + void copyOutputToHostAsync(cudaStream_t const& stream = 0) { memcpyBuffers(false, true, true, stream); } @@ -434,30 +410,31 @@ class BufferManager ~BufferManager() = default; private: - void* getBuffer(const bool isHost, const std::string& tensorName) const + void* getBuffer(bool const isHost, std::string const& tensorName) const { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) + auto record = mNames.find(tensorName); + if (record == mNames.end()) return nullptr; - return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data()); + return (isHost ? mManagedBuffers[record->second]->hostBuffer.data() + : mManagedBuffers[record->second]->deviceBuffer.data()); } - void* getBuffer(const bool isHost, int bindingIndex) const + bool tenosrIsInput(const std::string& tensorName) const { - if (bindingIndex == -1) - return nullptr; - return (isHost ? mManagedBuffers[bindingIndex]->hostBuffer.data() : mManagedBuffers[bindingIndex]->deviceBuffer.data()); + return mEngine->getTensorIOMode(tensorName.c_str()) == nvinfer1::TensorIOMode::kINPUT; } - void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream) + void memcpyBuffers(bool const copyInput, bool const deviceToHost, bool const async, cudaStream_t const& stream = 0) { - for (int i = 0; i < mEngine->getNbBindings(); i++) + for (auto const& n : mNames) { - void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data(); - const void* srcPtr = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data(); - const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes(); + void* dstPtr = deviceToHost ? mManagedBuffers[n.second]->hostBuffer.data() + : mManagedBuffers[n.second]->deviceBuffer.data(); + void const* srcPtr = deviceToHost ? mManagedBuffers[n.second]->deviceBuffer.data() + : mManagedBuffers[n.second]->hostBuffer.data(); + size_t const byteSize = mManagedBuffers[n.second]->hostBuffer.nbBytes(); const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice; - if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i))) + if ((copyInput && tenosrIsInput(n.first)) || (!copyInput && !tenosrIsInput(n.first))) { if (async) CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream)); @@ -468,9 +445,10 @@ class BufferManager } std::shared_ptr mEngine; //!< The pointer to the engine - int mBatchSize = 0; //!< The batch size for legacy networks, 0 otherwise. + int mBatchSize; //!< The batch size for legacy networks, 0 otherwise. std::vector> mManagedBuffers; //!< The vector of pointers to managed buffers - std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution + std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution + std::unordered_map mNames; //!< The map of tensor name and index pairs }; } // namespace samplesCommon diff --git a/src/Detector/tensorrt_yolo/common/common.h b/src/Detector/tensorrt_yolo/common/common.h index 2270a2cd..538c6094 100644 --- a/src/Detector/tensorrt_yolo/common/common.h +++ b/src/Detector/tensorrt_yolo/common/common.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,22 +17,13 @@ #ifndef TENSORRT_COMMON_H #define TENSORRT_COMMON_H - -// For loadLibrary -#ifdef _MSC_VER -// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#undef NOMINMAX -#else -#include -#endif - #include "NvInfer.h" +#if !TRT_WINML #include "NvInferPlugin.h" +#endif #include "logger.h" +#include "safeCommon.h" +#include "timingCache.h" #include #include #include @@ -39,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +45,15 @@ #include #include -#include "safeCommon.h" +#ifdef _MSC_VER +// For loadLibrary +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. +#define NOMINMAX +#include +#undef NOMINMAX +#else +#include +#endif #ifdef _MSC_VER #define FN_NAME __FUNCTION__ @@ -82,7 +83,7 @@ if (!(condition)) \ { \ sample::gLogError << "Assertion failure: " << #condition << std::endl; \ - abort(); \ + exit(EXIT_FAILURE); \ } \ } while (0) @@ -96,7 +97,7 @@ OBJ_GUARD(T) makeObjGuard(T_* t) { CHECK(!(std::is_base_of::value || std::is_same::value)); - auto deleter = [](T* t) { t->destroy(); }; + auto deleter = [](T* t) { delete t; }; return std::unique_ptr{static_cast(t), deleter}; } @@ -113,21 +114,6 @@ constexpr long double operator"" _KiB(long double val) return val * (1 << 10); } -// These is necessary if we want to be able to write 1_GiB instead of 1.0_GiB. -// Since the return type is signed, -1_GiB will work as expected. -constexpr long long int operator"" _GiB(unsigned long long val) -{ - return val * (1 << 30); -} -constexpr long long int operator"" _MiB(unsigned long long val) -{ - return val * (1 << 20); -} -constexpr long long int operator"" _KiB(unsigned long long val) -{ - return val * (1 << 10); -} - struct SimpleProfiler : public nvinfer1::IProfiler { struct Record @@ -136,7 +122,7 @@ struct SimpleProfiler : public nvinfer1::IProfiler int count{0}; }; - virtual void reportLayerTime(const char* layerName, float ms) noexcept + void reportLayerTime(const char* layerName, float ms) noexcept override { mProfile[layerName].count++; mProfile[layerName].time += ms; @@ -183,7 +169,7 @@ struct SimpleProfiler : public nvinfer1::IProfiler auto old_precision = out.precision(); // Output header { - out << std::setw(maxLayerNameLength) << layerNameStr << " "; + out << std::setfill(' ') << std::setw(maxLayerNameLength) << layerNameStr << " "; out << std::setw(12) << "Runtime, " << "%" << " "; @@ -214,80 +200,12 @@ struct SimpleProfiler : public nvinfer1::IProfiler std::map mProfile; }; -//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. -//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. 
-inline std::string locateFile( - const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) -{ - const int MAX_DEPTH{10}; - bool found{false}; - std::string filepath; - - for (auto& dir : directories) - { - if (!dir.empty() && dir.back() != '/') - { -#ifdef _MSC_VER - filepath = dir + "\\" + filepathSuffix; -#else - filepath = dir + "/" + filepathSuffix; -#endif - } - else - { - filepath = dir + filepathSuffix; - } - - for (int i = 0; i < MAX_DEPTH && !found; i++) - { - const std::ifstream checkFile(filepath); - found = checkFile.is_open(); - if (found) - { - break; - } - - filepath = "../" + filepath; // Try again in parent dir - } - - if (found) - { - break; - } - - filepath.clear(); - } - - // Could not find the file - if (filepath.empty()) - { - const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), - [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); - std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; - - if (reportError) - { - std::cout << "&&&& FAILED" << std::endl; - exit(EXIT_FAILURE); - } - } - - return filepath; -} - -inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW) -{ - std::ifstream infile(fileName, std::ifstream::binary); - assert(infile.is_open() && "Attempting to read from a file that is not open."); - std::string magic, h, w, max; - infile >> magic >> h >> w >> max; - infile.seekg(1, infile.cur); - infile.read(reinterpret_cast(buffer), inH * inW); -} - namespace samplesCommon { - +using nvinfer1::utils::loadTimingCacheFile; +using nvinfer1::utils::buildTimingCacheFromFile; +using nvinfer1::utils::saveTimingCacheFile; +using nvinfer1::utils::updateTimingCacheFile; // Swaps endianness of an integral type. template ::value, int>::type = 0> inline T swapEndianness(const T& value) @@ -339,7 +257,7 @@ class TypedHostMemory : public HostMemory { mData = new ElemType[size]; }; - ~TypedHostMemory() noexcept + ~TypedHostMemory() noexcept override { delete[](ElemType*) mData; } @@ -360,7 +278,7 @@ inline void* safeCudaMalloc(size_t memSize) if (deviceMem == nullptr) { std::cerr << "Out of memory" << std::endl; - exit(1); + exit(EXIT_FAILURE); } return deviceMem; } @@ -375,25 +293,20 @@ struct InferDeleter template void operator()(T* obj) const { -#if (NV_TENSORRT_MAJOR < 8) - obj->destroy(); -#else delete obj; -#endif } }; template -using SampleUniquePtr = std::unique_ptr; +using SampleUniquePtr = std::unique_ptr; -static auto StreamDeleter = [](cudaStream_t* pStream) +static auto StreamDeleter = [](cudaStream_t* pStream) { + if (pStream) { - if (pStream) - { - cudaStreamDestroy(*pStream); - delete pStream; - } - }; + static_cast(cudaStreamDestroy(*pStream)); + delete pStream; + } +}; inline std::unique_ptr makeCudaStream() { @@ -531,7 +444,7 @@ inline float getMaxValue(const float* buffer, int64_t size) // // The default parameter values choosen arbitrarily. Range values should be choosen such that // we avoid underflow or overflow. Also range value should be non zero to avoid uniform zero scale tensor. -inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0f, float outRange = 4.0f) +inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0F, float outRange = 4.0F) { // Ensure that all layer inputs have a scale. 
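    // (The loop below backfills a symmetric range of [-inRange, inRange] on each
    // layer input that does not already have one; outRange plays the analogous
    // role for layer outputs.)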
for (int i = 0; i < network->getNbLayers(); i++) @@ -579,14 +492,15 @@ inline void setDummyInt8DynamicRanges(const nvinfer1::IBuilderConfig* c, nvinfer // Set dummy per-tensor dynamic range if Int8 mode is requested. if (c->getFlag(nvinfer1::BuilderFlag::kINT8)) { - sample::gLogWarning - << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy is not guaranteed." - << std::endl; + sample::gLogWarning << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy " + "is not guaranteed." + << std::endl; setAllDynamicRanges(n); } } -inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) +inline void enableDLA( + nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) { if (useDLACore >= 0) { @@ -627,18 +541,28 @@ inline uint32_t getElementSize(nvinfer1::DataType t) noexcept { switch (t) { - case nvinfer1::DataType::kINT32: return 4; + case nvinfer1::DataType::kINT64: return 8; + case nvinfer1::DataType::kINT32: case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kBF16: case nvinfer1::DataType::kHALF: return 2; case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: return 1; + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1; + case nvinfer1::DataType::kINT4: + ASSERT(false && "Element size is not implemented for sub-byte data-types"); } return 0; } -inline int64_t volume(const nvinfer1::Dims& d) +inline int64_t volume(nvinfer1::Dims const& dims, int32_t start, int32_t stop) { - return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + ASSERT(start >= 0); + ASSERT(start <= stop); + ASSERT(stop <= dims.nbDims); + ASSERT(std::all_of(dims.d + start, dims.d + stop, [](int32_t x) { return x >= 0; })); + return std::accumulate(dims.d + start, dims.d + stop, int64_t{1}, std::multiplies{}); } template @@ -698,7 +622,7 @@ void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const << ppm.w << " " << ppm.h << "\n" << ppm.max << "\n"; - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + auto round = [](float x) -> int { return int(std::floor(x + 0.5F)); }; const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); @@ -739,7 +663,7 @@ inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vec << "\n" << ppm.w << " " << ppm.h << "\n" << ppm.max << "\n"; - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + auto round = [](float x) -> int { return int(std::floor(x + 0.5F)); }; for (auto bbox : dets) { @@ -778,7 +702,7 @@ class TimerBase virtual void stop() {} float microseconds() const noexcept { - return mMs * 1000.f; + return mMs * 1000.F; } float milliseconds() const noexcept { @@ -786,15 +710,15 @@ class TimerBase } float seconds() const noexcept { - return mMs / 1000.f; + return mMs / 1000.F; } void reset() noexcept { - mMs = 0.f; + mMs = 0.F; } protected: - float mMs{0.0f}; + float mMs{0.0F}; }; class GpuTimer : public TimerBase @@ -811,14 +735,14 @@ class GpuTimer : public TimerBase CHECK(cudaEventDestroy(mStart)); CHECK(cudaEventDestroy(mStop)); } - void start() + void start() override { CHECK(cudaEventRecord(mStart, mStream)); } - void stop() + void stop() override { CHECK(cudaEventRecord(mStop, 
mStream)); - float ms{0.0f}; + float ms{0.0F}; CHECK(cudaEventSynchronize(mStop)); CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); mMs += ms; @@ -835,11 +759,11 @@ class CpuTimer : public TimerBase public: using clock_type = Clock; - void start() + void start() override { mStart = Clock::now(); } - void stop() + void stop() override { mStop = Clock::now(); mMs += std::chrono::duration{mStop - mStart}.count(); @@ -865,13 +789,7 @@ inline std::vector splitString(std::string str, char delimiter = ', return splitVect; } -// Return m rounded up to nearest multiple of n -inline int roundUp(int m, int n) -{ - return ((m + n - 1) / n) * n; -} - -inline int getC(const nvinfer1::Dims& d) +inline int getC(nvinfer1::Dims const& d) { return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; } @@ -886,54 +804,111 @@ inline int getW(const nvinfer1::Dims& d) return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; } -inline void loadLibrary(const std::string& path) +//! Platform-agnostic wrapper around dynamic libraries. +class DynamicLibrary { -#ifdef _MSC_VER - void* handle = LoadLibrary(path.c_str()); -#else - int32_t flags{RTLD_LAZY}; +public: + explicit DynamicLibrary(std::string const& name) + : mLibName{name} + { +#if defined(_WIN32) + mHandle = LoadLibraryA(name.c_str()); +#else // defined(_WIN32) + int32_t flags{RTLD_LAZY}; #if ENABLE_ASAN - // https://github.com/google/sanitizers/issues/89 - // asan doesn't handle module unloading correctly and there are no plans on doing - // so. In order to get proper stack traces, don't delete the shared library on - // close so that asan can resolve the symbols correctly. - flags |= RTLD_NODELETE; + // https://github.com/google/sanitizers/issues/89 + // asan doesn't handle module unloading correctly and there are no plans on doing + // so. In order to get proper stack traces, don't delete the shared library on + // close so that asan can resolve the symbols correctly. + flags |= RTLD_NODELETE; #endif // ENABLE_ASAN - void* handle = dlopen(path.c_str(), flags); + mHandle = dlopen(name.c_str(), flags); +#endif // defined(_WIN32) + + if (mHandle == nullptr) + { + std::string errorStr{}; +#if !defined(_WIN32) + errorStr = std::string{" due to "} + std::string{dlerror()}; #endif - if (handle == nullptr) + throw std::runtime_error("Unable to open library: " + name + errorStr); + } + } + + DynamicLibrary(DynamicLibrary const&) = delete; + DynamicLibrary(DynamicLibrary const&&) = delete; + + //! + //! Retrieve a function symbol from the loaded library. + //! + //! \return the loaded symbol on success + //! \throw std::invalid_argument if loading the symbol failed. + //! 
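//! Hypothetical example (library and symbol names are placeholders, not from this patch):
//!
//!     auto lib = samplesCommon::loadLibrary("libmy_trt_plugins.so");
//!     auto initPlugins = lib->symbolAddress<bool(void*, char const*)>("initLibNvInferPlugins");
//!     initPlugins(&sample::gLogger.getTRTLogger(), "");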
+ template + std::function symbolAddress(char const* name) { -#ifdef _MSC_VER - sample::gLogError << "Could not load plugin library: " << path << std::endl; + if (mHandle == nullptr) + { + throw std::runtime_error("Handle to library is nullptr."); + } + void* ret; +#if defined(_MSC_VER) + ret = static_cast(GetProcAddress(static_cast(mHandle), name)); #else - sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; + ret = dlsym(mHandle, name); #endif + if (ret == nullptr) + { + std::string const kERROR_MSG(mLibName + ": error loading symbol: " + std::string(name)); + throw std::invalid_argument(kERROR_MSG); + } + return reinterpret_cast(ret); } -} -inline int32_t getSMVersion() -{ - int32_t deviceIndex = 0; - CHECK(cudaGetDevice(&deviceIndex)); + ~DynamicLibrary() + { + try + { +#if defined(_WIN32) + ASSERT(static_cast(FreeLibrary(static_cast(mHandle)))); +#else + ASSERT(dlclose(mHandle) == 0); +#endif + } + catch (...) + { + sample::gLogError << "Unable to close library: " << mLibName << std::endl; + } + } - int32_t major, minor; - CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); - CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); +private: + std::string mLibName{}; //!< Name of the DynamicLibrary + void* mHandle{}; //!< Handle to the DynamicLibrary +}; - return ((major << 8) | minor); +inline std::unique_ptr loadLibrary(std::string const& path) +{ + // make_unique not available until C++14 - we still need to support C++11 builds. + return std::unique_ptr(new DynamicLibrary{path}); } -inline bool isSMSafe() +inline int32_t getMaxPersistentCacheSize() { - const int32_t smVersion = getSMVersion(); - return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || - smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; + int32_t deviceIndex{}; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t maxPersistentL2CacheSize{}; +#if CUDART_VERSION >= 11030 && !TRT_WINML + CHECK(cudaDeviceGetAttribute(&maxPersistentL2CacheSize, cudaDevAttrMaxPersistingL2CacheSize, deviceIndex)); +#endif + + return maxPersistentL2CacheSize; } inline bool isDataTypeSupported(nvinfer1::DataType dataType) { - auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); + auto builder = SampleUniquePtr(createBuilder()); if (!builder) { return false; @@ -947,7 +922,6 @@ inline bool isDataTypeSupported(nvinfer1::DataType dataType) return true; } - } // namespace samplesCommon inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) diff --git a/src/Detector/tensorrt_yolo/common/dumpTFWts.py b/src/Detector/tensorrt_yolo/common/dumpTFWts.py new file mode 100644 index 00000000..70770fbd --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/dumpTFWts.py @@ -0,0 +1,124 @@ +#!/usr/bin/python +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Script to dump TensorFlow weights in TRT v1 and v2 dump format. +# The V1 format is for TensorRT 4.0. The V2 format is for TensorRT 4.0 and later. + +import sys +import struct +import argparse + +try: + import tensorflow as tf + from tensorflow.python import pywrap_tensorflow +except ImportError as err: + sys.stderr.write("""Error: Failed to import module ({})""".format(err)) + sys.exit() + +parser = argparse.ArgumentParser(description="TensorFlow Weight Dumper") + +parser.add_argument( + "-m", + "--model", + required=True, + help="The checkpoint file basename, example basename(model.ckpt-766908.data-00000-of-00001) -> model.ckpt-766908", +) +parser.add_argument("-o", "--output", required=True, help="The weight file to dump all the weights to.") +parser.add_argument("-1", "--wtsv1", required=False, default=False, type=bool, help="Dump the weights in the wts v1.") + +opt = parser.parse_args() + +if opt.wtsv1: + print("Outputting the trained weights in TensorRT's wts v1 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [buffer size] ") +else: + print("Outputting the trained weights in TensorRT's wts v2 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [(buffer shape{e.g. (1, 2, 3)}] ") + +inputbase = opt.model +outputbase = opt.output + + +def float_to_hex(f): + return hex(struct.unpack(" +#include +#include + +namespace nvinfer1 +{ +namespace utils +{ +FileLock::FileLock(ILogger& logger, std::string const& fileName) + : mLogger(logger) + , mFileName(fileName) +{ + std::string lockFileName = mFileName + ".lock"; +#ifdef _MSC_VER + { + std::stringstream ss; + ss << "Trying to set exclusive file lock " << lockFileName << std::endl; + mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str()); + } + // MS docs said this is a blocking IO if "FILE_FLAG_OVERLAPPED" is not provided + mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL); + if (mHandle == INVALID_HANDLE_VALUE) + { + throw std::runtime_error("Failed to lock " + lockFileName + "!"); + } +#elif defined(__QNX__) + // We once enabled the file lock on QNX, lockf(F_TLOCK) return -1 and the reported error is + // The error generated was 89, which means that the function is not implemented. 
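    // (On POSIX, the branch below opens "<fileName>.lock" and takes a blocking
    // lockf(F_LOCK) on its descriptor; the destructor releases it via F_ULOCK.)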
diff --git a/src/Detector/tensorrt_yolo/common/fileLock.cpp b/src/Detector/tensorrt_yolo/common/fileLock.cpp
new file mode 100644
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/fileLock.cpp
@@ -0,0 +1,100 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fileLock.h"
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+namespace nvinfer1
+{
+namespace utils
+{
+FileLock::FileLock(ILogger& logger, std::string const& fileName)
+    : mLogger(logger)
+    , mFileName(fileName)
+{
+    std::string lockFileName = mFileName + ".lock";
+#ifdef _MSC_VER
+    {
+        std::stringstream ss;
+        ss << "Trying to set exclusive file lock " << lockFileName << std::endl;
+        mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
+    }
+    // MS docs say this is blocking IO if "FILE_FLAG_OVERLAPPED" is not provided
+    mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL);
+    if (mHandle == INVALID_HANDLE_VALUE)
+    {
+        throw std::runtime_error("Failed to lock " + lockFileName + "!");
+    }
+#elif defined(__QNX__)
+    // We once enabled the file lock on QNX, but lockf(F_TLOCK) returned -1 and the reported
+    // error was 89, which means the function is not implemented.
+#else
+    mHandle = fopen(lockFileName.c_str(), "wb+");
+    if (mHandle == nullptr)
+    {
+        throw std::runtime_error("Cannot open " + lockFileName + "!");
+    }
+    {
+        std::stringstream ss;
+        ss << "Trying to set exclusive file lock " << lockFileName << std::endl;
+        mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
+    }
+    mDescriptor = fileno(mHandle);
+    auto ret = lockf(mDescriptor, F_LOCK, 0);
+    if (ret != 0)
+    {
+        mDescriptor = -1;
+        fclose(mHandle);
+        throw std::runtime_error("Failed to lock " + lockFileName + "!");
+    }
+#endif
+}
+
+FileLock::~FileLock()
+{
+    std::string lockFileName = mFileName + ".lock";
+#ifdef _MSC_VER
+    if (mHandle != INVALID_HANDLE_VALUE)
+    {
+        CloseHandle(mHandle);
+    }
+#elif defined(__QNX__)
+    // We once enabled the file lock on QNX, but lockf(F_TLOCK) returned -1 and the reported
+    // error was 89, which means the function is not implemented.
+#else
+    if (mDescriptor != -1)
+    {
+        auto ret = lockf(mDescriptor, F_ULOCK, 0);
+        if (mHandle != nullptr)
+        {
+            fclose(mHandle);
+        }
+        if (ret != 0)
+        {
+            std::stringstream ss;
+            ss << "Failed to unlock " << lockFileName << ", please remove " << lockFileName << " manually!"
+               << std::endl;
+            mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
+        }
+    }
+#endif
+}
+} // namespace utils
+} // namespace nvinfer1
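fileLock.cpp above holds the per-platform implementation; the header follows. A minimal RAII sketch of the intended use (the cache filename is hypothetical):

    // Serialize cross-process updates to a shared file, e.g. a timing cache.
    {
        nvinfer1::utils::FileLock lock(sample::gLogger.getTRTLogger(), "model.timing.cache");
        // ... read, update, and rewrite model.timing.cache here ...
    } // the destructor releases the lock, even if the update throws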
diff --git a/src/Detector/tensorrt_yolo/common/fileLock.h b/src/Detector/tensorrt_yolo/common/fileLock.h
new file mode 100644
index 00000000..d0f64a5b
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/fileLock.h
@@ -0,0 +1,86 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_SAMPLES_COMMON_FILELOCK_H_
+#define TENSORRT_SAMPLES_COMMON_FILELOCK_H_
+#include "NvInfer.h"
+#ifdef _MSC_VER
+// Needed so that the max/min definitions in windows.h do not conflict with std::max/min.
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#else
+#include <stdio.h>  // fileno
+#include <unistd.h> // lockf
+#endif
+#include <string>
+
+namespace nvinfer1
+{
+namespace utils
+{
+//!
+//! \brief RAII object that locks the specified file.
+//!
+//! The FileLock class uses a lock file to signal that the
+//! underlying file is being used by a TensorRT tool or sample,
+//! so that artifacts like the TimingCache can be updated across
+//! processes without conflicts.
+//!
+class FileLock
+{
+public:
+    FileLock(nvinfer1::ILogger& logger, std::string const& fileName);
+    ~FileLock();
+    FileLock() = delete;                           // no default ctor
+    FileLock(FileLock const&) = delete;            // no copy ctor
+    FileLock& operator=(FileLock const&) = delete; // no copy assignment
+    FileLock(FileLock&&) = delete;                 // no move ctor
+    FileLock& operator=(FileLock&&) = delete;      // no move assignment
+
+private:
+    //!
+    //! The logger that emits any error messages that might show up.
+    //!
+    nvinfer1::ILogger& mLogger;
+
+    //!
+    //! The name of the file that the FileLock protects from concurrent
+    //! writes by multiple TensorRT processes.
+    //!
+    std::string const mFileName;
+
+#ifdef _MSC_VER
+    //!
+    //! The file handle on Windows for the file lock.
+    //!
+    HANDLE mHandle{};
+#else
+    //!
+    //! The file handle on Linux for the file lock.
+    //!
+    FILE* mHandle{};
+    //!
+    //! The file descriptor on Linux of the file lock.
+    //!
+    int32_t mDescriptor{-1};
+#endif
+}; // class FileLock
+} // namespace utils
+} // namespace nvinfer1
+
+#endif // TENSORRT_SAMPLES_COMMON_FILELOCK_H_
diff --git a/src/Detector/tensorrt_yolo/common/getOptions.cpp b/src/Detector/tensorrt_yolo/common/getOptions.cpp
new file mode 100644
index 00000000..19cd3281
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/getOptions.cpp
@@ -0,0 +1,248 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "getOptions.h"
+#include "logger.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <set>
+#include <string>
+
+namespace nvinfer1
+{
+namespace utility
+{
+
+//! Matching for TRTOptions is defined as follows:
+//!
+//! If A and B both have longName set, A matches B if and only if A.longName ==
+//! B.longName and (A.shortName == B.shortName if both have short name set).
+//!
+//! If A only has shortName set and B only has longName set, then A does not
+//! match B. It is assumed that when 2 TRTOptions are compared, one of them is
+//! the definition of a TRTOption in the input to getOptions. As such, if the
+//! definition only has shortName set, it will never be equal to a TRTOption
+//! that does not have shortName set (and same for longName).
+//!
+//! If A and B both have shortName set but B does not have longName set, A
+//! matches B if and only if A.shortName == B.shortName.
+//!
+//! If A has neither long nor short name set, A matches B if and only if B has
+//! neither long nor short name set.
+bool matches(const TRTOption& a, const TRTOption& b)
+{
+    if (!a.longName.empty() && !b.longName.empty())
+    {
+        if (a.shortName && b.shortName)
+        {
+            return (a.longName == b.longName) && (a.shortName == b.shortName);
+        }
+        return a.longName == b.longName;
+    }
+
+    // If only one of them is not set, this will return false anyway.
+    return a.shortName == b.shortName;
+}
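The matching rules in the comment above are easier to see with concrete values (illustrative only):

    using nvinfer1::utility::TRTOption;
    TRTOption def{'v', "verbose", false, "enable verbose output"};
    TRTOption byLong{0, "verbose", false, ""};  // matches(byLong, def): true, long names agree
    TRTOption byShort{'v', "", false, ""};      // matches(byShort, def): true, short names agree
    TRTOption other{'x', "other", false, ""};   // matches(other, def): false, neither name agrees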
+//! getTRTOptionIndex returns the index of a TRTOption in a vector of
+//! TRTOptions, -1 if not found.
+int getTRTOptionIndex(const std::vector<TRTOption>& options, const TRTOption& opt)
+{
+    for (size_t i = 0; i < options.size(); ++i)
+    {
+        if (matches(opt, options[i]))
+        {
+            return i;
+        }
+    }
+    return -1;
+}
+
+//! validateTRTOption will return a string containing an error message if the option
+//! contains non-alphanumeric characters, or if there are duplicate option names found.
+//! Otherwise, returns the empty string.
+std::string validateTRTOption(
+    const std::set<char>& seenShortNames, const std::set<std::string>& seenLongNames, const TRTOption& opt)
+{
+    if (opt.shortName != 0)
+    {
+        if (!std::isalnum(opt.shortName))
+        {
+            return "Short name '" + std::to_string(opt.shortName) + "' is non-alphanumeric";
+        }
+
+        if (seenShortNames.find(opt.shortName) != seenShortNames.end())
+        {
+            return "Short name '" + std::to_string(opt.shortName) + "' is a duplicate";
+        }
+    }
+
+    if (!opt.longName.empty())
+    {
+        for (const char& c : opt.longName)
+        {
+            if (!std::isalnum(c) && c != '-' && c != '_')
+            {
+                return "Long name '" + opt.longName + "' contains characters that are not '-', '_', or alphanumeric";
+            }
+        }
+
+        if (seenLongNames.find(opt.longName) != seenLongNames.end())
+        {
+            return "Long name '" + opt.longName + "' is a duplicate";
+        }
+    }
+    return "";
+}
+
+//! validateTRTOptions will return a string containing an error message if any
+//! options contain non-alphanumeric characters, or if there are duplicate option
+//! names found. Otherwise, returns the empty string.
+std::string validateTRTOptions(const std::vector<TRTOption>& options)
+{
+    std::set<char> seenShortNames;
+    std::set<std::string> seenLongNames;
+    for (size_t i = 0; i < options.size(); ++i)
+    {
+        const std::string errMsg = validateTRTOption(seenShortNames, seenLongNames, options[i]);
+        if (!errMsg.empty())
+        {
+            return "Error '" + errMsg + "' at TRTOption " + std::to_string(i);
+        }
+
+        seenShortNames.insert(options[i].shortName);
+        seenLongNames.insert(options[i].longName);
+    }
+    return "";
+}
+
+//! parseArgs parses an argument list and returns a TRTParsedArgs with the
+//! fields set accordingly. Assumes that options is validated.
+//! ErrMsg will be set if:
+//!     - an argument is null
+//!     - an argument is empty
+//!     - an argument is only hyphens with no option name (i.e. "-" or "--")
+//!     - a short argument has more than 1 character
+//!     - the last argument in the list requires a value
+TRTParsedArgs parseArgs(int argc, const char* const* argv, const std::vector<TRTOption>& options)
+{
+    TRTParsedArgs parsedArgs;
+    parsedArgs.values.resize(options.size());
+
+    for (int i = 1; i < argc; ++i) // index of current command-line argument
+    {
+        if (argv[i] == nullptr)
+        {
+            return TRTParsedArgs{"Null argument at index " + std::to_string(i)};
+        }
+
+        const std::string argStr(argv[i]);
+        if (argStr.empty())
+        {
+            return TRTParsedArgs{"Empty argument at index " + std::to_string(i)};
+        }
+
+        // No starting hyphen means it is a positional argument
+        if (argStr[0] != '-')
+        {
+            parsedArgs.positionalArgs.push_back(argStr);
+            continue;
+        }
+
+        if (argStr == "-" || argStr == "--")
+        {
+            return TRTParsedArgs{"Argument does not specify an option at index " + std::to_string(i)};
+        }
+
+        // If only 1 hyphen, char after is the flag.
+        TRTOption opt{' ', "", false, ""};
+        std::string value;
+        if (argStr[1] != '-')
+        {
+            // Must only have 1 char after the hyphen
+            if (argStr.size() > 2)
+            {
+                return TRTParsedArgs{"Short arg contains more than 1 character at index " + std::to_string(i)};
+            }
+            opt.shortName = argStr[1];
+        }
+        else
+        {
+            opt.longName = argStr.substr(2);
+
+            // We need to support --foo=bar syntax, so look for '='
+            const size_t eqIndex = opt.longName.find('=');
+            if (eqIndex < opt.longName.size())
+            {
+                value = opt.longName.substr(eqIndex + 1);
+                opt.longName = opt.longName.substr(0, eqIndex);
+            }
+        }
+
+        const int idx = getTRTOptionIndex(options, opt);
+        if (idx < 0)
+        {
+            continue;
+        }
+
+        if (options[idx].valueRequired)
+        {
+            if (!value.empty())
+            {
+                parsedArgs.values[idx].second.push_back(value);
+                parsedArgs.values[idx].first = parsedArgs.values[idx].second.size();
+                continue;
+            }
+
+            if (i + 1 >= argc)
+            {
+                return TRTParsedArgs{"Last argument requires value, but none given"};
+            }
+
+            const std::string nextArg(argv[i + 1]);
+            if (nextArg.size() >= 1 && nextArg[0] == '-')
+            {
+                sample::gLogWarning << "Warning: Using '" << nextArg << "' as a value for '" << argStr
+                                    << "'. Should this be its own flag?" << std::endl;
+            }
+
+            parsedArgs.values[idx].second.push_back(nextArg);
+            i += 1; // Next argument already consumed
+
+            parsedArgs.values[idx].first = parsedArgs.values[idx].second.size();
+        }
+        else
+        {
+            parsedArgs.values[idx].first += 1;
+        }
+    }
+    return parsedArgs;
+}
+
+TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector<TRTOption>& options)
+{
+    const std::string errMsg = validateTRTOptions(options);
+    if (!errMsg.empty())
+    {
+        return TRTParsedArgs{errMsg};
+    }
+    return parseArgs(argc, argv, options);
+}
+} // namespace utility
+} // namespace nvinfer1
diff --git a/src/Detector/tensorrt_yolo/common/getOptions.h b/src/Detector/tensorrt_yolo/common/getOptions.h
new file mode 100644
index 00000000..4bbf9e27
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/getOptions.h
@@ -0,0 +1,128 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_GET_OPTIONS_H
+#define TRT_GET_OPTIONS_H
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace nvinfer1
+{
+namespace utility
+{
+
+//! TRTOption defines a command line option. At least 1 of shortName and longName
+//! must be defined.
+//! If bool initialization is undefined behavior on your system, valueRequired
+//! must also be explicitly defined.
+//! helpText is optional.
+struct TRTOption
+{
+    char shortName;       //!< Option name in short (single hyphen) form (i.e. -a, -b)
+    std::string longName; //!< Option name in long (double hyphen) form (i.e. --foo, --bar)
+    bool valueRequired;   //!< True if a value is needed for an option (i.e. -N 4, --foo bar)
+    std::string helpText; //!< Text to show when printing out the command usage
+};
+
+//! TRTParsedArgs is returned by getOptions after it has parsed a command line
+//! argument list (argv).
+//!
+//! errMsg is a string containing an error message if any errors occurred. If it
+//! is empty, no errors occurred.
+//!
+//! values stores a vector of pairs for each option (ordered by order in the
+//! input). Each pair contains an int (the number of occurrences) and a vector
+//! of strings (a list of values). The user should know which of these to use,
+//! and which options required values. For non-value options, only occurrences is
+//! populated. For value-required options, occurrences == # of values. Values do
+//! not need to be unique.
+//!
+//! positionalArgs stores additional arguments that are passed in without an
+//! option (these must not start with a hyphen).
+struct TRTParsedArgs
+{
+    std::string errMsg;
+    std::vector<std::pair<int, std::vector<std::string>>> values;
+    std::vector<std::string> positionalArgs;
+};
+
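Putting TRTOption and TRTParsedArgs together, a caller's main() would look roughly like this (the option set and index are illustrative):

    // Sketch: parse -h/--help and -m/--model <path>.
    std::vector<nvinfer1::utility::TRTOption> options{
        {'h', "help", false, "show usage"},
        {'m', "model", true, "path to the ONNX model"}};
    nvinfer1::utility::TRTParsedArgs args = nvinfer1::utility::getOptions(argc, argv, options);
    if (!args.errMsg.empty())
    {
        sample::gLogError << args.errMsg << std::endl;
        return EXIT_FAILURE;
    }
    if (args.values[1].first > 0) // index 1 == "--model"; .first is the occurrence count
    {
        std::string const& modelPath = args.values[1].second.back();
        // ... use modelPath ...
    }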
+//! Parse the input arguments passed to main() and extract options as well as
+//! positional arguments.
+//!
+//! Options are supposed to be passed to main() with a preceding hyphen '-'.
+//!
+//! If there is a single preceding hyphen, there should be exactly 1 character
+//! after the hyphen, which is interpreted as the option.
+//!
+//! If there are 2 preceding hyphens, the entire argument (without the hyphens)
+//! is interpreted as the option.
+//!
+//! If the option requires a value, the next argument is used as the value.
+//!
+//! Positional arguments must not start with a hyphen.
+//!
+//! If an argument requires a value, the next argument is interpreted as the
+//! value, even if it is the form of a valid option (i.e. --foo --bar will store
+//! "--bar" as a value for option "foo" if "foo" requires a value).
+//! We also support --name=value syntax. In this case, 'value' would be used as
+//! the value, NOT the next argument.
+//!
+//! For options:
+//! { { 'a', "", false },
+//!   { 'b', "", false },
+//!   { 0, "cee", false },
+//!   { 'd', "", true },
+//!   { 'e', "", true },
+//!   { 'f', "foo", true } }
+//!
+//! ./main hello world -a -a --cee -d 12 -f 34
+//! and
+//! ./main hello world -a -a --cee -d 12 --foo 34
+//!
+//! will result in:
+//!
+//! TRTParsedArgs {
+//!      errMsg: "",
+//!      values: { { 2, {} },
+//!                { 0, {} },
+//!                { 1, {} },
+//!                { 1, {"12"} },
+//!                { 0, {} },
+//!                { 1, {"34"} } }
+//!      positionalArgs: {"hello", "world"},
+//! }
+//!
+//! Non-POSIX behavior:
+//!     - Does not support "-abcde" as a shorthand for "-a -b -c -d -e". Each
+//!       option must have its own hyphen prefix.
+//!     - Does not support -e12 as a shorthand for "-e 12". Values MUST be
+//!       whitespace-separated from the option it is for.
+//!
+//! @param[in] argc The number of arguments passed to main (including the
+//!                 file name, which is disregarded)
+//! @param[in] argv The arguments passed to main (including the file name,
+//!                 which is disregarded)
+//! @param[in] options List of TRTOptions to parse
+//! @return TRTParsedArgs. See TRTParsedArgs documentation for descriptions of
+//!         the fields.
+TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector<TRTOption>& options);
+} // namespace utility
+} // namespace nvinfer1
+
+#endif // TRT_GET_OPTIONS_H
diff --git a/src/Detector/tensorrt_yolo/common/getopt.c b/src/Detector/tensorrt_yolo/common/getopt.c
new file mode 100644
index 00000000..c1da08b5
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/getopt.c
@@ -0,0 +1,568 @@
+/* $OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $ */
+/* $NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $ */
+
+/*
+ * Copyright (c) 2002 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F39502-99-1-0512.
+ */
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include "getoptWin.h" +#include +#include +#include +#include +#include +#include + +#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ + +#ifdef REPLACE_GETOPT +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ +#undef optreset /* see getopt.h */ +#define optreset __mingw_optreset +int optreset; /* reset getopt */ +char* optarg; /* argument associated with option */ +#endif + +#define PRINT_ERROR ((opterr) && (*options != ':')) + +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ + +/* return values */ +#define BADCH (int) '?' +#define BADARG ((*options == ':') ? (int) ':' : (int) '?') +#define INORDER (int) 1 + +#ifndef __CYGWIN__ +#define __progname __argv[0] +#else +extern char __declspec(dllimport) * __progname; +#endif + +#ifdef __CYGWIN__ +static char EMSG[] = ""; +#else +#define EMSG "" +#endif + +static int getopt_internal(int, char* const*, char const*, const struct option*, int*, int); +static int parse_long_options(char* const*, char const*, const struct option*, int*, int); +static int gcd(int, int); +static void permute_args(int, int, int, char* const*); + +static char* place = EMSG; /* option letter processing */ + +/* XXX: set optreset to 1 rather than these two */ +static int nonopt_start = -1; /* first non option argument (for permute) */ +static int nonopt_end = -1; /* first option after non options (for permute) */ + +/* Error messages */ +static char const recargchar[] = "option requires an argument -- %c"; +static char const recargstring[] = "option requires an argument -- %s"; +static char const ambig[] = "ambiguous option -- %.*s"; +static char const noarg[] = "option doesn't take an argument -- %.*s"; +static char const illoptchar[] = "unknown option -- %c"; +static char const illoptstring[] = "unknown option -- %s"; + +static void _vwarnx(char const* fmt, va_list ap) +{ + (void) fprintf(stderr, "%s: ", __progname); + if (fmt != NULL) + (void) vfprintf(stderr, fmt, ap); + (void) fprintf(stderr, "\n"); +} + +static void warnx(char const* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + _vwarnx(fmt, ap); + va_end(ap); +} + +/* + * Compute the greatest common divisor of a and b. + */ +static int gcd(int a, int b) +{ + int c; + + c = a % b; + while (c != 0) + { + a = b; + b = c; + c = a % b; + } + + return (b); +} + +/* + * Exchange the block from nonopt_start to nonopt_end with the block + * from nonopt_end to opt_end (keeping the same order of arguments + * in each block). 
+ */ +static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char* const* nargv) +{ + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char* swap; + + /* + * compute lengths of blocks and number and size of cycles + */ + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) + { + cstart = panonopt_end + i; + pos = cstart; + for (j = 0; j < cyclelen; j++) + { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char**) nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char**) nargv)[cstart] = swap; + } + } +} + +/* + * parse_long_options -- + * Parse long options in argc/argv argument vector. + * Returns -1 if short_too is set and the option does not match long_options. + */ +static int parse_long_options( + char* const* nargv, char const* options, const struct option* long_options, int* idx, int short_too) +{ + char *current_argv, *has_equal; + size_t current_argv_len; + int i, ambiguous, match; + +#define IDENTICAL_INTERPRETATION(_x, _y) \ + (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag \ + && long_options[(_x)].val == long_options[(_y)].val) + + current_argv = place; + match = -1; + ambiguous = 0; + + optind++; + + if ((has_equal = strchr(current_argv, '=')) != NULL) + { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } + else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) + { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) + { + /* exact match */ + match = i; + ambiguous = 0; + break; + } + /* + * If this is a known short option, don't allow + * a partial match of a single character. + */ + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else if (!IDENTICAL_INTERPRETATION(i, match)) + ambiguous = 1; + } + if (ambiguous) + { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int) current_argv_len, current_argv); + optopt = 0; + return (BADCH); + } + if (match != -1) + { /* option found */ + if (long_options[match].has_arg == no_argument && has_equal) + { + if (PRINT_ERROR) + warnx(noarg, (int) current_argv_len, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return (BADARG); + } + if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument) + { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == required_argument) + { + /* + * optional argument doesn't use next nargv + */ + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) && (optarg == NULL)) + { + /* + * Missing argument; leading ':' indicates no error + * should be generated. 
+ */ + if (PRINT_ERROR) + warnx(recargstring, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return (BADARG); + } + } + else + { /* unknown option */ + if (short_too) + { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return (BADCH); + } + if (idx) + *idx = match; + if (long_options[match].flag) + { + *long_options[match].flag = long_options[match].val; + return (0); + } + else + return (long_options[match].val); +#undef IDENTICAL_INTERPRETATION +} + +/* + * getopt_internal -- + * Parse argc/argv argument vector. Called by user level routines. + */ +static int getopt_internal( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx, int flags) +{ + char const* oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; + + if (options == NULL) + return (-1); + + /* + * XXX Some GNU programs (like cvs) set optind to 0 instead of + * XXX using optreset. Work around this braindamage. + */ + if (optind == 0) + optind = optreset = 1; + + /* + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + * + * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or + * optreset != 0 for GNU compatibility. + */ + if (posixly_correct == -1 || optreset != 0) + posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); + if (*options == '-') + flags |= FLAG_ALLARGS; + else if (posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + if (*options == '+' || *options == '-') + options++; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; +start: + if (optreset || !*place) + { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) + { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) + { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + else if (nonopt_start != -1) + { + /* + * If we skipped non-options, set optind + * to the first of them. + */ + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + if (*(place = nargv[optind]) != '-' || (place[1] == '\0' && strchr(options, '-') == NULL)) + { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) + { + /* + * GNU extension: + * return non-option as argument to option 1 + */ + optarg = nargv[optind++]; + return (INORDER); + } + if (!(flags & FLAG_PERMUTE)) + { + /* + * If no permutation wanted, stop parsing + * at first non-option. + */ + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + nonopt_start = optind - (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* + * If we have "-" do nothing, if "--" we are done. + */ + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') + { + optind++; + place = EMSG; + /* + * We found an option (--), so if we skipped + * non-options, we have to permute. 
+ */ + if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ + if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY))) + { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, idx, short_too); + if (optchar != -1) + { + place = EMSG; + return (optchar); + } + } + + if ((optchar = (int) *place++) == (int) ':' || (optchar == (int) '-' && *place != '\0') + || (oli = strchr(options, optchar)) == NULL) + { + /* + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). + */ + if (optchar == (int) '-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return (BADCH); + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') + { + /* -W long-option */ + if (*place) /* no space */ + /* NOTHING */; + else if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, idx, 0); + place = EMSG; + return (optchar); + } + if (*++oli != ':') + { /* doesn't take argument */ + if (!*place) + ++optind; + } + else + { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') + { /* arg not optional */ + if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return (optchar); +} + +#ifdef REPLACE_GETOPT +/* + * getopt -- + * Parse argc/argv argument vector. + * + * [eventually this will replace the BSD getopt] + */ +int getopt(int nargc, char* const* nargv, char const* options) +{ + + /* + * We don't pass FLAG_PERMUTE to getopt_internal() since + * the BSD getopt(3) (unlike GNU) has never done this. + * + * Furthermore, since many privileged programs call getopt() + * before dropping privileges it makes sense to keep things + * as simple (and bug-free) as possible. + */ + return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); +} +#endif /* REPLACE_GETOPT */ + +/* + * getopt_long -- + * Parse argc/argv argument vector. + */ +int getopt_long(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE)); +} + +/* + * getopt_long_only -- + * Parse argc/argv argument vector. 
+ */ +int getopt_long_only(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE | FLAG_LONGONLY)); +} diff --git a/src/Detector/tensorrt_yolo/common/getoptWin.h b/src/Detector/tensorrt_yolo/common/getoptWin.h new file mode 100644 index 00000000..a1dc6ffa --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/getoptWin.h @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __GETOPT_H__ +/** + * DISCLAIMER + * This file has no copyright assigned and is placed in the Public Domain. + * This file is a part of the w64 mingw-runtime package. + * + * The w64 mingw-runtime package and its code is distributed in the hope that it + * will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR + * IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to + * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#define __GETOPT_H__ + +/* All the headers include this file. */ +#include + +#if defined(WINGETOPT_SHARED_LIB) +#if defined(BUILDING_WINGETOPT_DLL) +#define WINGETOPT_API __declspec(dllexport) +#else +#define WINGETOPT_API __declspec(dllimport) +#endif +#else +#define WINGETOPT_API +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + + WINGETOPT_API extern int optind; /* index of first non-option in argv */ + WINGETOPT_API extern int optopt; /* single option character, as parsed */ + WINGETOPT_API extern int opterr; /* flag to enable built-in diagnostics... */ + /* (user may set to zero, to suppress) */ + + WINGETOPT_API extern char* optarg; /* pointer to argument of current option */ + + extern int getopt(int nargc, char* const* nargv, char const* options); + +#ifdef _BSD_SOURCE +/* + * BSD adds the non-standard `optreset' feature, for reinitialisation + * of `getopt' parsing. We support this feature, for applications which + * proclaim their BSD heritage, before including this header; however, + * to maintain portability, developers are advised to avoid it. + */ +#define optreset __mingw_optreset + extern int optreset; +#endif +#ifdef __cplusplus +} +#endif +/* + * POSIX requires the `getopt' API to be specified in `unistd.h'; + * thus, `unistd.h' includes this header. However, we do not want + * to expose the `getopt_long' or `getopt_long_only' APIs, when + * included in this manner. Thus, close the standard __GETOPT_H__ + * declarations block, and open an additional __GETOPT_LONG_H__ + * specific block, only when *not* __UNISTD_H_SOURCED__, in which + * to declare the extended API. + */ +#endif /* !defined(__GETOPT_H__) */ + +#if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) +#define __GETOPT_LONG_H__ + +#ifdef __cplusplus +extern "C" +{ +#endif + + struct option /* specification for a long form option... 
*/ + { + char const* name; /* option name, without leading hyphens */ + int has_arg; /* does it take an argument? */ + int* flag; /* where to save its status, or NULL */ + int val; /* its associated status value */ + }; + + enum /* permitted values for its `has_arg' field... */ + { + no_argument = 0, /* option never takes an argument */ + required_argument, /* option always requires an argument */ + optional_argument /* option may take an argument */ + }; + + extern int getopt_long( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx); + extern int getopt_long_only( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx); +/* + * Previous MinGW implementation had... + */ +#ifndef HAVE_DECL_GETOPT +/* + * ...for the long form API only; keep this for compatibility. + */ +#define HAVE_DECL_GETOPT 1 +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */ diff --git a/src/Detector/tensorrt_yolo/common/half.h b/src/Detector/tensorrt_yolo/common/half.h index 0755c316..b997e7db 100644 --- a/src/Detector/tensorrt_yolo/common/half.h +++ b/src/Detector/tensorrt_yolo/common/half.h @@ -16,13 +16,14 @@ // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -1522,14 +1523,14 @@ class half /// \return incremented half value half& operator++() { - return *this += 1.0f; + return *this += 1.0F; } /// Prefix decrement. /// \return decremented half value half& operator--() { - return *this -= 1.0f; + return *this -= 1.0F; } /// Postfix increment. diff --git a/src/Detector/tensorrt_yolo/common/logger.cpp b/src/Detector/tensorrt_yolo/common/logger.cpp index 03c64398..909ec0bb 100644 --- a/src/Detector/tensorrt_yolo/common/logger.cpp +++ b/src/Detector/tensorrt_yolo/common/logger.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +18,7 @@ #include "logger.h" #include "ErrorRecorder.h" #include "logging.h" - +using namespace nvinfer1; SampleErrorRecorder gRecorder; namespace sample { diff --git a/src/Detector/tensorrt_yolo/common/logger.h b/src/Detector/tensorrt_yolo/common/logger.h index 3069e8e9..8205e457 100644 --- a/src/Detector/tensorrt_yolo/common/logger.h +++ b/src/Detector/tensorrt_yolo/common/logger.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/src/Detector/tensorrt_yolo/common/logging.h b/src/Detector/tensorrt_yolo/common/logging.h index 78732c10..69273a5e 100644 --- a/src/Detector/tensorrt_yolo/common/logging.h +++ b/src/Detector/tensorrt_yolo/common/logging.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +18,7 @@ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H -#include "NvInferRuntimeCommon.h" +#include "NvInferRuntime.h" #include "sampleOptions.h" #include #include @@ -162,7 +163,7 @@ class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream } LogStreamConsumer(const LogStreamConsumer& other) = delete; LogStreamConsumer() = delete; - ~LogStreamConsumer() = default; + ~LogStreamConsumer() override = default; LogStreamConsumer& operator=(const LogStreamConsumer&) = delete; LogStreamConsumer& operator=(LogStreamConsumer&&) = delete; @@ -291,7 +292,7 @@ class Logger : public nvinfer1::ILogger }; //! - //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, @@ -353,7 +354,7 @@ class Logger : public nvinfer1::ILogger //! //! \brief Define a test for logging //! - //! \param[in] name The name of the test. This should be a string starting with + //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! 
the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" @@ -379,7 +380,8 @@ class Logger : public nvinfer1::ILogger static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv) { // Append TensorRT version as info - const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]"; + const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "] [b" + + std::to_string(NV_TENSORRT_BUILD) + "]"; auto cmdline = genCmdlineString(argc, argv); return defineTest(vname, cmdline); } diff --git a/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h b/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h index c92a1420..67ee6c71 100644 --- a/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h +++ b/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -35,15 +36,13 @@ * */ -using namespace std; - class ParserOnnxConfig : public nvonnxparser::IOnnxConfig { protected: - string mModelFilename{}; - string mTextFilename{}; - string mFullTextFilename{}; + std::string mModelFilename{}; + std::string mTextFilename{}; + std::string mFullTextFilename{}; nvinfer1::DataType mModelDtype; nvonnxparser::IOnnxConfig::Verbosity mVerbosity; bool mPrintLayercInfo; @@ -62,8 +61,7 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig #endif } -protected: - ~ParserOnnxConfig() + ~ParserOnnxConfig() override { #ifdef ONNX_DEBUG if (isDebug()) @@ -74,62 +72,62 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig } public: - virtual void setModelDtype(const nvinfer1::DataType modelDtype) noexcept + void setModelDtype(const nvinfer1::DataType modelDtype) noexcept override { mModelDtype = modelDtype; } - virtual nvinfer1::DataType getModelDtype() const noexcept + nvinfer1::DataType getModelDtype() const noexcept override { return mModelDtype; } - virtual const char* getModelFileName() const noexcept + const char* getModelFileName() const noexcept override { return mModelFilename.c_str(); } - virtual void setModelFileName(const char* onnxFilename) noexcept + void setModelFileName(const char* onnxFilename) noexcept override { - mModelFilename = string(onnxFilename); + mModelFilename = std::string(onnxFilename); } - virtual nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept + nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept override { return mVerbosity; } - virtual void addVerbosity() noexcept + void addVerbosity() noexcept override { ++mVerbosity; } - virtual void reduceVerbosity() noexcept + void reduceVerbosity() noexcept override { --mVerbosity; } - virtual void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept + void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept override { mVerbosity = verbosity; } - virtual const char* getTextFileName() const noexcept + const char* 
getTextFileName() const noexcept override
     {
         return mTextFilename.c_str();
     }
 
-    virtual void setTextFileName(const char* textFilename) noexcept
+    void setTextFileName(const char* textFilename) noexcept override
     {
-        mTextFilename = string(textFilename);
+        mTextFilename = std::string(textFilename);
     }
 
-    virtual const char* getFullTextFileName() const noexcept
+    const char* getFullTextFileName() const noexcept override
     {
         return mFullTextFilename.c_str();
     }
 
-    virtual void setFullTextFileName(const char* fullTextFilename) noexcept
+    void setFullTextFileName(const char* fullTextFilename) noexcept override
     {
-        mFullTextFilename = string(fullTextFilename);
+        mFullTextFilename = std::string(fullTextFilename);
     }
 
-    virtual bool getPrintLayerInfo() const noexcept
+    bool getPrintLayerInfo() const noexcept override
     {
         return mPrintLayercInfo;
     }
 
-    virtual void setPrintLayerInfo(bool src) noexcept
+    void setPrintLayerInfo(bool src) noexcept override
     {
         mPrintLayercInfo = src;
     } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo()
 
@@ -142,12 +140,6 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig
         return false;
 #endif
     }
-
-    virtual void destroy() noexcept
-    {
-        delete this;
-    }
-
 }; // class ParserOnnxConfig
 
 #endif
diff --git a/src/Detector/tensorrt_yolo/common/safeCommon.h b/src/Detector/tensorrt_yolo/common/safeCommon.h
index 3d84b095..f10aad18 100644
--- a/src/Detector/tensorrt_yolo/common/safeCommon.h
+++ b/src/Detector/tensorrt_yolo/common/safeCommon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,13 +18,32 @@
 #ifndef TENSORRT_SAFE_COMMON_H
 #define TENSORRT_SAFE_COMMON_H
 
-#include "NvInferRuntimeCommon.h"
+#include "cuda_runtime.h"
+#include "sampleEntrypoints.h"
+#include <algorithm>
 #include <cstdlib>
+#include <fstream>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <string>
 #include <vector>
+
+// For safeLoadLibrary
+#ifdef _MSC_VER
+// Needed so that the max/min definitions in windows.h do not conflict with std::max/min.
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#else
+#include <dlfcn.h>
+#endif
+#if IS_QNX_SAFE
+#include
+#include <sys/procmgr.h>
+#endif // IS_QNX_SAFE
+
+#undef CHECK
 #define CHECK(status)                                        \
     do                                                       \
     {                                                        \
@@ -31,10 +51,92 @@
         if (ret != 0)                                        \
         {                                                    \
             std::cerr << "Cuda failure: " << ret << std::endl; \
-            abort();                                         \
+            exit(EXIT_FAILURE);                              \
         }                                                    \
     } while (0)
 
+#undef SAFE_ASSERT
+#define SAFE_ASSERT(condition)                                       \
+    do                                                               \
+    {                                                                \
+        if (!(condition))                                            \
+        {                                                            \
+            std::cerr << "Assertion failure: " << #condition << std::endl; \
+            exit(EXIT_FAILURE);                                      \
+        }                                                            \
+    } while (0)
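The redefined CHECK now exits instead of calling abort(), and SAFE_ASSERT is new; both are used throughout the code below. In short:

    // Sketch: the safe-common error macros in use.
    int32_t device{};
    CHECK(cudaGetDevice(&device)); // prints "Cuda failure: <code>" and exits on error
    SAFE_ASSERT(device >= 0 && "device index must be non-negative");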
+//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in.
+//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path.
+inline std::string locateFile(
+    const std::string& filepathSuffix, const std::vector<std::string>& directories, bool reportError = true)
+{
+    const int MAX_DEPTH{10};
+    bool found{false};
+    std::string filepath;
+
+    for (auto& dir : directories)
+    {
+        if (!dir.empty() && dir.back() != '/')
+        {
+#ifdef _MSC_VER
+            filepath = dir + "\\" + filepathSuffix;
+#else
+            filepath = dir + "/" + filepathSuffix;
+#endif
+        }
+        else
+        {
+            filepath = dir + filepathSuffix;
+        }
+
+        for (int i = 0; i < MAX_DEPTH && !found; i++)
+        {
+            const std::ifstream checkFile(filepath);
+            found = checkFile.is_open();
+            if (found)
+            {
+                break;
+            }
+
+            filepath = "../" + filepath; // Try again in parent dir
+        }
+
+        if (found)
+        {
+            break;
+        }
+
+        filepath.clear();
+    }
+
+    // Could not find the file
+    if (filepath.empty())
+    {
+        const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(),
+            [](const std::string& a, const std::string& b) { return a + "\n\t" + b; });
+        std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl;
+
+        if (reportError)
+        {
+            std::cout << "&&&& FAILED" << std::endl;
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    return filepath;
+}
+
+inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int32_t inH, int32_t inW)
+{
+    std::ifstream infile(fileName, std::ifstream::binary);
+    SAFE_ASSERT(infile.is_open() && "Attempting to read from a file that is not open.");
+    std::string magic, w, h, max;
+    infile >> magic >> w >> h >> max;
+    infile.seekg(1, infile.cur);
+    infile.read(reinterpret_cast<char*>(buffer), inH * inW);
+}
+
 namespace samplesCommon
 {
 template <typename T>
@@ -51,11 +153,17 @@ inline uint32_t elementSize(nvinfer1::DataType t)
 {
     switch (t)
     {
+    case nvinfer1::DataType::kINT64: return 8;
     case nvinfer1::DataType::kINT32:
     case nvinfer1::DataType::kFLOAT: return 4;
-    case nvinfer1::DataType::kHALF: return 2;
-    case nvinfer1::DataType::kINT8: return 1;
-    case nvinfer1::DataType::kBOOL: return 1;
+    case nvinfer1::DataType::kHALF:
+    case nvinfer1::DataType::kBF16: return 2;
+    case nvinfer1::DataType::kINT8:
+    case nvinfer1::DataType::kUINT8:
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kFP8: return 1;
+    case nvinfer1::DataType::kINT4:
+        SAFE_ASSERT(false && "Element size is not implemented for sub-byte data-types");
     }
     return 0;
 }
@@ -66,6 +174,205 @@ inline A divUp(A x, B n)
     return (x + n - 1) / n;
 }
 
+inline int64_t volume(nvinfer1::Dims const& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>{});
+}
+
+//! Return m rounded up to nearest multiple of n
+template <typename T1, typename T2>
+inline T1 roundUp(T1 m, T2 n)
+{
+    static_assert(std::is_integral<T1>::value && std::is_integral<T2>::value, "arguments must be integers");
+    static_assert(std::is_signed<T1>::value == std::is_signed<T2>::value, "mixed signedness not allowed");
+    static_assert(sizeof(T1) >= sizeof(T2), "first type must be at least as wide as second type");
+    return ((m + n - 1) / n) * n;
+}
+
+//! comps is the number of components in a vector. Ignored if vecDim < 0.
+inline int64_t volume(nvinfer1::Dims dims, int32_t vecDim, int32_t comps, int32_t batch) +{ + if (vecDim >= 0) + { + dims.d[vecDim] = roundUp(dims.d[vecDim], comps); + } + return samplesCommon::volume(dims) * std::max(batch, 1); +} + +inline int32_t getSMVersion() +{ +#if 0 + // Use default value for 4090 + int32_t major{8}; + int32_t minor{9}; +#else + int32_t major{}; + int32_t minor{}; + int32_t deviceIndex{}; + CHECK(cudaGetDevice(&deviceIndex)); + CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); + CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); +#endif + return ((major << 8) | minor); +} + +inline bool isSMSafe() +{ + const int32_t smVersion = getSMVersion(); + return smVersion == 0x0700 || smVersion == 0x0705 || smVersion == 0x0800 || smVersion == 0x0806 + || smVersion == 0x0807; +} + +inline int32_t calculateSoftmax(float* const prob, int32_t const numDigits) +{ + SAFE_ASSERT(prob != nullptr); + SAFE_ASSERT(numDigits == 10); + float sum{0.0F}; + std::transform(prob, prob + numDigits, prob, [&sum](float v) -> float { + sum += exp(v); + return exp(v); + }); + + SAFE_ASSERT(sum != 0.0F); + std::transform(prob, prob + numDigits, prob, [sum](float v) -> float { return v / sum; }); + int32_t idx = std::max_element(prob, prob + numDigits) - prob; + return idx; +} + +//! +//! \class TrtCudaGraphSafe +//! \brief Managed CUDA graph +//! +class TrtCudaGraphSafe +{ +public: + explicit TrtCudaGraphSafe() = default; + + TrtCudaGraphSafe(const TrtCudaGraphSafe&) = delete; + + TrtCudaGraphSafe& operator=(const TrtCudaGraphSafe&) = delete; + + TrtCudaGraphSafe(TrtCudaGraphSafe&&) = delete; + + TrtCudaGraphSafe& operator=(TrtCudaGraphSafe&&) = delete; + + ~TrtCudaGraphSafe() + { + if (mGraphExec) + { + cudaGraphExecDestroy(mGraphExec); + } + } + + void beginCapture(cudaStream_t& stream) + { + // cudaStreamCaptureModeGlobal is the only allowed mode in SAFE CUDA + CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + } + + bool launch(cudaStream_t& stream) + { + return cudaGraphLaunch(mGraphExec, stream) == cudaSuccess; + } + + void endCapture(cudaStream_t& stream) + { + CHECK(cudaStreamEndCapture(stream, &mGraph)); + CHECK(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); + CHECK(cudaGraphDestroy(mGraph)); + } + + void endCaptureOnError(cudaStream_t& stream) + { + // There are two possibilities why stream capture would fail: + // (1) stream is in cudaErrorStreamCaptureInvalidated state. + // (2) TRT reports a failure. + // In case (1), the returning mGraph should be nullptr. + // In case (2), the returning mGraph is not nullptr, but it should not be used. + const auto ret = cudaStreamEndCapture(stream, &mGraph); + if (ret == cudaErrorStreamCaptureInvalidated) + { + SAFE_ASSERT(mGraph == nullptr); + } + else + { + SAFE_ASSERT(ret == cudaSuccess); + SAFE_ASSERT(mGraph != nullptr); + CHECK(cudaGraphDestroy(mGraph)); + mGraph = nullptr; + } + // Clean up any CUDA error. + cudaGetLastError(); + sample::gLogError << "The CUDA graph capture on the stream has failed." 
<< std::endl;
+    }
+
+private:
+    cudaGraph_t mGraph{};
+    cudaGraphExec_t mGraphExec{};
+};
+
+inline void safeLoadLibrary(const std::string& path)
+{
+#ifdef _MSC_VER
+    void* handle = LoadLibraryA(path.c_str());
+#else
+    int32_t flags{RTLD_LAZY};
+    void* handle = dlopen(path.c_str(), flags);
+#endif
+    if (handle == nullptr)
+    {
+#ifdef _MSC_VER
+        sample::gLogError << "Could not load plugin library: " << path << std::endl;
+#else
+        sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl;
+#endif
+    }
+}
+
+inline std::vector<std::string> safeSplitString(std::string str, char delimiter = ',')
+{
+    std::vector<std::string> splitVect;
+    std::stringstream ss(str);
+    std::string substr;
+
+    while (ss.good())
+    {
+        getline(ss, substr, delimiter);
+        splitVect.emplace_back(std::move(substr));
+    }
+    return splitVect;
+}
+
 } // namespace samplesCommon
 
+namespace safetyCompliance
+{
+inline void initSafeCuda()
+{
+    // According to the CUDA initialization section of the NVIDIA CUDA SAFETY API REFERENCE FOR DRIVE OS,
+    // we need to do the following in order:
+    // 1. Initialize the calling thread with CUDA specific information (call any CUDA RT API identified as init)
+    // 2. Query/Configure and choose the desired CUDA device
+    // 3. CUDA context initialization (call cudaDeviceGetLimit or cuCtxCreate)
+    size_t stackSizeLimit = 0;
+    int32_t deviceIndex = 0;
+    CHECK(cudaGetDevice(&deviceIndex));
+    CHECK(cudaDeviceGetLimit(&stackSizeLimit, cudaLimitStackSize));
+#if IS_QNX_SAFE
+    CHECK(cudaSafeExSelectAPIMode(cudaSafeExAPIModeAsilB));
+#endif // IS_QNX_SAFE
+}
+
+inline void setPromgrAbility()
+{
+#if IS_QNX_SAFE
+    // Comply with DEEPLRN_RES_117 on QNX-safe by dropping the PROCMGR_AID_MEM_PHYS ability and locking out any
+    // further changes
+    procmgr_ability(
+        0, PROCMGR_ADN_NONROOT | PROCMGR_AOP_DENY | PROCMGR_AOP_LOCK | PROCMGR_AID_MEM_PHYS, PROCMGR_AID_EOL);
+#endif // IS_QNX_SAFE
+}
+
+} // namespace safetyCompliance
+
 #endif // TENSORRT_SAFE_COMMON_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleConfig.h b/src/Detector/tensorrt_yolo/common/sampleConfig.h
index 53a78331..801a268a 100644
--- a/src/Detector/tensorrt_yolo/common/sampleConfig.h
+++ b/src/Detector/tensorrt_yolo/common/sampleConfig.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -55,9 +56,9 @@ class SampleConfig : public nvonnxparser::IOnnxConfig bool mDebugBuilder{false}; InputDataFormat mInputDataFormat{InputDataFormat::kASCII}; uint64_t mTopK{0}; - float mFailurePercentage{-1.0f}; - float mTolerance{0.0f}; - float mAbsTolerance{1e-5f}; + float mFailurePercentage{-1.0F}; + float mTolerance{0.0F}; + float mAbsTolerance{1e-5F}; public: SampleConfig() @@ -70,8 +71,7 @@ class SampleConfig : public nvonnxparser::IOnnxConfig #endif } -protected: - ~SampleConfig() + ~SampleConfig() override { #ifdef ONNX_DEBUG if (isDebug()) @@ -82,12 +82,12 @@ class SampleConfig : public nvonnxparser::IOnnxConfig } public: - void setModelDtype(const nvinfer1::DataType mdt) noexcept + void setModelDtype(const nvinfer1::DataType mdt) noexcept override { mModelDtype = mdt; } - nvinfer1::DataType getModelDtype() const noexcept + nvinfer1::DataType getModelDtype() const noexcept override { return mModelDtype; } @@ -102,28 +102,28 @@ class SampleConfig : public nvonnxparser::IOnnxConfig mTF32 = enabled; } - const char* getModelFileName() const noexcept + const char* getModelFileName() const noexcept override { return mModelFilename.c_str(); } - void setModelFileName(const char* onnxFilename) noexcept + void setModelFileName(const char* onnxFilename) noexcept override { mModelFilename = std::string(onnxFilename); } - Verbosity getVerbosityLevel() const noexcept + Verbosity getVerbosityLevel() const noexcept override { return mVerbosity; } - void addVerbosity() noexcept + void addVerbosity() noexcept override { ++mVerbosity; } - void reduceVerbosity() noexcept + void reduceVerbosity() noexcept override { --mVerbosity; } - virtual void setVerbosityLevel(Verbosity v) noexcept + void setVerbosityLevel(Verbosity v) noexcept override { mVerbosity = v; } @@ -135,19 +135,19 @@ class SampleConfig : public nvonnxparser::IOnnxConfig { mEngineFilename = std::string(engineFilename); } - const char* getTextFileName() const noexcept + const char* getTextFileName() const noexcept override { return mTextFilename.c_str(); } - void setTextFileName(const char* textFilename) noexcept + void setTextFileName(const char* textFilename) noexcept override { mTextFilename = std::string(textFilename); } - const char* getFullTextFileName() const noexcept + const char* getFullTextFileName() const noexcept override { return mFullTextFilename.c_str(); } - void setFullTextFileName(const char* fullTextFilename) noexcept + void setFullTextFileName(const char* fullTextFilename) noexcept override { mFullTextFilename = std::string(fullTextFilename); } @@ -161,12 +161,12 @@ class SampleConfig : public nvonnxparser::IOnnxConfig return mLabel; } //!< get the Label - bool getPrintLayerInfo() const noexcept + bool getPrintLayerInfo() const noexcept override { return mPrintLayercInfo; } - void setPrintLayerInfo(bool b) noexcept + void setPrintLayerInfo(bool b) noexcept override { mPrintLayercInfo = b; } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo() @@ -312,7 +312,7 @@ class SampleConfig : public nvonnxparser::IOnnxConfig { return mTimingCacheFilename.c_str(); } - + void setTimingCacheFileName(const char* timingCacheFilename) noexcept { mTimingCacheFilename = std::string(timingCacheFilename); @@ -326,12 +326,6 @@ class 
SampleConfig : public nvonnxparser::IOnnxConfig return false; #endif } - - void destroy() noexcept - { - delete this; - } - }; // class SampleConfig #endif diff --git a/src/Detector/tensorrt_yolo/common/sampleDevice.cpp b/src/Detector/tensorrt_yolo/common/sampleDevice.cpp new file mode 100644 index 00000000..7964aeb5 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleDevice.cpp @@ -0,0 +1,133 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sampleDevice.h" + +#include + +namespace sample +{ + +void cudaCheck(cudaError_t ret, std::ostream& err) +{ + if (ret != cudaSuccess) + { + err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; + exit(EXIT_FAILURE); + } +} + +// Construct GPU UUID string in the same format as nvidia-smi does. +std::string getUuidString(cudaUUID_t uuid) +{ + constexpr int32_t kUUID_SIZE = sizeof(cudaUUID_t); + static_assert(kUUID_SIZE == 16, "Unexpected size for cudaUUID_t!"); + + std::ostringstream ss; + std::vector const splits = {0, 4, 6, 8, 10, kUUID_SIZE}; + + ss << "GPU" << std::hex << std::setfill('0'); + for (int32_t splitIdx = 0; splitIdx < static_cast(splits.size()) - 1; ++splitIdx) + { + ss << "-"; + for (int32_t byteIdx = splits[splitIdx]; byteIdx < splits[splitIdx + 1]; ++byteIdx) + { + ss << std::setw(2) << +static_cast(uuid.bytes[byteIdx]); + } + } + return ss.str(); +} + +void setCudaDevice(int32_t device, std::ostream& os) +{ +#if !TRT_WINML + os << "=== Device Information ===" << std::endl; + + // Get the number of visible GPUs. + int32_t nbDevices{-1}; + cudaCheck(cudaGetDeviceCount(&nbDevices)); + + if (nbDevices <= 0) + { + os << "Cannot find any available devices (GPUs)!" << std::endl; + exit(EXIT_FAILURE); + } + + // Print out the GPU name and PCIe bus ID of each GPU. + os << "Available Devices: " << std::endl; + cudaDeviceProp properties; + for (int32_t deviceIdx = 0; deviceIdx < nbDevices; ++deviceIdx) + { + cudaDeviceProp tempProperties; + cudaCheck(cudaGetDeviceProperties(&tempProperties, deviceIdx)); + + // clang-format off + os << " Device " << deviceIdx << ": \"" << tempProperties.name << "\" UUID: " + << getUuidString(tempProperties.uuid) << std::endl; + // clang-format on + + // Record the properties of the desired GPU. + if (deviceIdx == device) + { + properties = tempProperties; + } + } + + // Exit with error if the requested device ID does not exist. + if (device < 0 || device >= nbDevices) + { + os << "Cannot find device ID " << device << "!" << std::endl; + exit(EXIT_FAILURE); + } + + // Set to the corresponding GPU. + cudaCheck(cudaSetDevice(device)); + + // clang-format off + os << "Selected Device: " << properties.name << std::endl; + os << "Selected Device ID: " << device << std::endl; + os << "Selected Device UUID: " << getUuidString(properties.uuid) << std::endl; + os << "Compute Capability: " << properties.major << "." 
<< properties.minor << std::endl; + os << "SMs: " << properties.multiProcessorCount << std::endl; + os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; + os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; + os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" + << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl; + os << "Application Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; + os << "Application Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; + os << std::endl; + os << "Note: The application clock rates do not reflect the actual clock rates that the GPU is " + << "currently running at." << std::endl; + // clang-format on +#endif +} + +int32_t getCudaDriverVersion() +{ + int32_t version{-1}; + cudaCheck(cudaDriverGetVersion(&version)); + return version; +} + +int32_t getCudaRuntimeVersion() +{ + int32_t version{-1}; + cudaCheck(cudaRuntimeGetVersion(&version)); + return version; +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleDevice.h b/src/Detector/tensorrt_yolo/common/sampleDevice.h index 2053ac7c..986dccb4 100644 --- a/src/Detector/tensorrt_yolo/common/sampleDevice.h +++ b/src/Detector/tensorrt_yolo/common/sampleDevice.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -23,17 +24,13 @@ #include #include +#include "sampleUtils.h" + namespace sample { -inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) -{ - if (ret != cudaSuccess) - { - err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; - abort(); - } -} +//! Check if the CUDA return status shows any error. If so, exit the program immediately. +void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr); class TrtCudaEvent; @@ -238,16 +235,18 @@ class TrtCudaBuffer TrtCudaBuffer(TrtCudaBuffer&& rhs) { - reset(rhs.mPtr); + reset(rhs.mPtr, rhs.mSize); rhs.mPtr = nullptr; + rhs.mSize = 0; } TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs) { if (this != &rhs) { - reset(rhs.mPtr); + reset(rhs.mPtr, rhs.mSize); rhs.mPtr = nullptr; + rhs.mSize = 0; } return *this; } @@ -260,21 +259,24 @@ class TrtCudaBuffer TrtCudaBuffer(size_t size) { A()(&mPtr, size); + mSize = size; } void allocate(size_t size) { reset(); A()(&mPtr, size); + mSize = size; } - void reset(void* ptr = nullptr) + void reset(void* ptr = nullptr, size_t size = 0) { if (mPtr) { D()(mPtr); } mPtr = ptr; + mSize = size; } void* get() const @@ -282,8 +284,14 @@ class TrtCudaBuffer return mPtr; } + size_t getSize() const + { + return mSize; + } + private: void* mPtr{nullptr}; + size_t mSize{0}; }; struct DeviceAllocator @@ -383,39 +391,39 @@ class IMirroredBuffer }; // class IMirroredBuffer //! -//! Class to have a seperate memory buffer for discrete device and host allocations. +//! 
Class to have a separate memory buffer for discrete device and host allocations. //! class DiscreteMirroredBuffer : public IMirroredBuffer { public: - void allocate(size_t size) + void allocate(size_t size) override { mSize = size; mHostBuffer.allocate(size); mDeviceBuffer.allocate(size); } - void* getDeviceBuffer() const + void* getDeviceBuffer() const override { return mDeviceBuffer.get(); } - void* getHostBuffer() const + void* getHostBuffer() const override { return mHostBuffer.get(); } - void hostToDevice(TrtCudaStream& stream) + void hostToDevice(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get())); } - void deviceToHost(TrtCudaStream& stream) + void deviceToHost(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get())); } - size_t getSize() const + size_t getSize() const override { return mSize; } @@ -432,33 +440,33 @@ class DiscreteMirroredBuffer : public IMirroredBuffer class UnifiedMirroredBuffer : public IMirroredBuffer { public: - void allocate(size_t size) + void allocate(size_t size) override { mSize = size; mBuffer.allocate(size); } - void* getDeviceBuffer() const + void* getDeviceBuffer() const override { return mBuffer.get(); } - void* getHostBuffer() const + void* getHostBuffer() const override { return mBuffer.get(); } - void hostToDevice(TrtCudaStream& /*stream*/) + void hostToDevice(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } - void deviceToHost(TrtCudaStream& /*stream*/) + void deviceToHost(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } - size_t getSize() const + size_t getSize() const override { return mSize; } @@ -468,26 +476,70 @@ class UnifiedMirroredBuffer : public IMirroredBuffer TrtManagedBuffer mBuffer; }; // class UnifiedMirroredBuffer -inline void setCudaDevice(int device, std::ostream& os) +//! +//! Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is +//! not possible. +//! +class OutputAllocator : public nvinfer1::IOutputAllocator { - cudaCheck(cudaSetDevice(device)); - - cudaDeviceProp properties; - cudaCheck(cudaGetDeviceProperties(&properties, device)); - -// clang-format off - os << "=== Device Information ===" << std::endl; - os << "Selected Device: " << properties.name << std::endl; - os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl; - os << "SMs: " << properties.multiProcessorCount << std::endl; - os << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; - os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; - os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; - os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" - << " (ECC " << (properties.ECCEnabled != 0 ? 
"enabled" : "disabled") << ")" << std::endl; - os << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; - // clang-format on -} +public: + OutputAllocator(IMirroredBuffer* buffer) + : mBuffer(buffer) + { + } + + void* reallocateOutput( + char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override + { + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. + size = std::max(size, static_cast(1)); + if (size > mSize) + { + mBuffer->allocate(roundUp(size, alignment)); + mSize = size; + } + return mBuffer->getDeviceBuffer(); + } + + //! IMirroredBuffer does not implement Async allocation, hence this is just a wrap around + void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, + cudaStream_t /*stream*/) noexcept override + { + return reallocateOutput(tensorName, currentMemory, size, alignment); + } + + void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override + { + mFinalDims = dims; + } + + IMirroredBuffer* getBuffer() + { + return mBuffer.get(); + } + + nvinfer1::Dims getFinalDims() + { + return mFinalDims; + } + + ~OutputAllocator() override {} + +private: + std::unique_ptr mBuffer; + uint64_t mSize{}; + nvinfer1::Dims mFinalDims; +}; + +//! Set the GPU to run the inference on. +void setCudaDevice(int32_t device, std::ostream& os); + +//! Get the CUDA version of the current CUDA driver. +int32_t getCudaDriverVersion(); + +//! Get the CUDA version of the current CUDA runtime. +int32_t getCudaRuntimeVersion(); } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ new file mode 100644 index 00000000..8ada0526 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ @@ -0,0 +1,1688 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvOnnxParser.h" + +#include "ErrorRecorder.h" +#include "common.h" +#include "half.h" +#include "logger.h" +#include "sampleDevice.h" +#include "sampleEngines.h" +#include "sampleOptions.h" +#include "sampleUtils.h" + +using namespace nvinfer1; + +namespace sample +{ + +namespace +{ + +std::map readScalesFromCalibrationCache(std::string const& calibrationFile) +{ + std::map tensorScales; + std::ifstream cache{calibrationFile}; + if (!cache.is_open()) + { + sample::gLogError << "[TRT] Can not open provided calibration cache file" << std::endl; + return tensorScales; + } + std::string line; + while (std::getline(cache, line)) + { + auto colonPos = line.find_last_of(':'); + if (colonPos != std::string::npos) + { + // Scales should be stored in calibration cache as 32-bit floating numbers encoded as 32-bit integers + int32_t scalesAsInt = std::stoi(line.substr(colonPos + 2, 8), nullptr, 16); + auto const tensorName = line.substr(0, colonPos); + tensorScales[tensorName] = *reinterpret_cast(&scalesAsInt); + } + } + cache.close(); + return tensorScales; +} +} // namespace + +nvinfer1::ICudaEngine* LazilyDeserializedEngine::get() +{ + SMP_RETVAL_IF_FALSE( + !mIsSafe, "Safe mode is enabled, but trying to get standard engine!", nullptr, sample::gLogError); + + if (mEngine == nullptr) + { + SMP_RETVAL_IF_FALSE(getFileReader().isOpen() || !getBlob().empty(), "Engine is empty. Nothing to deserialize!", + nullptr, sample::gLogError); + + using time_point = std::chrono::time_point; + using duration = std::chrono::duration; + time_point const deserializeStartTime{std::chrono::high_resolution_clock::now()}; + + if (mLeanDLLPath.empty()) + { + mRuntime.reset(createRuntime()); + } + else + { + mParentRuntime.reset(createRuntime()); + ASSERT(mParentRuntime.get() != nullptr); + + mRuntime.reset(mParentRuntime->loadRuntime(mLeanDLLPath.c_str())); + } + ASSERT(mRuntime.get() != nullptr); + + if (mVersionCompatible) + { + // Application needs to opt into allowing deserialization of engines with embedded lean runtime. + mRuntime->setEngineHostCodeAllowed(true); + } + + if (!mTempdir.empty()) + { + mRuntime->setTemporaryDirectory(mTempdir.c_str()); + } + + mRuntime->setTempfileControlFlags(mTempfileControls); + + SMP_RETVAL_IF_FALSE(mRuntime != nullptr, "runtime creation failed", nullptr, sample::gLogError); + if (mDLACore != -1) + { + mRuntime->setDLACore(mDLACore); + } + mRuntime->setErrorRecorder(&gRecorder); +#if !TRT_WINML + for (auto const& pluginPath : mDynamicPlugins) + { + mRuntime->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } +#endif + + if (getFileReader().isOpen()) + { + mEngine.reset(mRuntime->deserializeCudaEngine(getFileReader())); + } + else + { + auto const& engineBlob = getBlob(); + mEngine.reset(mRuntime->deserializeCudaEngine(engineBlob.data, engineBlob.size)); + } + SMP_RETVAL_IF_FALSE(mEngine != nullptr, "Engine deserialization failed", nullptr, sample::gLogError); + + time_point const deserializeEndTime{std::chrono::high_resolution_clock::now()}; + sample::gLogInfo << "Engine deserialized in " << duration(deserializeEndTime - deserializeStartTime).count() + << " sec." 
+                         << std::endl;
+    }
+
+    return mEngine.get();
+}
+
+nvinfer1::ICudaEngine* LazilyDeserializedEngine::release()
+{
+    return mEngine.release();
+}
+
+void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector<IOFormat> const& inputFormats,
+    std::vector<IOFormat> const& outputFormats, std::string const& calibrationFile)
+{
+    auto const tensorScales = readScalesFromCalibrationCache(calibrationFile);
+    bool const broadcastInputFormats = broadcastIOFormats(inputFormats, network.getNbInputs());
+    for (int32_t i = 0, n = network.getNbInputs(); i < n; ++i)
+    {
+        int32_t formatIdx = broadcastInputFormats ? 0 : i;
+        if (!inputFormats.empty() && inputFormats[formatIdx].first == DataType::kINT8)
+        {
+            auto* input = network.getInput(i);
+            auto const calibScale = tensorScales.at(input->getName());
+            input->setDynamicRange(-127 * calibScale, 127 * calibScale);
+        }
+    }
+    bool const broadcastOutputFormats = broadcastIOFormats(outputFormats, network.getNbOutputs());
+    for (int32_t i = 0, n = network.getNbOutputs(); i < n; ++i)
+    {
+        int32_t formatIdx = broadcastOutputFormats ? 0 : i;
+        if (!outputFormats.empty() && outputFormats[formatIdx].first == DataType::kINT8)
+        {
+            auto* output = network.getOutput(i);
+            auto const calibScale = tensorScales.at(output->getName());
+            output->setDynamicRange(-127 * calibScale, 127 * calibScale);
+        }
+    }
+}
+
+//!
+//! \brief Generate a network definition for a given model
+//!
+//! \param[in] model Model options for this network
+//! \param[in,out] network Network storing the parsed results
+//! \param[in,out] err Error stream
+//! \param[out] vcPluginLibrariesUsed If not nullptr, will be populated with paths to VC plugin libraries required by
+//! the parsed network.
+//!
+//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid
+//! parser (the returned parser converts to false if tested)
+//!
+//! Constant input dimensions in the model must not be changed in the corresponding
+//! network definition, because its correctness may rely on the constants.
+//!
+//! \see Parser::operator bool()
+//!
+Parser modelToNetwork(ModelOptions const& model, BuildOptions const& build, nvinfer1::INetworkDefinition& network,
+    std::ostream& err, std::vector<std::string>* vcPluginLibrariesUsed)
+{
+    sample::gLogInfo << "Start parsing network model." << std::endl;
+    auto const tBegin = std::chrono::high_resolution_clock::now();
+
+    Parser parser;
+    switch (model.baseModel.format)
+    {
+    case ModelFormat::kONNX:
+    {
+        using namespace nvonnxparser;
+        parser.onnxParser.reset(createONNXParser(network));
+        ASSERT(parser.onnxParser != nullptr);
+#if !TRT_WINML
+        // kNATIVE_INSTANCENORM is ON by default in the parser and must be cleared to use the plugin implementation.
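+        // (build.pluginInstanceNorm is driven by the --pluginInstanceNorm command-line option.)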
+ if (build.pluginInstanceNorm) + { + parser.onnxParser->clearFlag(OnnxParserFlag::kNATIVE_INSTANCENORM); + } +#endif + if (!parser.onnxParser->parseFromFile( + model.baseModel.model.c_str(), static_cast(sample::gLogger.getReportableSeverity()))) + { + err << "Failed to parse onnx file" << std::endl; + parser.onnxParser.reset(); + } +#if !TRT_WINML + if (vcPluginLibrariesUsed && parser.onnxParser.get()) + { + int64_t nbPluginLibs; + char const* const* pluginLibArray = parser.onnxParser->getUsedVCPluginLibraries(nbPluginLibs); + if (nbPluginLibs >= 0) + { + vcPluginLibrariesUsed->reserve(nbPluginLibs); + for (int64_t i = 0; i < nbPluginLibs; ++i) + { + sample::gLogInfo << "Using VC plugin library " << pluginLibArray[i] << std::endl; + vcPluginLibrariesUsed->emplace_back(std::string{pluginLibArray[i]}); + } + } + else + { + sample::gLogWarning << "Failure to query VC plugin libraries required by parsed ONNX network" + << std::endl; + } + } +#endif + break; + } + case ModelFormat::kANY: break; + } + + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const parseTime = std::chrono::duration(tEnd - tBegin).count(); + + sample::gLogInfo << "Finished parsing network model. Parse time: " << parseTime << std::endl; + return parser; +} + +namespace +{ + +class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 +{ +public: + RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + nvinfer1::INetworkDefinition const& network, std::ostream& err); + + ~RndInt8Calibrator() override + { + for (auto& elem : mInputDeviceBuffers) + { + cudaCheck(cudaFree(elem.second), mErr); + } + } + + bool getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept override; + + int32_t getBatchSize() const noexcept override + { + return 1; + } + + const void* readCalibrationCache(size_t& length) noexcept override; + + void writeCalibrationCache(void const*, size_t) noexcept override {} + +private: + int32_t mBatches{}; + int32_t mCurrentBatch{}; + std::string mCacheFile; + std::map mInputDeviceBuffers; + std::vector mCalibrationCache; + std::ostream& mErr; +}; + +RndInt8Calibrator::RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + INetworkDefinition const& network, std::ostream& err) + : mBatches(batches) + , mCurrentBatch(0) + , mCacheFile(cacheFile) + , mErr(err) +{ + std::ifstream tryCache(cacheFile, std::ios::binary); + if (tryCache.good()) + { + return; + } + + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0F, 1.0F); + auto gen = [&generator, &distribution]() { return distribution(generator); }; + + for (int32_t i = 0; i < network.getNbInputs(); i++) + { + auto* input = network.getInput(i); + std::vector rnd_data(elemCount[i]); + std::generate_n(rnd_data.begin(), elemCount[i], gen); + + void* data; + cudaCheck(cudaMalloc(&data, elemCount[i] * sizeof(float)), mErr); + cudaCheck(cudaMemcpy(data, rnd_data.data(), elemCount[i] * sizeof(float), cudaMemcpyHostToDevice), mErr); + + mInputDeviceBuffers.insert(std::make_pair(input->getName(), data)); + } +} + +bool RndInt8Calibrator::getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept +{ + if (mCurrentBatch >= mBatches) + { + return false; + } + + for (int32_t i = 0; i < nbBindings; ++i) + { + bindings[i] = mInputDeviceBuffers[names[i]]; + } + + ++mCurrentBatch; + + return true; +} + +const void* RndInt8Calibrator::readCalibrationCache(size_t& length) noexcept +{ + mCalibrationCache.clear(); 
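+    // The cache file is re-read from disk on every call: TensorRT may invoke readCalibrationCache() more
+    // than once during a build, and the previous contents were discarded above.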
+ std::ifstream input(mCacheFile, std::ios::binary); + input >> std::noskipws; + if (input.good()) + { + std::copy( + std::istream_iterator(input), std::istream_iterator(), std::back_inserter(mCalibrationCache)); + } + + length = mCalibrationCache.size(); + return !mCalibrationCache.empty() ? mCalibrationCache.data() : nullptr; +} + +bool setTensorDynamicRange(INetworkDefinition const& network, float inRange = 2.0F, float outRange = 4.0F) +{ + // Ensure that all layer inputs have a dynamic range. + for (int32_t l = 0; l < network.getNbLayers(); l++) + { + auto* layer = network.getLayer(l); + for (int32_t i = 0; i < layer->getNbInputs(); i++) + { + ITensor* input{layer->getInput(i)}; + // Optional inputs are nullptr here and are from RNN layers. + if (input && !input->dynamicRangeIsSet()) + { + // Concat should propagate dynamic range from outputs to inputs to avoid + // Re-quantization during the concatenation + auto dynRange = (layer->getType() == LayerType::kCONCATENATION) ? outRange : inRange; + if (!input->setDynamicRange(-dynRange, dynRange)) + { + return false; + } + } + } + for (int32_t o = 0; o < layer->getNbOutputs(); o++) + { + ITensor* output{layer->getOutput(o)}; + // Optional outputs are nullptr here and are from RNN layers. + if (output && !output->dynamicRangeIsSet()) + { + // Pooling must have the same input and output dynamic range. + if (layer->getType() == LayerType::kPOOLING) + { + if (!output->setDynamicRange(-inRange, inRange)) + { + return false; + } + } + else + { + if (!output->setDynamicRange(-outRange, outRange)) + { + return false; + } + } + } + } + } + return true; +} + +bool isNonActivationType(nvinfer1::DataType const type) +{ + return type == nvinfer1::DataType::kINT32 || type == nvinfer1::DataType::kINT64 || type == nvinfer1::DataType::kBOOL + || type == nvinfer1::DataType::kUINT8; +} + +void setLayerPrecisions(INetworkDefinition& network, LayerPrecisions const& layerPrecisions) +{ + bool hasLayerPrecisionSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto exactMatch = layerPrecisions.find(layerName); + auto plausibleMatch = findPlausible(layerPrecisions, layerName); + if (exactMatch != layerPrecisions.end()) + { + sample::gLogInfo << "Set layer " << layerName << " to precision " << exactMatch->second << std::endl; + layer->setPrecision(exactMatch->second); + } + else if (plausibleMatch != layerPrecisions.end()) + { + if (isNonActivationType(layer->getPrecision())) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because the " + << " default layer precision is of non-activation type." << std::endl; + continue; + } + if (layer->getType() == nvinfer1::LayerType::kCONSTANT + && (isNonActivationType(static_cast(layer)->getWeights().type))) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " + << "constant layer has weights of non-activation type." << std::endl; + continue; + } + if (layer->getNbInputs() >= 1 && layer->getInput(0)->isShapeTensor()) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this layer " + << "operates on a shape tensor." 
<< std::endl; + continue; + } + if (layer->getNbInputs() >= 1 && isNonActivationType(layer->getInput(0)->getType()) + && layer->getNbOutputs() >= 1 && isNonActivationType(layer->getOutput(0)->getType())) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " + << "layer has input and output of non-activation type." << std::endl; + continue; + } + // All heuristics passed. Set the layer precision. + sample::gLogInfo << "Set layer " << layerName << " to precision " << plausibleMatch->second << std::endl; + layer->setPrecision(plausibleMatch->second); + } + } + + if (hasLayerPrecisionSkipped) + { + sample::gLogInfo << "Skipped setting precisions for some layers. Check verbose logs for more details." + << std::endl; + } +} + +void setLayerOutputTypes(INetworkDefinition& network, LayerOutputTypes const& layerOutputTypes) +{ + bool const hasGlobalOutputType{layerOutputTypes.find("*") != layerOutputTypes.end()}; + auto const globalOutputType = hasGlobalOutputType ? layerOutputTypes.at("*").at(0) : nvinfer1::DataType::kFLOAT; + bool hasLayerOutputTypeSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto const nbOutputs = layer->getNbOutputs(); + auto exactMatch = layerOutputTypes.find(layerName); + auto plausibleMatch = findPlausible(layerOutputTypes, layerName); + if (exactMatch != layerOutputTypes.end()) + { + auto const& outputTypes = exactMatch->second; + bool const isBroadcast = (outputTypes.size() == 1); + if (!isBroadcast && static_cast(outputTypes.size()) != nbOutputs) + { + sample::gLogError << "Layer " << layerName << " has " << nbOutputs << " outputs but " + << outputTypes.size() << " output types are given in --layerOutputTypes flag." + << std::endl; + throw std::invalid_argument("Invalid --layerOutputTypes flag."); + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) + { + auto const outputType = outputTypes.at(isBroadcast ? 0 : outputIdx); + sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType + << std::endl; + layer->setOutputType(outputIdx, outputType); + } + } + else if (plausibleMatch != layerOutputTypes.end()) + { + auto const& outputTypes = plausibleMatch->second; + bool const isBroadcast = (outputTypes.size() == 1); + + // We should not set the layer output types if its default precision is INT32 or Bool. + if (layer->getPrecision() == nvinfer1::DataType::kINT32 + || layer->getPrecision() == nvinfer1::DataType::kBOOL) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because the " + << " default layer precision is INT32 or Bool." << std::endl; + continue; + } + // We should not set the constant layer output types if its weights are in INT32. + if (layer->getType() == nvinfer1::LayerType::kCONSTANT + && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because this " + << "constant layer has INT32 weights." << std::endl; + continue; + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) + { + // We should not set the output type if the output is a shape tensor. 
+ if (layer->getOutput(0)->isShapeTensor()) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output type for output " << outputIdx << " of layer " + << layerName << " because it is a shape tensor." << std::endl; + continue; + } + + auto const outputType = outputTypes.at(isBroadcast ? 0 : outputIdx); + sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType + << std::endl; + layer->setOutputType(outputIdx, globalOutputType); + } + } + } + + if (hasLayerOutputTypeSkipped) + { + sample::gLogInfo << "Skipped setting output types for some layers. Check verbose logs for more details." + << std::endl; + } +} + +void setLayerDeviceTypes( + INetworkDefinition const& network, IBuilderConfig& config, LayerDeviceTypes const& layerDeviceTypes) +{ + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto match = findPlausible(layerDeviceTypes, layerName); + if (match != layerDeviceTypes.end()) + { + DeviceType const deviceType = match->second; + sample::gLogInfo << "Set layer " << layerName << " to device type " << deviceType << std::endl; + config.setDeviceType(layer, deviceType); + } + } +} + +void markDebugTensors(INetworkDefinition& network, StringSet const& debugTensors) +{ + for (int64_t inputIndex = 0; inputIndex < network.getNbInputs(); ++inputIndex) + { + auto* t = network.getInput(inputIndex); + auto const tensorName = t->getName(); + if (debugTensors.count(tensorName) > 0) + { + network.markDebug(*t); + } + } + for (int64_t layerIndex = 0; layerIndex < network.getNbLayers(); ++layerIndex) + { + auto* layer = network.getLayer(layerIndex); + for (int64_t outputIndex = 0; outputIndex < layer->getNbOutputs(); ++outputIndex) + { + auto* t = layer->getOutput(outputIndex); + auto const tensorName = t->getName(); + if (debugTensors.count(tensorName) > 0) + { + network.markDebug(*t); + } + } + } +} + +void setMemoryPoolLimits(IBuilderConfig& config, BuildOptions const& build) +{ + auto const roundToBytes = [](double const size, bool fromMB = true) { + return static_cast(size * (fromMB ? 1.0_MiB : 1.0_KiB)); + }; + if (build.workspace >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, roundToBytes(build.workspace)); + } + if (build.dlaSRAM >= 0) + { + size_t const sizeInBytes = roundToBytes(build.dlaSRAM); + size_t sizeInPowerOf2{1}; + // Using 2^30 bytes as a loose upper bound to prevent the possibility of overflows and infinite loops. + while (sizeInPowerOf2 < 31 && (static_cast(1) << sizeInPowerOf2) <= sizeInBytes) + { + ++sizeInPowerOf2; + } + --sizeInPowerOf2; + if (sizeInPowerOf2 == 30) + { + sample::gLogWarning + << "User-specified DLA managed SRAM size is too large and has been clipped to 2^30 bytes. " + << "Please make sure that this is the intended managed SRAM size." 
<< std::endl; + } + config.setMemoryPoolLimit(MemoryPoolType::kDLA_MANAGED_SRAM, static_cast(1) << sizeInPowerOf2); + } + if (build.dlaLocalDRAM >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_LOCAL_DRAM, roundToBytes(build.dlaLocalDRAM)); + } + if (build.dlaGlobalDRAM >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_GLOBAL_DRAM, roundToBytes(build.dlaGlobalDRAM)); + } + if (build.tacticSharedMem >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kTACTIC_SHARED_MEMORY, roundToBytes(build.tacticSharedMem, false)); + } +} + +void setPreviewFeatures(IBuilderConfig& config, BuildOptions const& build) +{ + auto const setFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (build.previewFeatures.find(featVal) != build.previewFeatures.end()) + { + config.setPreviewFeature(feat, build.previewFeatures.at(featVal)); + } + }; + setFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); +} + +} // namespace + +bool setupNetworkAndConfig(BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, + INetworkDefinition& network, IBuilderConfig& config, std::unique_ptr& calibrator, + std::ostream& err, std::vector>& sparseWeights) +{ + std::vector profiles{}; + profiles.resize(build.optProfiles.size()); + for (auto& profile : profiles) + { + profile = builder.createOptimizationProfile(); + } + + bool hasDynamicShapes{false}; + + bool broadcastInputFormats = broadcastIOFormats(build.inputFormats, network.getNbInputs()); + + // Check if the provided input tensor names match the input tensors of the engine. + // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. + for (auto const& shapes : build.optProfiles) + { + for (auto const& shape : shapes) + { + bool tensorNameFound{false}; + for (int32_t i = 0; i < network.getNbInputs(); ++i) + { + if (matchStringWithOneWildcard(shape.first, network.getInput(i)->getName())) + { + tensorNameFound = true; + break; + } + } + if (!tensorNameFound) + { + sample::gLogError << "Cannot find input tensor with name \"" << shape.first << "\" in the network " + << "inputs! Please make sure the input tensor names are correct." << std::endl; + return false; + } + } + } + + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) + { + // Set formats and data types of inputs + auto* input = network.getInput(i); + if (!build.inputFormats.empty()) + { + int32_t inputFormatIndex = broadcastInputFormats ? 0 : i; + input->setType(build.inputFormats[inputFormatIndex].first); + input->setAllowedFormats(build.inputFormats[inputFormatIndex].second); + } + + auto const dims = input->getDimensions(); + auto const isScalar = dims.nbDims == 0; + auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || input->isShapeTensor(); + if (isDynamicInput) + { + hasDynamicShapes = true; + for (size_t i = 0; i < build.optProfiles.size(); i++) + { + auto const& optShapes = build.optProfiles[i]; + auto profile = profiles[i]; + auto const tensorName = input->getName(); + auto shape = findPlausible(optShapes, tensorName); + ShapeRange shapes{}; + + // If no shape is provided, set dynamic dimensions to 1. 
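+                // For example, an input declared as [-1, 3, -1, -1] with no shapes provided is built with
+                // MIN = OPT = MAX = [1, 3, 1, 1].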
+                if (shape == optShapes.end())
+                {
+                    constexpr int32_t kDEFAULT_DIMENSION{1};
+                    std::vector<int32_t> staticDims;
+                    if (input->isShapeTensor())
+                    {
+                        if (isScalar)
+                        {
+                            staticDims.push_back(1);
+                        }
+                        else
+                        {
+                            staticDims.resize(dims.d[0]);
+                            std::fill(staticDims.begin(), staticDims.end(), kDEFAULT_DIMENSION);
+                        }
+                    }
+                    else
+                    {
+                        staticDims.resize(dims.nbDims);
+                        std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(),
+                            [&](int dimension) { return dimension > 0 ? dimension : kDEFAULT_DIMENSION; });
+                    }
+                    sample::gLogWarning << "Dynamic dimensions required for input: " << tensorName
+                                        << ", but no shapes were provided. Automatically overriding shape to: "
+                                        << staticDims << std::endl;
+                    std::fill(shapes.begin(), shapes.end(), staticDims);
+                }
+                else
+                {
+                    shapes = shape->second;
+                }
+
+                std::vector<int32_t> profileDims{};
+                if (input->isShapeTensor())
+                {
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kMIN)];
+                    SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kMIN,
+                                            profileDims.data(), static_cast<int32_t>(profileDims.size())),
+                        "Error in set shape values MIN", false, err);
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kOPT)];
+                    SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kOPT,
+                                            profileDims.data(), static_cast<int32_t>(profileDims.size())),
+                        "Error in set shape values OPT", false, err);
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kMAX)];
+                    SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kMAX,
+                                            profileDims.data(), static_cast<int32_t>(profileDims.size())),
+                        "Error in set shape values MAX", false, err);
+                    sample::gLogInfo << "Set input shape tensor " << tensorName << " for optimization profile " << i
+                                     << " to:"
+                                     << " MIN=" << shapes[static_cast<size_t>(OptProfileSelector::kMIN)]
+                                     << " OPT=" << shapes[static_cast<size_t>(OptProfileSelector::kOPT)]
+                                     << " MAX=" << shapes[static_cast<size_t>(OptProfileSelector::kMAX)] << std::endl;
+                }
+                else
+                {
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kMIN)];
+                    SMP_RETVAL_IF_FALSE(
+                        profile->setDimensions(tensorName, OptProfileSelector::kMIN, toDims(profileDims)),
+                        "Error in set dimensions to profile MIN", false, err);
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kOPT)];
+                    SMP_RETVAL_IF_FALSE(
+                        profile->setDimensions(tensorName, OptProfileSelector::kOPT, toDims(profileDims)),
+                        "Error in set dimensions to profile OPT", false, err);
+                    profileDims = shapes[static_cast<size_t>(OptProfileSelector::kMAX)];
+                    SMP_RETVAL_IF_FALSE(
+                        profile->setDimensions(tensorName, OptProfileSelector::kMAX, toDims(profileDims)),
+                        "Error in set dimensions to profile MAX", false, err);
+                    sample::gLogInfo << "Set shape of input tensor " << tensorName << " for optimization profile " << i
+                                     << " to:"
+                                     << " MIN=" << shapes[static_cast<size_t>(OptProfileSelector::kMIN)]
+                                     << " OPT=" << shapes[static_cast<size_t>(OptProfileSelector::kOPT)]
+                                     << " MAX=" << shapes[static_cast<size_t>(OptProfileSelector::kMAX)] << std::endl;
+                }
+            }
+        }
+    }
+
+    for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++)
+    {
+        auto* output = network.getOutput(i);
+        auto const dims = output->getDimensions();
+        // A shape tensor output with known static dimensions may have dynamic shape values inside it.
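+        // For example, an IShapeLayer output has static dimensions, but the values it carries are only
+        // known at runtime.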
+ auto const isDynamicOutput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || output->isShapeTensor(); + if (isDynamicOutput) + { + hasDynamicShapes = true; + } + } + + if (!hasDynamicShapes && !build.optProfiles[0].empty()) + { + sample::gLogError << "Static model does not take explicit shapes since the shape of inference tensors will be " + "determined by the model itself" + << std::endl; + return false; + } + + if (hasDynamicShapes) + { + for (auto profile : profiles) + { + SMP_RETVAL_IF_FALSE(profile->isValid(), "Required optimization profile is invalid", false, err); + SMP_RETVAL_IF_FALSE( + config.addOptimizationProfile(profile) != -1, "Error in add optimization profile", false, err); + } + } + + bool broadcastOutputFormats = broadcastIOFormats(build.outputFormats, network.getNbOutputs(), false); + + for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) + { + // Set formats and data types of outputs + auto* output = network.getOutput(i); + if (!build.outputFormats.empty()) + { + int32_t outputFormatIndex = broadcastOutputFormats ? 0 : i; + output->setType(build.outputFormats[outputFormatIndex].first); + output->setAllowedFormats(build.outputFormats[outputFormatIndex].second); + } + } + + setMemoryPoolLimits(config, build); + + setPreviewFeatures(config, build); + + if (build.builderOptimizationLevel != defaultBuilderOptimizationLevel) + { + config.setBuilderOptimizationLevel(build.builderOptimizationLevel); + } + + if (build.maxTactics != defaultMaxTactics) + { + config.setMaxNbTactics(build.maxTactics); + } + + if (build.timingCacheMode == TimingCacheMode::kDISABLE) + { + config.setFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + } + + if (build.disableCompilationCache) + { + config.setFlag(BuilderFlag::kDISABLE_COMPILATION_CACHE); + } + + if (build.errorOnTimingCacheMiss) + { + config.setFlag(BuilderFlag::kERROR_ON_TIMING_CACHE_MISS); + } + + if (!build.tf32) + { + config.clearFlag(BuilderFlag::kTF32); + } + + if (build.refittable) + { + config.setFlag(BuilderFlag::kREFIT); + } + + if (build.stripWeights) + { + // The kREFIT_IDENTICAL is enabled by default when kSTRIP_PLAN is on. 
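+        // A stripped plan is smaller because the refittable weights are omitted from the serialized
+        // engine; they must be supplied again through IRefitter, with weights identical to the
+        // build-time ones, before inference.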
+ config.setFlag(BuilderFlag::kSTRIP_PLAN); + } + + if (build.versionCompatible) + { + config.setFlag(BuilderFlag::kVERSION_COMPATIBLE); + } +#if !TRT_WINML + std::vector pluginPaths; + for (auto const& pluginPath : sys.setPluginsToSerialize) + { + sample::gLogVerbose << "Setting plugin to serialize: " << pluginPath << std::endl; + pluginPaths.push_back(pluginPath.c_str()); + } + if (!pluginPaths.empty()) + { + config.setPluginsToSerialize(pluginPaths.data(), pluginPaths.size()); + } +#endif + if (build.excludeLeanRuntime) + { + config.setFlag(BuilderFlag::kEXCLUDE_LEAN_RUNTIME); + } + + if (build.sparsity != SparsityFlag::kDISABLE) + { + config.setFlag(BuilderFlag::kSPARSE_WEIGHTS); + if (build.sparsity == SparsityFlag::kFORCE) + { + sparsify(network, sparseWeights); + } + } + + config.setProfilingVerbosity(build.profilingVerbosity); + config.setAvgTimingIterations(build.avgTiming); + + if (build.fp16) + { + config.setFlag(BuilderFlag::kFP16); + } + if (build.int8) + { + config.setFlag(BuilderFlag::kINT8); + } + if (build.bf16) + { + config.setFlag(BuilderFlag::kBF16); + } + + SMP_RETVAL_IF_FALSE(!(build.int8 && build.fp8), "FP8 and INT8 precisions have been specified", false, err); + + if (build.fp8) + { + config.setFlag(BuilderFlag::kFP8); + } + + if (build.int4) + { + config.setFlag(BuilderFlag::kINT4); + } + + if (build.int8 && !build.fp16) + { + sample::gLogInfo + << "FP32 and INT8 precisions have been specified - more performance might be enabled by additionally " + "specifying --fp16 or --best" + << std::endl; + } + + auto isInt8 = [](const IOFormat& format) { return format.first == DataType::kINT8; }; + auto int8IO = std::count_if(build.inputFormats.begin(), build.inputFormats.end(), isInt8) + + std::count_if(build.outputFormats.begin(), build.outputFormats.end(), isInt8); + + auto hasQDQLayers = [](INetworkDefinition& network) { + // Determine if our network has QDQ layers. + auto const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; i++) + { + auto const& layer = network.getLayer(i); + if (layer->getType() == LayerType::kQUANTIZE || layer->getType() == LayerType::kDEQUANTIZE) + { + return true; + } + } + return false; + }; + + if (!hasQDQLayers(network) && (build.int8 || int8IO) && build.calibration.empty()) + { + // Explicitly set int8 scales if no calibrator is provided and if I/O tensors use int8, + // because auto calibration does not support this case. 
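+        // setTensorDynamicRange() assigns a symmetric range [-r, r] to every tensor, from which TensorRT
+        // derives the int8 scale as roughly r / 127. A minimal sketch of the quantization this implies
+        // (hypothetical values, not part of this sample):
+        //   float const r = 2.0F;                // dynamic range bound
+        //   float const scale = r / 127.0F;      // ~0.0157
+        //   int8_t const q = static_cast<int8_t>(std::round(x / scale));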
+ SMP_RETVAL_IF_FALSE(setTensorDynamicRange(network), "Error in set tensor dynamic range.", false, err); + } + else if (build.int8) + { + if (!hasQDQLayers(network) && int8IO) + { + try + { + // Set dynamic ranges of int8 inputs / outputs to match scales loaded from calibration cache + // TODO http://nvbugs/3262234 Change the network validation so that this workaround can be removed + setTensorScalesFromCalibration(network, build.inputFormats, build.outputFormats, build.calibration); + } + catch (std::exception&) + { + sample::gLogError + << "Int8IO was specified but impossible to read tensor scales from provided calibration cache file" + << std::endl; + return false; + } + } + IOptimizationProfile* profileCalib{nullptr}; + if (!build.shapesCalib.empty()) + { + profileCalib = builder.createOptimizationProfile(); + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) + { + auto* input = network.getInput(i); + Dims profileDims{}; + auto const tensorName = input->getName(); + auto shape = findPlausible(build.shapesCalib, tensorName); + + if (shape == build.shapesCalib.end()) + { + std::ostringstream msg; + msg << "Calibration profile for tensor " << tensorName << " cannot be found!"; + throw std::invalid_argument(msg.str()); + } + + auto shapesCalib = shape->second; + profileDims = toDims(shapesCalib[static_cast(OptProfileSelector::kOPT)]); + // Here we check only kMIN as all profileDims are the same. + SMP_RETVAL_IF_FALSE(profileCalib->setDimensions(tensorName, OptProfileSelector::kMIN, profileDims), + "Error in set dimensions to calibration profile OPT", false, err); + profileCalib->setDimensions(tensorName, OptProfileSelector::kOPT, profileDims); + profileCalib->setDimensions(tensorName, OptProfileSelector::kMAX, profileDims); + sample::gLogInfo << "Set calibration profile for input tensor " << tensorName << " to " << profileDims + << std::endl; + } + SMP_RETVAL_IF_FALSE(profileCalib->isValid(), "Calibration profile is invalid", false, err); + SMP_RETVAL_IF_FALSE( + config.setCalibrationProfile(profileCalib), "Error in set calibration profile", false, err); + } + + std::vector elemCount{}; + for (int i = 0; i < network.getNbInputs(); i++) + { + auto* input = network.getInput(i); + auto const dims = input->getDimensions(); + auto const isDynamicInput + = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + + if (profileCalib) + { + elemCount.push_back(volume(profileCalib->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } + else if (!profiles.empty() && isDynamicInput) + { + elemCount.push_back( + volume(profiles[build.calibProfile]->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } + else + { + elemCount.push_back(volume(input->getDimensions())); + } + } + + calibrator.reset(new RndInt8Calibrator(1, elemCount, build.calibration, network, err)); + config.setInt8Calibrator(calibrator.get()); + } + + if (build.directIO) + { + config.setFlag(BuilderFlag::kDIRECT_IO); + } + + switch (build.precisionConstraints) + { + case PrecisionConstraints::kNONE: + // It's the default for TensorRT. 
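+        // i.e. no precision constraints are applied and the builder chooses precisions freely.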
+        break;
+    case PrecisionConstraints::kOBEY: config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); break;
+    case PrecisionConstraints::kPREFER: config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); break;
+    }
+
+    if (!build.layerPrecisions.empty() && build.precisionConstraints != PrecisionConstraints::kNONE)
+    {
+        setLayerPrecisions(network, build.layerPrecisions);
+    }
+
+    if (!build.layerOutputTypes.empty() && build.precisionConstraints != PrecisionConstraints::kNONE)
+    {
+        setLayerOutputTypes(network, build.layerOutputTypes);
+    }
+
+    if (!build.layerDeviceTypes.empty())
+    {
+        setLayerDeviceTypes(network, config, build.layerDeviceTypes);
+    }
+
+    if (!build.debugTensors.empty())
+    {
+        markDebugTensors(network, build.debugTensors);
+    }
+
+    if (build.safe && sys.DLACore == -1)
+    {
+        config.setEngineCapability(EngineCapability::kSAFETY);
+    }
+
+    if (build.restricted)
+    {
+        config.setFlag(BuilderFlag::kSAFETY_SCOPE);
+    }
+
+    if (sys.DLACore != -1)
+    {
+        if (sys.DLACore < builder.getNbDLACores())
+        {
+            config.setDefaultDeviceType(DeviceType::kDLA);
+            config.setDLACore(sys.DLACore);
+            config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
+            if (build.buildDLAStandalone)
+            {
+                config.setEngineCapability(EngineCapability::kDLA_STANDALONE);
+            }
+            if (build.allowGPUFallback)
+            {
+                config.setFlag(BuilderFlag::kGPU_FALLBACK);
+            }
+            else
+            {
+                // Reformatting runs on GPU, so avoid I/O reformatting.
+                config.setFlag(BuilderFlag::kDIRECT_IO);
+            }
+            if (!build.int8)
+            {
+                config.setFlag(BuilderFlag::kFP16);
+            }
+        }
+        else
+        {
+            err << "Cannot create DLA engine, " << sys.DLACore << " not available" << std::endl;
+            return false;
+        }
+    }
+
+    if (build.enabledTactics || build.disabledTactics)
+    {
+        TacticSources tacticSources = config.getTacticSources();
+        tacticSources |= build.enabledTactics;
+        tacticSources &= ~build.disabledTactics;
+        config.setTacticSources(tacticSources);
+    }
+
+    config.setHardwareCompatibilityLevel(build.hardwareCompatibilityLevel);
+    config.setRuntimePlatform(build.runtimePlatform);
+
+    if (build.maxAuxStreams != defaultMaxAuxStreams)
+    {
+        config.setMaxAuxStreams(build.maxAuxStreams);
+    }
+
+    if (build.allowWeightStreaming)
+    {
+        config.setFlag(BuilderFlag::kWEIGHT_STREAMING);
+    }
+
+    return true;
+}
+
+//!
+//! \brief Create a serialized engine for a network definition
+//!
+//! \return Whether the engine creation succeeds or fails.
+//!
+bool networkToSerializedEngine(
+    BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, BuildEnvironment& env, std::ostream& err)
+{
+    std::unique_ptr<IBuilderConfig> config{builder.createBuilderConfig()};
+    std::unique_ptr<IInt8Calibrator> calibrator;
+    std::vector<std::vector<char>> sparseWeights;
+    SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", false, err);
+    SMP_RETVAL_IF_FALSE(
+        setupNetworkAndConfig(build, sys, builder, *env.network, *config, calibrator, err, sparseWeights),
+        "Network And Config setup failed", false, err);
+
+    std::unique_ptr<nvinfer1::ITimingCache> timingCache{};
+    // Try to load cache from file. Create a fresh cache if the file doesn't exist
+    if (build.timingCacheMode == TimingCacheMode::kGLOBAL)
+    {
+        timingCache
+            = samplesCommon::buildTimingCacheFromFile(gLogger.getTRTLogger(), *config, build.timingCacheFile, err);
+    }
+
+    // CUDA stream used for profiling by the builder.
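+    // A dedicated stream keeps the builder's tactic timing separate from any other work the application
+    // may have queued on the default stream.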
+ auto profileStream = samplesCommon::makeCudaStream(); + SMP_RETVAL_IF_FALSE(profileStream != nullptr, "Cuda stream creation failed", false, err); + config->setProfileStream(*profileStream); + + auto const tBegin = std::chrono::high_resolution_clock::now(); + std::unique_ptr serializedEngine{builder.buildSerializedNetwork(*env.network, *config)}; + SMP_RETVAL_IF_FALSE(serializedEngine != nullptr, "Engine could not be created from network", false, err); + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const buildTime = std::chrono::duration(tEnd - tBegin).count(); + sample::gLogInfo << "Engine built in " << buildTime << " sec." << std::endl; + sample::gLogInfo << "Created engine with size: " << (serializedEngine->size() / 1.0_MiB) << " MiB" << std::endl; + + env.engine.setBlob(serializedEngine); + + if (build.timingCacheMode == TimingCacheMode::kGLOBAL) + { + auto timingCache = config->getTimingCache(); + samplesCommon::updateTimingCacheFile(gLogger.getTRTLogger(), build.timingCacheFile, timingCache, builder); + } + + return true; +} + +//! +//! \brief Parse a given model, create a network and an engine. +//! +bool modelToBuildEnv( + ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err) +{ + env.builder.reset(createBuilder()); + SMP_RETVAL_IF_FALSE(env.builder != nullptr, "Builder creation failed", false, err); + env.builder->setErrorRecorder(&gRecorder); + auto networkFlags = (build.stronglyTyped) + ? 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kSTRONGLY_TYPED) + : 0U; +#if !TRT_WINML + for (auto const& pluginPath : sys.dynamicPlugins) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } +#endif + env.network.reset(env.builder->createNetworkV2(networkFlags)); + + std::vector vcPluginLibrariesUsed; + SMP_RETVAL_IF_FALSE(env.network != nullptr, "Network creation failed", false, err); + env.parser + = modelToNetwork(model, build, *env.network, err, build.versionCompatible ? &vcPluginLibrariesUsed : nullptr); + SMP_RETVAL_IF_FALSE(env.parser.operator bool(), "Parsing model failed", false, err); + +#if !TRT_WINML + if (build.versionCompatible && !sys.ignoreParsedPluginLibs && !vcPluginLibrariesUsed.empty()) + { + sample::gLogInfo << "The following plugin libraries were identified by the parser as required for a " + "version-compatible engine:" + << std::endl; + for (auto const& lib : vcPluginLibrariesUsed) + { + sample::gLogInfo << " " << lib << std::endl; + } + if (!build.excludeLeanRuntime) + { + sample::gLogInfo << "These libraries will be added to --setPluginsToSerialize since --excludeLeanRuntime " + "was not specified." + << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), + std::back_inserter(sys.setPluginsToSerialize)); + } + sample::gLogInfo << "These libraries will be added to --dynamicPlugins for use at inference time." << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), std::back_inserter(sys.dynamicPlugins)); + + // Implicitly-added plugins from ONNX parser should be loaded into plugin registry as well. + for (auto const& pluginPath : vcPluginLibrariesUsed) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } + + sample::gLogInfo << "Use --ignoreParsedPluginLibs to disable this behavior." 
+                         << std::endl;
+    }
+#endif
+
+    SMP_RETVAL_IF_FALSE(
+        networkToSerializedEngine(build, sys, *env.builder, env, err), "Building engine failed", false, err);
+    return true;
+}
+
+namespace
+{
+std::pair<std::vector<std::string>, std::vector<WeightsRole>> getLayerWeightsRolePair(IRefitter& refitter)
+{
+    // Get number of refittable items.
+    auto const nbAll = refitter.getAll(0, nullptr, nullptr);
+    std::vector<char const*> layerNames(nbAll);
+    // Allocate buffers for the items and get them.
+    std::vector<WeightsRole> weightsRoles(nbAll);
+    refitter.getAll(nbAll, layerNames.data(), weightsRoles.data());
+    std::vector<std::string> layerNameStrs(nbAll);
+    std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) {
+        if (name == nullptr)
+        {
+            return std::string{};
+        }
+        return std::string{name};
+    });
+    return {layerNameStrs, weightsRoles};
+}
+
+std::pair<std::vector<std::string>, std::vector<WeightsRole>> getMissingLayerWeightsRolePair(IRefitter& refitter)
+{
+    // Get number of missing refittable items.
+    auto const nbMissing = refitter.getMissing(0, nullptr, nullptr);
+    std::vector<char const*> layerNames(nbMissing);
+    // Allocate buffers for the items and get them.
+    std::vector<WeightsRole> weightsRoles(nbMissing);
+    refitter.getMissing(nbMissing, layerNames.data(), weightsRoles.data());
+    std::vector<std::string> layerNameStrs(nbMissing);
+    std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) {
+        if (name == nullptr)
+        {
+            return std::string{};
+        }
+        return std::string{name};
+    });
+    return {layerNameStrs, weightsRoles};
+}
+} // namespace
+
+bool loadStreamingEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err)
+{
+    auto& reader = env.engine.getFileReader();
+    SMP_RETVAL_IF_FALSE(reader.open(filepath), "", false, err << "Error opening engine file: " << filepath);
+    return true;
+}
+
+bool loadEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err)
+{
+    auto const tBegin = std::chrono::high_resolution_clock::now();
+    std::ifstream engineFile(filepath, std::ios::binary);
+    SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error opening engine file: " << filepath);
+    engineFile.seekg(0, std::ifstream::end);
+    int64_t fsize = engineFile.tellg();
+    engineFile.seekg(0, std::ifstream::beg);
+
+    std::vector<uint8_t> engineBlob(fsize);
+    engineFile.read(reinterpret_cast<char*>(engineBlob.data()), fsize);
+    SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error loading engine file: " << filepath);
+    auto const tEnd = std::chrono::high_resolution_clock::now();
+    float const loadTime = std::chrono::duration<float>(tEnd - tBegin).count();
+    sample::gLogInfo << "Engine loaded in " << loadTime << " sec." << std::endl;
+    sample::gLogInfo << "Loaded engine with size: " << (fsize / 1.0_MiB) << " MiB" << std::endl;
+
+    env.engine.setBlob(std::move(engineBlob));
+
+    return true;
+}
+
+bool printPlanVersion(BuildEnvironment& env, std::ostream& err)
+{
+    constexpr int64_t kPLAN_SIZE{28};
+    std::vector<char> data(kPLAN_SIZE);
+    auto blob = data.data();
+
+    auto& reader = env.engine.getFileReader();
+    if (reader.isOpen())
+    {
+        SMP_RETVAL_IF_FALSE(
+            reader.read(data.data(), kPLAN_SIZE) == kPLAN_SIZE, "Failed to read plan file", false, err);
+    }
+    else
+    {
+        SMP_RETVAL_IF_FALSE(env.engine.getBlob().data != nullptr, "Plan file is empty", false, err);
+        SMP_RETVAL_IF_FALSE(env.engine.getBlob().size >= 28, "Plan file is incorrect", false, err);
+        blob = static_cast<char*>(env.engine.getBlob().data);
+    }
+    auto blob32 = reinterpret_cast<uint32_t const*>(blob);
+
+    //!
+
+bool printPlanVersion(BuildEnvironment& env, std::ostream& err)
+{
+    constexpr int64_t kPLAN_SIZE{28};
+    std::vector<uint8_t> data(kPLAN_SIZE);
+    auto blob = data.data();
+
+    auto& reader = env.engine.getFileReader();
+    if (reader.isOpen())
+    {
+        SMP_RETVAL_IF_FALSE(reader.read(data.data(), kPLAN_SIZE) == kPLAN_SIZE, "Failed to read plan file", false, err);
+    }
+    else
+    {
+        SMP_RETVAL_IF_FALSE(env.engine.getBlob().data != nullptr, "Plan file is empty", false, err);
+        SMP_RETVAL_IF_FALSE(env.engine.getBlob().size >= 28, "Plan file is incorrect", false, err);
+        blob = static_cast<uint8_t*>(env.engine.getBlob().data);
+    }
+    auto blob32 = reinterpret_cast<uint32_t const*>(blob);
+
+    //! A correct TensorRT plan file starts with this tag.
+    constexpr uint32_t kPLAN_FILE_TAG{0x74727466U};
+    SMP_RETVAL_IF_FALSE(blob32[0] == kPLAN_FILE_TAG, "Failed to verify a plan tag.", false, err);
+    switch (blob32[1])
+    {
+    case 0U:
+    {
+        // The blob index that stores the plan version may depend on the serialization version.
+        sample::gLogInfo << "Plan was created with TensorRT version " << static_cast<int32_t>(blob[24])
+                         << "." << static_cast<int32_t>(blob[25]) << "." << static_cast<int32_t>(blob[26])
+                         << "." << static_cast<int32_t>(blob[27]) << std::endl;
+        return true;
+    }
+    }
+    sample::gLogError << "Serialization version is not supported." << std::endl;
+    return false;
+}
+
+void dumpRefittable(nvinfer1::ICudaEngine& engine)
+{
+    std::unique_ptr<IRefitter> refitter{createRefitter(engine)};
+    if (refitter == nullptr)
+    {
+        sample::gLogError << "Failed to create a refitter." << std::endl;
+        return;
+    }
+
+    auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter);
+    auto const& layerNames = layerWeightsRolePair.first;
+    auto const& weightsRoles = layerWeightsRolePair.second;
+    auto const nbAll = layerWeightsRolePair.first.size();
+    for (size_t i = 0; i < nbAll; ++i)
+    {
+        sample::gLogInfo << layerNames[i] << " " << weightsRoles[i] << std::endl;
+    }
+}
+
+ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err)
+{
+    BuildEnvironment env(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults());
+    return loadEngineToBuildEnv(engine, env, err) ? env.engine.release() : nullptr;
+}
+
+bool saveEngine(const ICudaEngine& engine, std::string const& fileName, std::ostream& err)
+{
+    std::ofstream engineFile(fileName, std::ios::binary);
+    if (!engineFile)
+    {
+        err << "Cannot open engine file: " << fileName << std::endl;
+        return false;
+    }
+
+    std::unique_ptr<IHostMemory> serializedEngine{engine.serialize()};
+    if (serializedEngine == nullptr)
+    {
+        err << "Engine serialization failed" << std::endl;
+        return false;
+    }
+
+    engineFile.write(static_cast<char const*>(serializedEngine->data()), serializedEngine->size());
+    return !engineFile.fail();
+}
+
+bool getEngineBuildEnv(
+    const ModelOptions& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err)
+{
+    bool createEngineSuccess{false};
+
+    if (build.load)
+    {
+        if (build.safe)
+        {
+            createEngineSuccess = loadEngineToBuildEnv(build.engine, env, err);
+        }
+        else
+        {
+            createEngineSuccess = loadStreamingEngineToBuildEnv(build.engine, env, err);
+        }
+    }
+    else
+    {
+        createEngineSuccess = modelToBuildEnv(model, build, sys, env, err);
+    }
+
+    SMP_RETVAL_IF_FALSE(createEngineSuccess, "Failed to create engine from model or file.", false, err);
+
+    if (build.getPlanVersionOnly && build.load)
+    {
+        SMP_RETVAL_IF_FALSE(printPlanVersion(env, err), "Failed to get plan file version.", false, err);
+        return true;
+    }
+
+    if (build.save)
+    {
+        std::ofstream engineFile(build.engine, std::ios::binary);
+        auto& engineBlob = env.engine.getBlob();
+        engineFile.write(static_cast<char const*>(engineBlob.data), engineBlob.size);
+        SMP_RETVAL_IF_FALSE(!engineFile.fail(), "Saving engine to file failed.", false, err);
+        engineFile.flush();
+        engineFile.close();
+        if (!build.safe)
+        {
+            env.engine.releaseBlob();
+            SMP_RETVAL_IF_FALSE(loadStreamingEngineToBuildEnv(build.engine, env, err), "Reading engine file failed.", false, err);
+        }
+    }
+
+    return true;
+}
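Taken together, the load/build/save paths above reduce to a small driver. The following sketch shows the intended call order; it is illustrative and not part of this patch, and it mirrors the BuildEnvironment construction used by loadEngine() above rather than assuming extra option fields:

bool buildOrLoad(ModelOptions const& model, BuildOptions const& build, SystemOptions& sys)
{
    BuildEnvironment env(/* isSafe */ false, build.versionCompatible, /* DLACore */ -1,
        /* tempdir */ "", getTempfileControlDefaults());
    if (!getEngineBuildEnv(model, build, sys, env, sample::gLogError))
    {
        return false; // Parsing, building, or loading failed; details were written to the log.
    }
    // get() deserializes lazily: the first call materializes the ICudaEngine from the blob.
    return env.engine.get() != nullptr;
}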
+
+// There is no getWeightsName API, so we need to use WeightsRole.
+std::vector<std::pair<WeightsRole, Weights>> getAllRefitWeightsForLayer(const ILayer& l)
+{
+    switch (l.getType())
+    {
+    case LayerType::kCONSTANT:
+    {
+        auto const& layer = static_cast<IConstantLayer const&>(l);
+        auto const weights = layer.getWeights();
+        switch (weights.type)
+        {
+        case DataType::kFLOAT:
+        case DataType::kHALF:
+        case DataType::kBF16:
+        case DataType::kINT8:
+        case DataType::kINT32:
+        case DataType::kINT64: return {std::make_pair(WeightsRole::kCONSTANT, weights)};
+        case DataType::kBOOL:
+        case DataType::kUINT8:
+        case DataType::kFP8:
+        case DataType::kINT4:
+            // Refit is not supported for these types.
+            break;
+        }
+        break;
+    }
+    case LayerType::kCONVOLUTION:
+    {
+        auto const& layer = static_cast<IConvolutionLayer const&>(l);
+        return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()),
+            std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())};
+    }
+    case LayerType::kDECONVOLUTION:
+    {
+        auto const& layer = static_cast<IDeconvolutionLayer const&>(l);
+        return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()),
+            std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())};
+    }
+    case LayerType::kSCALE:
+    {
+        auto const& layer = static_cast<IScaleLayer const&>(l);
+        return {std::make_pair(WeightsRole::kSCALE, layer.getScale()),
+            std::make_pair(WeightsRole::kSHIFT, layer.getShift())};
+    }
+    case LayerType::kACTIVATION:
+    case LayerType::kASSERTION:
+    case LayerType::kCAST:
+    case LayerType::kCONCATENATION:
+    case LayerType::kCONDITION:
+    case LayerType::kCONDITIONAL_INPUT:
+    case LayerType::kCONDITIONAL_OUTPUT:
+    case LayerType::kDEQUANTIZE:
+    case LayerType::kEINSUM:
+    case LayerType::kELEMENTWISE:
+    case LayerType::kFILL:
+    case LayerType::kGATHER:
+    case LayerType::kGRID_SAMPLE:
+    case LayerType::kIDENTITY:
+    case LayerType::kITERATOR:
+    case LayerType::kLOOP_OUTPUT:
+    case LayerType::kLRN:
+    case LayerType::kMATRIX_MULTIPLY:
+    case LayerType::kNMS:
+    case LayerType::kNON_ZERO:
+    case LayerType::kNORMALIZATION:
+    case LayerType::kONE_HOT:
+    case LayerType::kPADDING:
+    case LayerType::kPARAMETRIC_RELU:
+    case LayerType::kPLUGIN:
+    case LayerType::kPLUGIN_V2:
+    case LayerType::kPLUGIN_V3:
+    case LayerType::kPOOLING:
+    case LayerType::kQUANTIZE:
+    case LayerType::kRAGGED_SOFTMAX:
+    case LayerType::kRECURRENCE:
+    case LayerType::kREDUCE:
+    case LayerType::kRESIZE:
+    case LayerType::kREVERSE_SEQUENCE:
+    case LayerType::kSCATTER:
+    case LayerType::kSELECT:
+    case LayerType::kSHAPE:
+    case LayerType::kSHUFFLE:
+    case LayerType::kSLICE:
+    case LayerType::kSOFTMAX:
+    case LayerType::kTOPK:
+    case LayerType::kTRIP_LIMIT:
+    case LayerType::kUNARY: return {};
+    }
+    return {};
+}
+
+bool timeRefit(INetworkDefinition const& network, nvinfer1::ICudaEngine& engine, bool multiThreading)
+{
+    using time_point = std::chrono::time_point<std::chrono::steady_clock>;
+    using durationMs = std::chrono::duration<float, std::milli>;
+
+    auto const nbLayers = network.getNbLayers();
+    std::unique_ptr<IRefitter> refitter{createRefitter(engine)};
+    // Set the maximum number of threads that can be used by the refitter.
+    if (multiThreading && !refitter->setMaxThreads(10))
+    {
+        sample::gLogError << "Failed to set max threads to refitter." << std::endl;
+        return false;
+    }
+    auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter);
+    // We use std::string instead of char const* since we can have copies of layer names.
+    std::set<std::pair<std::string, WeightsRole>> layerRoleSet;
+
+    auto const& layerNames = layerWeightsRolePair.first;
+    auto const& weightsRoles = layerWeightsRolePair.second;
+
+    std::transform(layerNames.begin(), layerNames.end(), weightsRoles.begin(),
+        std::inserter(layerRoleSet, layerRoleSet.begin()),
+        [](std::string const& layerName, WeightsRole const role) { return std::make_pair(layerName, role); });
+
+    auto const isRefittable = [&layerRoleSet](char const* layerName, WeightsRole const role) {
+        return layerRoleSet.find(std::make_pair(layerName, role)) != layerRoleSet.end();
+    };
+
+    auto const setWeights = [&] {
+        for (int32_t i = 0; i < nbLayers; i++)
+        {
+            auto const layer = network.getLayer(i);
+            auto const roleWeightsVec = getAllRefitWeightsForLayer(*layer);
+            for (auto const& roleWeights : roleWeightsVec)
+            {
+                if (isRefittable(layer->getName(), roleWeights.first))
+                {
+                    bool const success = refitter->setWeights(layer->getName(), roleWeights.first, roleWeights.second);
+                    if (!success)
+                    {
+                        return false;
+                    }
+                }
+            }
+        }
+        return true;
+    };
+
+    auto const reportMissingWeights = [&] {
+        auto const& missingPair = getMissingLayerWeightsRolePair(*refitter);
+        auto const& layerNames = missingPair.first;
+        auto const& weightsRoles = missingPair.second;
+        for (size_t i = 0; i < layerNames.size(); ++i)
+        {
+            sample::gLogError << "Missing (" << layerNames[i] << ", " << weightsRoles[i] << ") for refitting."
+                              << std::endl;
+        }
+        return layerNames.empty();
+    };
+
+    // Skip weights validation since we are confident that the new weights are similar to the weights used to build
+    // the engine.
+    refitter->setWeightsValidation(false);
+
+    // Warm up and report missing weights.
+    // We only need to set the weights once; they can be reused in the later refitting passes.
+    bool const success = setWeights() && reportMissingWeights() && refitter->refitCudaEngine();
+    if (!success)
+    {
+        return false;
+    }
+
+    TrtCudaStream stream;
+    constexpr int32_t kLOOP = 10;
+    time_point const refitStartTime{std::chrono::steady_clock::now()};
+    {
+        for (int32_t l = 0; l < kLOOP; l++)
+        {
+            if (!refitter->refitCudaEngineAsync(stream.get()))
+            {
+                return false;
+            }
+        }
+    }
+    stream.synchronize();
+    time_point const refitEndTime{std::chrono::steady_clock::now()};
+
+    sample::gLogInfo << "Engine refitted"
+                     << " in " << durationMs(refitEndTime - refitStartTime).count() / kLOOP << " ms." << std::endl;
+    return true;
+}
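timeRefit() drives the refit through (layer, WeightsRole) pairs because the network object is available here. When only the engine is at hand, newer TensorRT releases also allow refitting by weight name; a hedged sketch follows, where refitByName() and the weights map are illustrative and not part of this patch:

#include <memory>
#include <string>
#include <unordered_map>
#include "NvInfer.h"

// Refit by weight name via IRefitter::setNamedWeights. The engine must have been
// built as refittable; newWeights maps weight names to host buffers.
bool refitByName(nvinfer1::ICudaEngine& engine,
    std::unordered_map<std::string, nvinfer1::Weights> const& newWeights)
{
    std::unique_ptr<nvinfer1::IRefitter> refitter{
        nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger())};
    if (refitter == nullptr)
    {
        return false;
    }
    for (auto const& kv : newWeights)
    {
        if (!refitter->setNamedWeights(kv.first.c_str(), kv.second))
        {
            return false; // Unknown name or type/count mismatch.
        }
    }
    // Every missing weight must be supplied before the refit can succeed.
    return refitter->refitCudaEngine();
}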
+
+namespace
+{
+void* initSafeRuntime()
+{
+    void* handle{nullptr};
+    // libsafe_executor.so will be renamed to libnvinfer_safe.so when TRTS-9421 completes.
+    // Currently libsafe_executor_debug.so for samplesCommon::isDebug() is not ready.
+#define TRTS_9421_COMPLETED 0
+#if TRTS_9421_COMPLETED
+#if !defined(_WIN32)
+    std::string const dllName{"libsafe_executor.so"};
+#if SANITIZER_BUILD
+    handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE);
+#else
+    // RTLD_GLOBAL is used for symbol resolution of subsequently loaded plugin libraries.
+    handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_GLOBAL);
+#endif
+#endif
+#endif // TRTS_9421_COMPLETED
+    return handle;
+}
+
+#if !defined(_WIN32)
+struct DllDeleter
+{
+    void operator()(void* handle)
+    {
+        if (handle != nullptr)
+        {
+            dlclose(handle);
+        }
+    }
+};
+const std::unique_ptr<void, DllDeleter> safeRuntimeLibrary{initSafeRuntime()};
+#endif
+} // namespace
+
+bool hasSafeRuntime()
+{
+    bool ret{false};
+#if !defined(_WIN32)
+    ret = (safeRuntimeLibrary != nullptr);
+#endif
+    return ret;
+}
+
+} // namespace sample
diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.h b/src/Detector/tensorrt_yolo/common/sampleEngines.h
index 620b51a1..ec02e909 100644
--- a/src/Detector/tensorrt_yolo/common/sampleEngines.h
+++ b/src/Detector/tensorrt_yolo/common/sampleEngines.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,58 +18,227 @@
 #ifndef TRT_SAMPLE_ENGINES_H
 #define TRT_SAMPLE_ENGINES_H
 
-#include <iostream>
-#include <vector>
-
 #include "NvInfer.h"
-
-#if (NV_TENSORRT_MAJOR > 7)
-
-#include "NvInferConsistency.h"
-#include "NvInferSafeRuntime.h"
-
-#endif
-
 #include "NvOnnxParser.h"
 #include "sampleOptions.h"
 #include "sampleUtils.h"
+#include "streamReader.h"
+#include <iostream>
+#include <vector>
 
 namespace sample
 {
 
 struct Parser
 {
-    TrtUniquePtr<nvonnxparser::IParser> onnxParser;
+    std::unique_ptr<nvonnxparser::IParser> onnxParser;
 
     operator bool() const
     {
-        return onnxParser.operator bool();
+        return onnxParser != nullptr;
     }
 };
 
-struct BuildEnvironment
-{
-    TrtUniquePtr<nvinfer1::INetworkDefinition> network;
-    //! Parser that creates the network. Must be declared *after* network, so that when
-    //! ~BuildEnvironment() executes, the parser is destroyed before the network is destroyed.
-    Parser parser;
-    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
-    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
-    std::vector<uint8_t> engineBlob;
-};
+//!
+//! \brief Helper struct to facilitate engine serialization and deserialization. It does not own the underlying memory.
+//!
+struct EngineBlob
+{
+    EngineBlob(void* engineData, size_t engineSize)
+        : data(engineData)
+        , size(engineSize)
+    {
+    }
+    void* data{};
+    size_t size{};
+    bool empty() const
+    {
+        return size == 0;
+    }
+};
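EngineBlob is deliberately non-owning, so the backing storage must outlive any blob taken from it. A short illustration; writeBlobToDisk() and the file name are invented for this example and are not part of this patch:

#include <cstdint>
#include <fstream>
#include <vector>

// Illustrative usage of the non-owning EngineBlob view defined above: the vector
// must stay alive for as long as the blob created from it is used.
void writeBlobToDisk(std::vector<uint8_t> const& storage)
{
    EngineBlob const blob{const_cast<uint8_t*>(storage.data()), storage.size()};
    if (!blob.empty())
    {
        std::ofstream file("model.plan", std::ios::binary);
        file.write(static_cast<char const*>(blob.data), blob.size);
    }
}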
 
 //!
-//! \brief Generate a network definition for a given model
-//!
-//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid
-//! parser (the returned parser converts to false if tested)
-//!
-Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err);
+//! \brief A helper class to hold a serialized engine (std or safe) and only deserialize it when being accessed.
+//!
+class LazilyDeserializedEngine
+{
+public:
+    //!
+    //! \brief Delete the default constructor to make sure isSafe and DLACore are always set.
+    //!
+    LazilyDeserializedEngine() = delete;
+
+    //!
+    //! \brief Constructor of LazilyDeserializedEngine.
+    //!
+    LazilyDeserializedEngine(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir,
+        nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath)
+        : mIsSafe(isSafe)
+        , mVersionCompatible(versionCompatible)
+        , mDLACore(DLACore)
+        , mTempdir(tempdir)
+        , mTempfileControls(tempfileControls)
+        , mLeanDLLPath(leanDLLPath)
+    {
+        mFileReader = std::make_unique<samplesCommon::FileStreamReader>();
+    }
+
+    //!
+    //! \brief Move from another LazilyDeserializedEngine.
+    //!
+    LazilyDeserializedEngine(LazilyDeserializedEngine&& other) = default;
+
+    //!
+    //! \brief Delete the copy constructor.
+    //!
+    LazilyDeserializedEngine(LazilyDeserializedEngine const& other) = delete;
+
+    //!
+    //! \brief Get the pointer to the ICudaEngine. Triggers deserialization if it has not already been done.
+    //!
+    nvinfer1::ICudaEngine* get();
+
+    //!
+    //! \brief Get the pointer to the ICudaEngine and release the ownership.
+    //!
+    nvinfer1::ICudaEngine* release();
+
+    //!
+    //! \brief Get the underlying blob storing the serialized engine.
+    //!
+    EngineBlob const getBlob() const
+    {
+        ASSERT((!mFileReader || !mFileReader->isOpen())
+            && "Attempting to access the blob when there is an open file reader!");
+        if (!mEngineBlob.empty())
+        {
+            return EngineBlob{const_cast<void*>(static_cast<void const*>(mEngineBlob.data())), mEngineBlob.size()};
+        }
+        if (mEngineBlobHostMemory.get() != nullptr && mEngineBlobHostMemory->size() > 0)
+        {
+            return EngineBlob{mEngineBlobHostMemory->data(), mEngineBlobHostMemory->size()};
+        }
+        ASSERT(false && "Attempting to access an empty engine!");
+        return EngineBlob{nullptr, 0};
+    }
+
+    //!
+    //! \brief Set the underlying blob storing the serialized engine without duplicating IHostMemory.
+    //!
+    void setBlob(std::unique_ptr<nvinfer1::IHostMemory>& data)
+    {
+        ASSERT(data.get() && data->size() > 0);
+        mEngineBlobHostMemory = std::move(data);
+        mEngine.reset();
+    }
+
+    //!
+    //! \brief Set the underlying blob storing the serialized engine without duplicating vector memory.
+    //!
+    void setBlob(std::vector<uint8_t>&& engineBlob)
+    {
+        mEngineBlob = std::move(engineBlob);
+        mEngine.reset();
+    }
+
+    //!
+    //! \brief Release the underlying blob without deleting the deserialized engine.
+    //!
+    void releaseBlob()
+    {
+        mEngineBlob.clear();
+        mEngineBlobHostMemory.reset();
+    }
+
+    //!
+    //! \brief Get the file stream reader used for deserialization.
+    //!
+    samplesCommon::FileStreamReader& getFileReader()
+    {
+        ASSERT(mFileReader);
+        return *mFileReader;
+    }
+
+    //!
+    //! \brief Get whether safe mode is enabled.
+    //!
+    bool isSafe()
+    {
+        return mIsSafe;
+    }
+
+    void setDynamicPlugins(std::vector<std::string> const& dynamicPlugins)
+    {
+        mDynamicPlugins = dynamicPlugins;
+    }
+
+private:
+    bool mIsSafe{false};
+    bool mVersionCompatible{false};
+    int32_t mDLACore{-1};
+    std::vector<uint8_t> mEngineBlob;
+    std::unique_ptr<samplesCommon::FileStreamReader> mFileReader;
+
+    // Directly use the host memory of a serialized engine instead of duplicating the engine in CPU memory.
+    std::unique_ptr<nvinfer1::IHostMemory> mEngineBlobHostMemory;
+
+    std::string mTempdir{};
+    nvinfer1::TempfileControlFlags mTempfileControls{getTempfileControlDefaults()};
+    std::string mLeanDLLPath{};
+    std::vector<std::string> mDynamicPlugins;
+
+    //! \name Owned TensorRT objects
+    //! Per TensorRT object lifetime requirements as outlined in the developer guide,
+    //! the runtime must remain live while any engines created by the runtime are live.
+    //! DO NOT ADJUST the declaration order here: runtime -> (engine).
+    //! Destruction occurs in reverse declaration order: (engine) -> runtime.
+    //!@{
+
+    //! The runtime used to track the parent of mRuntime if one exists.
+    //! Needed to load mRuntime if lean.so is supplied through a file system path.
+    std::unique_ptr<nvinfer1::IRuntime> mParentRuntime{};
+
+    //! The runtime that is used to deserialize the engine.
+    std::unique_ptr<nvinfer1::IRuntime> mRuntime{};
+
+    //! If mIsSafe is false, this points to the deserialized std engine.
+    std::unique_ptr<nvinfer1::ICudaEngine> mEngine{};
+
+    //!@}
+};
+
+struct BuildEnvironment
+{
+    BuildEnvironment() = delete;
+    BuildEnvironment(BuildEnvironment const& other) = delete;
+    BuildEnvironment(BuildEnvironment&& other) = delete;
+    BuildEnvironment(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir,
+        nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath = "")
+        : engine(isSafe, versionCompatible, DLACore, tempdir, tempfileControls, leanDLLPath)
+    {
+    }
+
+    //! \name Owned TensorRT objects
+    //! Per TensorRT object lifetime requirements as outlined in the developer guide,
+    //! factory objects must remain live while the objects created by those factories
+    //! are live (with the exception of builder -> engine).
+    //! DO NOT ADJUST the declaration order here: builder -> network -> parser.
+    //! Destruction occurs in reverse declaration order: parser -> network -> builder.
+    //!@{
+
+    //! The builder used to build the engine.
+    std::unique_ptr<nvinfer1::IBuilder> builder;
+
+    //! The network used by the builder.
+    std::unique_ptr<nvinfer1::INetworkDefinition> network;
+
+    //! The parser used to specify the network.
+    Parser parser;
+
+    //! The engine.
+    LazilyDeserializedEngine engine;
+    //!@}
+};
 
 //!
 //! \brief Set up network and config
@@ -89,95 +259,63 @@ void dumpRefittable(nvinfer1::ICudaEngine& engine);
 //!
 //! \return Pointer to the engine loaded or nullptr if the operation failed
 //!
-nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err);
+nvinfer1::ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err);
 
 //!
 //! \brief Save an engine into a file
 //!
 //! \return boolean Return true if the engine was successfully saved
 //!
-bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err);
+bool saveEngine(nvinfer1::ICudaEngine const& engine, std::string const& fileName, std::ostream& err);
 
 //!
 //! \brief Create an engine from model or serialized file, and optionally save engine
 //!
 //! \return Pointer to the engine created or nullptr if the creation failed
 //!
-bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys,
-    BuildEnvironment& env, std::ostream& err);
-
-//!
-//! \brief Create an engine from model or serialized file, and optionally save engine
-//!
-//! \return Pointer to the engine created or nullptr if the creation failed
-//!
-inline TrtUniquePtr<nvinfer1::ICudaEngine> getEngine(
-    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err)
-{
-    BuildEnvironment env;
-    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
-    if (getEngineBuildEnv(model, build, sys, env, err))
-    {
-        engine.swap(env.engine);
-    }
-    return engine;
-}
+bool getEngineBuildEnv(
+    ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err);
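The declaration-order rules spelled out in the comments above are easy to get wrong, so here is the same constraint in miniature; this is an illustrative restatement, not code from this patch:

#include <memory>
#include "NvInfer.h"

// Members are destroyed bottom-up, so the engine is destroyed before the runtime
// that created it, which is exactly the order TensorRT requires.
struct OwnedTrtObjects
{
    std::unique_ptr<nvinfer1::IRuntime> runtime;   // declared first, destroyed last
    std::unique_ptr<nvinfer1::ICudaEngine> engine; // declared last, destroyed first
};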
 
 //!
 //! \brief Create a serialized network
 //!
 //! \return Pointer to a host memory for a serialized network
 //!
-nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
-    nvinfer1::INetworkDefinition& network, std::ostream& err);
+nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys,
+    nvinfer1::IBuilder& builder, nvinfer1::INetworkDefinition& network, std::ostream& err);
 
 //!
 //! \brief Transfer a model to a serialized network
 //!
 //! \return Pointer to a host memory for a serialized network
 //!
-nvinfer1::IHostMemory* modelToSerialized(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
+nvinfer1::IHostMemory* modelToSerialized(
+    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
 
 //!
 //! \brief Serialize a network and save it into a file
 //!
 //! \return boolean Return true if the network was successfully serialized and saved
 //!
-bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
+bool serializeAndSave(
+    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
 
 bool timeRefit(const nvinfer1::INetworkDefinition& network, nvinfer1::ICudaEngine& engine, bool multiThreading);
 
 //!
 //! \brief Set tensor scales from a calibration table
 //!
-void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector<IOFormat>& inputFormats,
-    const std::vector<IOFormat>& outputFormats, const std::string& calibrationFile);
+void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector<IOFormat> const& inputFormats,
+    std::vector<IOFormat> const& outputFormats, std::string const& calibrationFile);
 
 //!
 //! \brief Check if the safe runtime is loaded.
 //!
 bool hasSafeRuntime();
 
-//!
-//! \brief Create a safe runtime object if the dynamic library is loaded.
-//!
-nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept;
-
-//!
-//! \brief Check if consistency checker is loaded.
-//!
-bool hasConsistencyChecker();
+bool loadStreamingEngineToBuildEnv(std::string const& engine, BuildEnvironment& env, std::ostream& err);
 
-//!
-//! \brief Create a consistency checker object if the dynamic library is loaded.
-//!
-nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker(
-    nvinfer1::ILogger& logger, nvinfer1::IHostMemory const* engine) noexcept;
-
-//!
-//! \brief Run consistency check on serialized engine.
-//!
-bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize);
+bool loadEngineToBuildEnv(std::string const& engine, BuildEnvironment& env, std::ostream& err);
 
 } // namespace sample
 
 #endif // TRT_SAMPLE_ENGINES_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h b/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h
new file mode 100644
index 00000000..cc8bf1b9
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h
@@ -0,0 +1,101 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_ENTRYPOINTS_H +#define TRT_SAMPLE_ENTRYPOINTS_H + +//! \file sampleEntrypoints.h +//! +//! Declares and conditionally defines entrypoints needed to create base TensorRT objects, depending +//! on whether the given sample uses TRT at link time or dynamically. Since common code is built once +//! and shared across all samples (both link-time and dynamic TRT), it does not define these entrypoints, +//! so each sample must define them individually. +//! +//! Samples that use TRT at link time can define DEFINE_TRT_ENTRYPOINTS before including this header to +//! pick up the definitions here. + +#include "NvInfer.h" +#include "NvOnnxParser.h" +#include "logger.h" + +extern nvinfer1::IBuilder* createBuilder(); +extern nvinfer1::IRuntime* createRuntime(); +extern nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine); + +extern nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network); + +#if !defined(DEFINE_TRT_ENTRYPOINTS) +#define DEFINE_TRT_ENTRYPOINTS 0 +#endif + +// Allow opting out of individual entrypoints that are unused by the sample +#if !defined(DEFINE_TRT_BUILDER_ENTRYPOINT) +#define DEFINE_TRT_BUILDER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_RUNTIME_ENTRYPOINT) +#define DEFINE_TRT_RUNTIME_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_REFITTER_ENTRYPOINT) +#define DEFINE_TRT_REFITTER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_ONNX_PARSER_ENTRYPOINT) +#define DEFINE_TRT_ONNX_PARSER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT) +#define DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT 1 +#endif + +#if DEFINE_TRT_ENTRYPOINTS +nvinfer1::IBuilder* createBuilder() +{ +#if DEFINE_TRT_BUILDER_ENTRYPOINT + return nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRuntime* createRuntime() +{ +#if DEFINE_TRT_RUNTIME_ENTRYPOINT + return nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine) +{ +#if DEFINE_TRT_REFITTER_ENTRYPOINT + return nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network) +{ +#if DEFINE_TRT_ONNX_PARSER_ENTRYPOINT + return nvonnxparser::createParser(network, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +#endif // DEFINE_TRT_ENTRYPOINTS + +#endif // TRT_SAMPLE_ENTRYPOINTS_H diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ b/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ new file mode 100644 index 00000000..ca0098d4 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ @@ -0,0 +1,1622 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <cuda_profiler_api.h>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <numeric>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#if defined(__QNX__)
+#include <sys/neutrino.h>
+#include <sys/syspage.h>
+#endif
+
+#include "NvInfer.h"
+
+#include "ErrorRecorder.h"
+#include "bfloat16.h"
+#include "logger.h"
+#include "sampleDevice.h"
+#include "sampleEngines.h"
+#include "sampleInference.h"
+#include "sampleOptions.h"
+#include "sampleReporting.h"
+#include "sampleUtils.h"
+using namespace nvinfer1;
+namespace sample
+{
+
+template <typename TMapType, typename TEngineType>
+bool validateTensorNames(TMapType const& map, TEngineType const* engine, int32_t const endBindingIndex)
+{
+    // Check if the provided input tensor names match the input tensors of the engine.
+    // Throw an error if the provided input tensor names cannot be found because it implies a potential typo.
+    for (auto const& item : map)
+    {
+        bool tensorNameFound{false};
+        for (int32_t b = 0; b < endBindingIndex; ++b)
+        {
+            auto const tensorName = engine->getIOTensorName(b);
+            auto const tensorIOMode = engine->getTensorIOMode(tensorName);
+            if (tensorIOMode == nvinfer1::TensorIOMode::kINPUT && matchStringWithOneWildcard(item.first, tensorName))
+            {
+                tensorNameFound = true;
+                break;
+            }
+        }
+        if (!tensorNameFound)
+        {
+            sample::gLogError << "Cannot find input tensor with name \"" << item.first << "\" in the engine bindings! "
+                              << "Please make sure the input tensor names are correct." << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+template <typename TEngineType>
+class FillBindingClosure
+{
+private:
+    using InputsMap = std::unordered_map<std::string, std::string>;
+    using BindingsVector = std::vector<std::unique_ptr<Bindings>>;
+
+    TEngineType const* mEngine;
+    nvinfer1::IExecutionContext const* mContext;
+    InputsMap const& inputs;
+    BindingsVector& bindings;
+    int32_t batch;
+    int32_t endBindingIndex;
+    int32_t profileIndex;
+
+    void fillOneBinding(TensorInfo const& tensorInfo)
+    {
+        auto const name = tensorInfo.name;
+        auto const* bindingInOutStr = tensorInfo.isInput ? "Input" : "Output";
+        for (auto& binding : bindings)
+        {
+            auto const input = findPlausible(inputs, name);
+            if (tensorInfo.isInput && input != inputs.end())
+            {
+                sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl;
+                binding->addBinding(tensorInfo, input->second);
+            }
+            else
+            {
+                if (tensorInfo.isInput)
+                {
+                    sample::gLogInfo << "Using random values for input " << name << std::endl;
+                }
+                binding->addBinding(tensorInfo);
+            }
+            if (tensorInfo.isDynamic)
+            {
+                sample::gLogInfo << bindingInOutStr << " binding for " << name
+                                 << " is dynamic and will be created during execution using OutputAllocator."
+                                 << std::endl;
+            }
+            else
+            {
+                sample::gLogInfo << bindingInOutStr << " binding for " << name << " with dimensions " << tensorInfo.dims
+                                 << " is created." << std::endl;
+            }
+        }
+    }
+
+    bool fillAllBindings(int32_t batch, int32_t endBindingIndex)
+    {
+        if (!validateTensorNames(inputs, mEngine, endBindingIndex))
+        {
+            sample::gLogError << "Invalid tensor names found in --loadInputs flag." << std::endl;
+            return false;
+        }
+        for (int32_t b = 0; b < endBindingIndex; b++)
+        {
+            TensorInfo tensorInfo;
+            tensorInfo.bindingIndex = b;
+            getTensorInfo(tensorInfo);
+            tensorInfo.updateVolume(batch);
+            fillOneBinding(tensorInfo);
+        }
+        return true;
+    }
+
+    void getTensorInfo(TensorInfo& tensorInfo);
+
+public:
+    FillBindingClosure(TEngineType const* _engine, nvinfer1::IExecutionContext const* _context,
+        InputsMap const& _inputs, BindingsVector& _bindings, int32_t _batch, int32_t _endBindingIndex,
+        int32_t _profileIndex)
+        : mEngine(_engine)
+        , mContext(_context)
+        , inputs(_inputs)
+        , bindings(_bindings)
+        , batch(_batch)
+        , endBindingIndex(_endBindingIndex)
+        , profileIndex(_profileIndex)
+    {
+    }
+
+    bool operator()()
+    {
+        return fillAllBindings(batch, endBindingIndex);
+    }
+};
+
+template <>
+void FillBindingClosure<nvinfer1::ICudaEngine>::getTensorInfo(TensorInfo& tensorInfo)
+{
+    auto const b = tensorInfo.bindingIndex;
+    auto const name = mEngine->getIOTensorName(b);
+    tensorInfo.name = name;
+    tensorInfo.dims = mContext->getTensorShape(name);
+    tensorInfo.isDynamic = std::any_of(
+        tensorInfo.dims.d, tensorInfo.dims.d + tensorInfo.dims.nbDims, [](int32_t dim) { return dim == -1; });
+    tensorInfo.comps = mEngine->getTensorComponentsPerElement(name, profileIndex);
+    tensorInfo.strides = mContext->getTensorStrides(name);
+    tensorInfo.vectorDimIndex = mEngine->getTensorVectorizedDim(name, profileIndex);
+    tensorInfo.isInput = mEngine->getTensorIOMode(name) == TensorIOMode::kINPUT;
+    tensorInfo.dataType = mEngine->getTensorDataType(name);
+}
+
+namespace
+{
+bool allocateContextMemory(InferenceEnvironment& iEnv, InferenceOptions const& inference)
+{
+    auto* engine = iEnv.engine.get();
+    iEnv.deviceMemory.resize(inference.infStreams);
+    // Delay context memory allocation until input shapes are specified because runtime allocation would require actual
+    // input shapes.
+    for (int32_t i = 0; i < inference.infStreams; ++i)
+    {
+        auto const& ec = iEnv.contexts.at(i);
+        if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kSTATIC)
+        {
+            sample::gLogInfo << "Created execution context with device memory size: "
+                             << (engine->getDeviceMemorySize() / 1.0_MiB) << " MiB" << std::endl;
+        }
+        else
+        {
+            size_t sizeToAlloc{0};
+            const char* allocReason{nullptr};
+            if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kPROFILE)
+            {
+                auto const p = inference.optProfileIndex;
+                sizeToAlloc = engine->getDeviceMemorySizeForProfile(p);
+                allocReason = "current profile";
+            }
+            else if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kRUNTIME)
+            {
+                sizeToAlloc = ec->updateDeviceMemorySizeForShapes();
+                allocReason = "current input shapes";
+            }
+            else
+            {
+                sample::gLogError << "Unrecognizable memory allocation strategy."
<< std::endl; + return false; + } + iEnv.deviceMemory.at(i) = TrtDeviceBuffer(sizeToAlloc); + ec->setDeviceMemoryV2(iEnv.deviceMemory.at(i).get(), iEnv.deviceMemory.at(i).getSize()); + sample::gLogInfo << "Maximum device memory size across all profiles: " + << (engine->getDeviceMemorySizeV2() / 1.0_MiB) << " MiB" << std::endl; + sample::gLogInfo << "Only allocated device memory enough for " << allocReason << ": " + << (sizeToAlloc / 1.0_MiB) << " MiB" << std::endl; + } + } + return true; +} +} // namespace + +bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system) +{ +#if TRT_WINML + int32_t const isIntegrated{}; +#else + int32_t device{}; + cudaCheck(cudaGetDevice(&device)); + + cudaDeviceProp properties; + cudaCheck(cudaGetDeviceProperties(&properties, device)); + int32_t const isIntegrated{properties.integrated}; +#endif + // Use managed memory on integrated devices when transfers are skipped + // and when it is explicitly requested on the commandline. + bool useManagedMemory{(inference.skipTransfers && isIntegrated) || inference.useManaged}; + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + + using FillStdBindings = FillBindingClosure; + + auto* engine = iEnv.engine.get(); + SMP_RETVAL_IF_FALSE(engine != nullptr, "Got invalid engine!", false, sample::gLogError); + + // Release serialized blob to save memory space. + iEnv.engine.releaseBlob(); + + // Setup weight streaming if enabled + if (engine->getStreamableWeightsSize() > 0) + { + auto const& budget = inference.weightStreamingBudget; + int64_t wsBudget = budget.bytes; + if (budget.percent != 100.0) + { + double const percent = budget.percent; + ASSERT(percent < 100.0); + auto const max = engine->getStreamableWeightsSize(); + wsBudget = (max >= 0) ? (percent / 100) * (max) : WeightStreamingBudget::kDISABLE; + } + + if (wsBudget == WeightStreamingBudget::kDISABLE) + { + wsBudget = engine->getStreamableWeightsSize(); + } + else if (wsBudget == WeightStreamingBudget::kAUTOMATIC) + { + wsBudget = engine->getWeightStreamingAutomaticBudget(); + } + ASSERT(wsBudget >= 0); + bool success = engine->setWeightStreamingBudgetV2(wsBudget); + SMP_RETVAL_IF_FALSE(success, "Failed to set weight streaming limit!", false, sample::gLogError); + switch (wsBudget) + { + case WeightStreamingBudget::kDISABLE: + { + sample::gLogInfo << "Weight streaming has been disabled at runtime." << std::endl; + break; + } + + case WeightStreamingBudget::kAUTOMATIC: + { + sample::gLogInfo << "The weight streaming budget will automatically be chosen by TensorRT." << std::endl; + break; + } + default: + { + sample::gLogInfo << "Weight streaming is enabled with a device memory limit of " << wsBudget << " bytes." + << std::endl; + break; + } + } + } + + int32_t const nbOptProfiles = engine->getNbOptimizationProfiles(); + + if (inference.optProfileIndex >= nbOptProfiles) + { + sample::gLogError << "Selected profile index " << inference.optProfileIndex + << " exceeds the number of profiles that the engine holds. " << std::endl; + return false; + } + + if (nbOptProfiles > 1 && !inference.setOptProfile) + { + sample::gLogWarning << nbOptProfiles + << " profiles detected but not set. Running with profile 0. Please use " + "--dumpOptimizationProfile to see all available profiles." 
+ << std::endl; + } + + cudaStream_t setOptProfileStream; + CHECK(cudaStreamCreate(&setOptProfileStream)); + + for (int32_t s = 0; s < inference.infStreams; ++s) + { + IExecutionContext* ec{nullptr}; + if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kSTATIC) + { + // Let TRT pre-allocate and manage the memory. + ec = engine->createExecutionContext(); + } + else + { + // Allocate based on the current profile or runtime shapes. + ec = engine->createExecutionContext(ExecutionContextAllocationStrategy::kUSER_MANAGED); + } + if (ec == nullptr) + { + sample::gLogError << "Unable to create execution context for stream " << s << "." << std::endl; + return false; + } + ec->setNvtxVerbosity(inference.nvtxVerbosity); + +#if !TRT_WINML + int32_t const persistentCacheLimit + = samplesCommon::getMaxPersistentCacheSize() * inference.persistentCacheRatio; + sample::gLogInfo << "Setting persistentCacheLimit to " << persistentCacheLimit << " bytes." << std::endl; + ec->setPersistentCacheLimit(persistentCacheLimit); +#endif + + auto setProfile = ec->setOptimizationProfileAsync(inference.optProfileIndex, setOptProfileStream); + CHECK(cudaStreamSynchronize(setOptProfileStream)); + + if (!setProfile) + { + sample::gLogError << "Set optimization profile failed. " << std::endl; + if (inference.infStreams > 1) + { + sample::gLogError + << "Please ensure that the engine is built with preview feature profileSharing0806 enabled. " + << std::endl; + } + return false; + } + + iEnv.contexts.emplace_back(ec); + iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); + } + + CHECK(cudaStreamDestroy(setOptProfileStream)); + + if (iEnv.profiler) + { + iEnv.contexts.front()->setProfiler(iEnv.profiler.get()); + // Always run reportToProfiler() after enqueue launch + iEnv.contexts.front()->setEnqueueEmitsProfile(false); + } + + int32_t const endBindingIndex = engine->getNbIOTensors(); + + // Make sure that the tensor names provided in command-line args actually exist in any of the engine bindings + // to avoid silent typos. + if (!validateTensorNames(inference.shapes, engine, endBindingIndex)) + { + sample::gLogError << "Invalid tensor names found in --shapes flag." << std::endl; + return false; + } + + for (int32_t b = 0; b < endBindingIndex; ++b) + { + auto const& name = engine->getIOTensorName(b); + auto const& mode = engine->getTensorIOMode(name); + if (mode == TensorIOMode::kINPUT) + { + Dims const dims = iEnv.contexts.front()->getTensorShape(name); + bool isShapeInferenceIO{false}; + isShapeInferenceIO = engine->isShapeInferenceIO(name); + bool const hasRuntimeDim = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + auto const shape = findPlausible(inference.shapes, name); + if (hasRuntimeDim || isShapeInferenceIO) + { + // Set shapeData to either dimensions of the input (if it has a dynamic shape) + // or set to values of the input (if it is an input shape tensor). + std::vector shapeData; + + if (shape == inference.shapes.end()) + { + // No information provided. Use default value for missing data. + constexpr int32_t kDEFAULT_VALUE = 1; + if (isShapeInferenceIO) + { + // Set shape tensor to all ones. + shapeData.assign(volume(dims, 0, dims.nbDims), kDEFAULT_VALUE); + sample::gLogWarning << "Values missing for input shape tensor: " << name + << "Automatically setting values to: " << shapeData << std::endl; + } + else + { + // Use default value for unspecified runtime dimensions. 
+                    shapeData.resize(dims.nbDims);
+                    std::transform(dims.d, dims.d + dims.nbDims, shapeData.begin(),
+                        [&](int32_t dimension) { return dimension >= 0 ? dimension : kDEFAULT_VALUE; });
+                    sample::gLogWarning << "Shape missing for input with dynamic shape: " << name
+                                        << ". Automatically setting shape to: " << shapeData << std::endl;
+                }
+            }
+            else if (inference.inputs.count(shape->first) && isShapeInferenceIO)
+            {
+                // Load shape tensor from file.
+                int64_t const size = volume(dims, 0, dims.nbDims);
+                shapeData.resize(size);
+                auto const& filename = inference.inputs.at(shape->first);
+                auto dst = reinterpret_cast<char*>(shapeData.data());
+                loadFromFile(filename, dst, size * sizeof(decltype(shapeData)::value_type));
+            }
+            else
+            {
+                shapeData = shape->second;
+            }
+
+            int32_t* shapeTensorData{nullptr};
+            if (isShapeInferenceIO)
+            {
+                // Save the data in iEnv, in a way that its address does not change
+                // before enqueueV3 is called.
+                iEnv.inputShapeTensorValues.emplace_back(shapeData);
+                shapeTensorData = iEnv.inputShapeTensorValues.back().data();
+            }
+
+            for (auto& c : iEnv.contexts)
+            {
+                if (isShapeInferenceIO)
+                {
+                    sample::gLogInfo << "Set input shape tensor " << name << " to: " << shapeData << std::endl;
+                    if (!c->setTensorAddress(name, shapeTensorData))
+                    {
+                        return false;
+                    }
+                }
+                else
+                {
+                    sample::gLogInfo << "Set shape of input tensor " << name << " to: " << shapeData
+                                     << std::endl;
+                    if (!c->setInputShape(name, toDims(shapeData)))
+                    {
+                        return false;
+                    }
+                }
+            }
+        }
+        else if (nbOptProfiles && shape != inference.shapes.end())
+        {
+            // Check if the provided shape matches the static dimensions in the engine.
+            for (auto& c : iEnv.contexts)
+            {
+                if (!c->setInputShape(name, toDims(shape->second)))
+                {
+                    sample::gLogError << "The engine was built with static shapes for input tensor " << name
+                                      << " but the provided shapes do not match the static shapes!" << std::endl;
+                    return false;
+                }
+            }
+        }
+    }
+}
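The loop above distinguishes ordinary dynamic-shape inputs from shape-tensor inputs. A condensed sketch of the two calls involved; the tensor names and dimensions are invented for illustration, and `context` is assumed to be a valid nvinfer1::IExecutionContext*:

// Dynamic shapes are set as dimensions; shape tensors are bound as values.
nvinfer1::Dims4 const runtimeDims{1, 3, 640, 640};
if (!context->setInputShape("images", runtimeDims))
{
    // The dimensions fall outside the optimization profile.
}

// The pointed-to values must stay valid until enqueueV3() has been called.
static std::vector<int32_t> sizeValues{640, 640};
if (!context->setTensorAddress("sizes", sizeValues.data()))
{
    // The tensor name is unknown or the address is invalid.
}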
+
+    // Create the debug listener and turn on debug states if the client requested dumping debug tensors.
+    if (!inference.debugTensorFileNames.empty())
+    {
+        iEnv.listener.reset(new DebugTensorWriter(inference.debugTensorFileNames));
+        iEnv.contexts.front()->setDebugListener(iEnv.listener.get());
+        for (auto const& s : inference.debugTensorFileNames)
+        {
+            iEnv.contexts.front()->setTensorDebugState(s.first.c_str(), true);
+        }
+    }
+
+    if (!allocateContextMemory(iEnv, inference))
+    {
+        return false;
+    }
+
+    auto const* context = iEnv.contexts.front().get();
+    return FillStdBindings(
+        engine, context, inference.inputs, iEnv.bindings, 1, endBindingIndex, inference.optProfileIndex)();
+}
+
+TaskInferenceEnvironment::TaskInferenceEnvironment(
+    std::string engineFile, InferenceOptions inference, int32_t deviceId, int32_t DLACore, int32_t bs)
+    : iOptions(inference)
+    , device(deviceId)
+    , batch(bs)
+{
+    BuildEnvironment bEnv(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults());
+    loadEngineToBuildEnv(engineFile, bEnv, sample::gLogError);
+    std::unique_ptr<InferenceEnvironment> tmp(new InferenceEnvironment(bEnv));
+    iEnv = std::move(tmp);
+
+    cudaCheck(cudaSetDevice(device));
+    SystemOptions system{};
+    system.device = device;
+    system.DLACore = DLACore;
+    if (!setUpInference(*iEnv, iOptions, system))
+    {
+        sample::gLogError << "Inference set up failed" << std::endl;
+    }
+}
+namespace
+{
+
+#if defined(__QNX__)
+using TimePoint = double;
+#else
+using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
+#endif
+
+TimePoint getCurrentTime()
+{
+#if defined(__QNX__)
+    uint64_t const currentCycles = ClockCycles();
+    uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec;
+    // Return the current timestamp in ms.
+    return static_cast<double>(currentCycles) * 1000. / cyclesPerSecond;
+#else
+    return std::chrono::high_resolution_clock::now();
+#endif
+}
+
+//!
+//! \struct SyncStruct
+//! \brief Threads synchronization structure
+//!
+struct SyncStruct
+{
+    std::mutex mutex;
+    TrtCudaStream mainStream;
+    TrtCudaEvent gpuStart{cudaEventBlockingSync};
+    TimePoint cpuStart{};
+    float sleep{};
+};
+
+struct Enqueue
+{
+    explicit Enqueue(nvinfer1::IExecutionContext& context)
+        : mContext(context)
+    {
+    }
+
+    nvinfer1::IExecutionContext& mContext;
+};
+
+//!
+//! \class EnqueueExplicit
+//! \brief Functor to enqueue inference with explicit batch
+//!
+class EnqueueExplicit : private Enqueue
+{
+
+public:
+    explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, Bindings const& bindings)
+        : Enqueue(context)
+        , mBindings(bindings)
+    {
+        ASSERT(mBindings.setTensorAddresses(mContext));
+    }
+
+    bool operator()(TrtCudaStream& stream) const
+    {
+        try
+        {
+            bool const result = mContext.enqueueV3(stream.get());
+            // Collect layer timing info from the current profile index of the execution context, except under
+            // capturing mode.
+            if (!isStreamCapturing(stream) && mContext.getProfiler() && !mContext.getEnqueueEmitsProfile()
+                && !mContext.reportToProfiler())
+            {
+                gLogWarning << "Failed to collect layer timing info from previous enqueueV3()" << std::endl;
+            }
+            return result;
+        }
+        catch (const std::exception&)
+        {
+            return false;
+        }
+        return false;
+    }
+
+private:
+    // Helper function to check if a stream is in capturing mode.
+    bool isStreamCapturing(TrtCudaStream& stream) const
+    {
+        cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone};
+        cudaCheck(cudaStreamIsCapturing(stream.get(), &status));
+        return status != cudaStreamCaptureStatusNone;
+    }
+
+    Bindings const& mBindings;
+};
+
+//!
+//! \class EnqueueGraph
+//! \brief Functor to enqueue inference from a CUDA graph
+//!
+class EnqueueGraph
+{
+
+public:
+    explicit EnqueueGraph(nvinfer1::IExecutionContext& context, TrtCudaGraph& graph)
+        : mGraph(graph)
+        , mContext(context)
+    {
+    }
+
+    bool operator()(TrtCudaStream& stream) const
+    {
+        if (mGraph.launch(stream))
+        {
+            // Collecting layer timing info from current profile index of execution context
+            if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler())
+            {
+                gLogWarning << "Failed to collect layer timing info from previous CUDA graph launch" << std::endl;
+            }
+            return true;
+        }
+        return false;
+    }
+
+    TrtCudaGraph& mGraph;
+    nvinfer1::IExecutionContext& mContext;
+};
+
+//!
+//! \class EnqueueGraphSafe
+//! \brief Functor to enqueue inference from a CUDA graph
+//!
+class EnqueueGraphSafe
+{
+
+public:
+    explicit EnqueueGraphSafe(TrtCudaGraph& graph)
+        : mGraph(graph)
+    {
+    }
+
+    bool operator()(TrtCudaStream& stream) const
+    {
+        return mGraph.launch(stream);
+    }
+
+    TrtCudaGraph& mGraph;
+};
+
+using EnqueueFunction = std::function<bool(TrtCudaStream&)>;
+
+enum class StreamType : int32_t
+{
+    kINPUT = 0,
+    kCOMPUTE = 1,
+    kOUTPUT = 2,
+    kNUM = 3
+};
+
+enum class EventType : int32_t
+{
+    kINPUT_S = 0,
+    kINPUT_E = 1,
+    kCOMPUTE_S = 2,
+    kCOMPUTE_E = 3,
+    kOUTPUT_S = 4,
+    kOUTPUT_E = 5,
+    kNUM = 6
+};
+
+using MultiStream = std::array<TrtCudaStream, static_cast<int32_t>(StreamType::kNUM)>;
+
+using MultiEvent = std::array<std::unique_ptr<TrtCudaEvent>, static_cast<int32_t>(EventType::kNUM)>;
+
+using EnqueueTimes = std::array<TimePoint, 2>;
+
+//!
+//! \class Iteration
+//! \brief Inference iteration and streams management
+//!
+class Iteration
+{
+
+public:
+    Iteration(int32_t id, InferenceOptions const& inference, nvinfer1::IExecutionContext& context, Bindings& bindings)
+        : mBindings(bindings)
+        , mStreamId(id)
+        , mDepth(1 + inference.overlap)
+        , mActive(mDepth)
+        , mEvents(mDepth)
+        , mEnqueueTimes(mDepth)
+        , mContext(&context)
+    {
+        for (int32_t d = 0; d < mDepth; ++d)
+        {
+            for (int32_t e = 0; e < static_cast<int32_t>(EventType::kNUM); ++e)
+            {
+                mEvents[d][e].reset(new TrtCudaEvent(!inference.spin));
+            }
+        }
+        createEnqueueFunction(inference, context, bindings);
+    }
+
+    bool query(bool skipTransfers)
+    {
+        if (mActive[mNext])
+        {
+            return true;
+        }
+
+        if (!skipTransfers)
+        {
+            record(EventType::kINPUT_S, StreamType::kINPUT);
+            setInputData(false);
+            record(EventType::kINPUT_E, StreamType::kINPUT);
+            wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute
+        }
+
+        record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE);
+        recordEnqueueTime();
+        if (!mEnqueue(getStream(StreamType::kCOMPUTE)))
+        {
+            return false;
+        }
+        recordEnqueueTime();
+        record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE);
+
+        if (!skipTransfers)
+        {
+            wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA
+            record(EventType::kOUTPUT_S, StreamType::kOUTPUT);
+            fetchOutputData(false);
+            record(EventType::kOUTPUT_E, StreamType::kOUTPUT);
+        }
+
+        mActive[mNext] = true;
+        moveNext();
+        return true;
+    }
+
+    float sync(
+        TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector<InferenceTrace>& trace, bool skipTransfers)
+    {
+        if (mActive[mNext])
+        {
+            if (skipTransfers)
+            {
+                getEvent(EventType::kCOMPUTE_E).synchronize();
+            }
+            else
+            {
+                getEvent(EventType::kOUTPUT_E).synchronize();
+            }
+            trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers));
+            mActive[mNext] = false;
+            return getEvent(EventType::kCOMPUTE_S) - gpuStart;
+        }
+        return 0;
+    }
+
+    void syncAll(
+        TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector<InferenceTrace>& trace, bool skipTransfers)
+    {
+        for (int32_t d = 0; d < mDepth;
++d) + { + sync(cpuStart, gpuStart, trace, skipTransfers); + moveNext(); + } + } + + void wait(TrtCudaEvent& gpuStart) + { + getStream(StreamType::kINPUT).wait(gpuStart); + } + + void setInputData(bool sync) + { + mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kINPUT).synchronize(); + } + } + + void fetchOutputData(bool sync) + { + mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kOUTPUT).synchronize(); + } + } + +private: + void moveNext() + { + mNext = mDepth - 1 - mNext; + } + + TrtCudaStream& getStream(StreamType t) + { + return mStream[static_cast(t)]; + } + + TrtCudaEvent& getEvent(EventType t) + { + return *mEvents[mNext][static_cast(t)]; + } + + void record(EventType e, StreamType s) + { + getEvent(e).record(getStream(s)); + } + + void recordEnqueueTime() + { + mEnqueueTimes[mNext][enqueueStart] = getCurrentTime(); + enqueueStart = 1 - enqueueStart; + } + + TimePoint getEnqueueTime(bool start) + { + return mEnqueueTimes[mNext][start ? 0 : 1]; + } + + void wait(EventType e, StreamType s) + { + getStream(s).wait(getEvent(e)); + } + + InferenceTrace getTrace(TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, bool skipTransfers) + { + float is + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_S) - gpuStart; + float ie + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_E) - gpuStart; + float os + = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_S) - gpuStart; + float oe + = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_E) - gpuStart; + + return InferenceTrace(mStreamId, + std::chrono::duration(getEnqueueTime(true) - cpuStart).count(), + std::chrono::duration(getEnqueueTime(false) - cpuStart).count(), is, ie, + getEvent(EventType::kCOMPUTE_S) - gpuStart, getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe); + } + + void createEnqueueFunction( + InferenceOptions const& inference, nvinfer1::IExecutionContext& context, Bindings& bindings) + { + mEnqueue = EnqueueFunction(EnqueueExplicit(context, mBindings)); + if (inference.graph) + { + sample::gLogInfo << "Capturing CUDA graph for the current execution context" << std::endl; + + TrtCudaStream& stream = getStream(StreamType::kCOMPUTE); + // Avoid capturing initialization calls by executing the enqueue function at least + // once before starting CUDA graph capture. + auto const ret = mEnqueue(stream); + if (!ret) + { + throw std::runtime_error("Inference enqueue failed."); + } + stream.synchronize(); + + mGraph.beginCapture(stream); + // The built TRT engine may contain operations that are not permitted under CUDA graph capture mode. + // When the stream is capturing, the enqueue call may return false if the current CUDA graph capture fails. + if (mEnqueue(stream)) + { + mGraph.endCapture(stream); + mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph)); + sample::gLogInfo << "Successfully captured CUDA graph for the current execution context" << std::endl; + } + else + { + mGraph.endCaptureOnError(stream); + // Ensure any CUDA error has been cleaned up. + cudaCheck(cudaGetLastError()); + sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under " + "CUDA graph capture mode." 
+ << std::endl; + sample::gLogWarning << "The specified --useCudaGraph flag has been ignored. The inference will be " + "launched without using CUDA graph launch." + << std::endl; + } + } + } + + Bindings& mBindings; + + TrtCudaGraph mGraph; + EnqueueFunction mEnqueue; + + int32_t mStreamId{0}; + int32_t mNext{0}; + int32_t mDepth{2}; // default to double buffer to hide DMA transfers + + std::vector mActive; + MultiStream mStream; + std::vector mEvents; + + int32_t enqueueStart{0}; + std::vector mEnqueueTimes; + nvinfer1::IExecutionContext* mContext{nullptr}; +}; + +bool inferenceLoop(std::vector>& iStreams, TimePoint const& cpuStart, + TrtCudaEvent const& gpuStart, int iterations, float maxDurationMs, float warmupMs, + std::vector& trace, bool skipTransfers, float idleMs) +{ + float durationMs = 0; + int32_t skip = 0; + + if (maxDurationMs == -1.F) + { + sample::gLogWarning << "--duration=-1 is specified, inference will run in an endless loop until" + << " aborted with CTRL-C (SIGINT)" << std::endl; + while (true) + { + for (auto& s : iStreams) + { + if (!s->query(skipTransfers)) + { + return false; + } + } + for (auto& s : iStreams) + { + s->sync(cpuStart, gpuStart, trace, skipTransfers); + } + } + } + + for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs; ++i) + { + for (auto& s : iStreams) + { + if (!s->query(skipTransfers)) + { + return false; + } + } + for (auto& s : iStreams) + { + durationMs = std::max(durationMs, s->sync(cpuStart, gpuStart, trace, skipTransfers)); + } + if (durationMs < warmupMs) // Warming up + { + if (durationMs) // Skip complete iterations + { + ++skip; + } + continue; + } + if (idleMs != 0.F) + { + std::this_thread::sleep_for(std::chrono::duration(idleMs)); + } + } + for (auto& s : iStreams) + { + s->syncAll(cpuStart, gpuStart, trace, skipTransfers); + } + return true; +} + +void inferenceExecution(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t const threadIdx, int32_t const streamsPerThread, int32_t device, + std::vector& trace) noexcept +{ + try + { + float warmupMs = inference.warmup; + float durationMs = -1.F; + if (inference.duration != -1.F) + { + durationMs = inference.duration * 1000.F + warmupMs; + } + + cudaCheck(cudaSetDevice(device)); + + std::vector> iStreams; + + for (int32_t s = 0; s < streamsPerThread; ++s) + { + int32_t const streamId{threadIdx * streamsPerThread + s}; + auto* iteration = new Iteration(streamId, inference, *iEnv.getContext(streamId), *iEnv.bindings[streamId]); + if (inference.skipTransfers) + { + iteration->setInputData(true); + } + iStreams.emplace_back(iteration); + } + + for (auto& s : iStreams) + { + s->wait(sync.gpuStart); + } + + std::vector localTrace; + if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs, + localTrace, inference.skipTransfers, inference.idle)) + { + sync.mutex.lock(); + iEnv.error = true; + sync.mutex.unlock(); + } + + if (inference.skipTransfers) + { + for (auto& s : iStreams) + { + s->fetchOutputData(true); + } + } + + sync.mutex.lock(); + trace.insert(trace.end(), localTrace.begin(), localTrace.end()); + sync.mutex.unlock(); + } + catch (...) 
+ { + sync.mutex.lock(); + iEnv.error = true; + sync.mutex.unlock(); + } +} + +inline std::thread makeThread(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t threadIdx, int32_t streamsPerThread, int32_t device, std::vector& trace) +{ + return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), std::ref(sync), threadIdx, + streamsPerThread, device, std::ref(trace)); +} + +} // namespace + +bool runInference( + InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace) +{ + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + cudaCheck(cudaProfilerStart()); + + trace.resize(0); + + SyncStruct sync; + sync.sleep = inference.sleep; + sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + // When multiple streams are used, trtexec can run inference in two modes: + // (1) if inference.threads is true, then run each stream on each thread. + // (2) if inference.threads is false, then run all streams on the same thread. + int32_t const numThreads = inference.threads ? inference.infStreams : 1; + int32_t const streamsPerThread = inference.threads ? 1 : inference.infStreams; + + std::vector threads; + for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx) + { + threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx, streamsPerThread, device, trace)); + } + for (auto& th : threads) + { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; + std::sort(trace.begin(), trace.end(), cmpTrace); + + return !iEnv.error; +} + +bool runMultiTasksInference(std::vector>& tEnvList) +{ + cudaCheck(cudaProfilerStart()); + cudaSetDeviceFlags(cudaDeviceScheduleSpin); + + SyncStruct sync; + sync.sleep = 0; + sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + std::vector threads; + for (size_t i = 0; i < tEnvList.size(); ++i) + { + auto& tEnv = tEnvList[i]; + threads.emplace_back(makeThread( + tEnv->iOptions, *(tEnv->iEnv), sync, /*threadIdx*/ 0, /*streamsPerThread*/ 1, tEnv->device, tEnv->trace)); + } + for (auto& th : threads) + { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; + for (auto& tEnv : tEnvList) + { + std::sort(tEnv->trace.begin(), tEnv->trace.end(), cmpTrace); + } + + return std::none_of(tEnvList.begin(), tEnvList.end(), + [](std::unique_ptr& tEnv) { return tEnv->iEnv->error; }); +} + +namespace +{ +size_t reportGpuMemory() +{ + static size_t prevFree{0}; + size_t free{0}; + size_t total{0}; + size_t newlyAllocated{0}; + cudaCheck(cudaMemGetInfo(&free, &total)); + sample::gLogInfo << "Free GPU memory = " << free / 1024.0_MiB << " GiB"; + if (prevFree != 0) + { + newlyAllocated = (prevFree - free); + sample::gLogInfo << ", newly allocated GPU memory = " << newlyAllocated / 1024.0_MiB << " GiB"; + } + sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB" << std::endl; + prevFree = free; + return newlyAllocated; +} +} // namespace + +//! Returns true if deserialization is slower than expected or fails. 
+//! Returns true if deserialization is slower than expected or fails.
+bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys)
+{
+    constexpr int32_t kNB_ITERS{20};
+    std::unique_ptr<nvinfer1::IRuntime> rt{createRuntime()};
+    std::unique_ptr<nvinfer1::ICudaEngine> engine;
+
+    SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError);
+
+    auto timeDeserializeFn = [&]() -> float {
+        bool deserializeOK{false};
+        engine.reset(nullptr);
+        auto startClock = std::chrono::high_resolution_clock::now();
+        SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError);
+
+        auto& reader = iEnv.engine.getFileReader();
+        reader.reset();
+        ASSERT(reader.isOpen());
+#if !TRT_WINML
+        for (auto const& pluginPath : sys.dynamicPlugins)
+        {
+            rt->getPluginRegistry().loadLibrary(pluginPath.c_str());
+        }
+#endif
+        engine.reset(rt->deserializeCudaEngine(reader));
+        deserializeOK = (engine != nullptr);
+        auto endClock = std::chrono::high_resolution_clock::now();
+        // return NAN if deserialization failed.
+        return deserializeOK ? std::chrono::duration<float, std::milli>(endClock - startClock).count() : NAN;
+    };
+
+    // Warmup the caches to make sure that cache thrashing isn't throwing off the results
+    {
+        sample::gLogInfo << "Begin deserialization warmup..." << std::endl;
+        for (int32_t i = 0, e = 2; i < e; ++i)
+        {
+            timeDeserializeFn();
+        }
+    }
+    sample::gLogInfo << "Begin deserialization engine timing..." << std::endl;
+    float const first = timeDeserializeFn();
+
+    // Check if first deserialization succeeded.
+    if (std::isnan(first))
+    {
+        sample::gLogError << "Engine deserialization failed." << std::endl;
+        return true;
+    }
+
+    sample::gLogInfo << "First deserialization time = " << first << " milliseconds" << std::endl;
+
+    // Record initial gpu memory state.
+    reportGpuMemory();
+
+    float totalTime{0.F};
+    for (int32_t i = 0; i < kNB_ITERS; ++i)
+    {
+        totalTime += timeDeserializeFn();
+    }
+    auto const averageTime = totalTime / kNB_ITERS;
+    // reportGpuMemory sometimes reports zero after a single deserialization of a small engine,
+    // so use the size of memory for all the iterations.
+    auto const totalEngineSizeGpu = reportGpuMemory();
+    sample::gLogInfo << "Total deserialization time = " << totalTime << " milliseconds in " << kNB_ITERS
+                     << " iterations, average time = " << averageTime << " milliseconds, first time = " << first
+                     << " milliseconds." << std::endl;
+    sample::gLogInfo << "Deserialization Bandwidth = " << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s" << std::endl;
+
+    // If the first deserialization is more than tolerance slower than
+    // the average deserialization, return true, which means an error occurred.
+    // The tolerance is set to 2x since the deserialization time is quick and susceptible
+    // to caching issues causing problems in the first timing.
+    auto const tolerance = 2.0F;
+    bool const isSlowerThanExpected = first > averageTime * tolerance;
+    if (isSlowerThanExpected)
+    {
+        sample::gLogInfo << "First deserialization time divided by average time is " << (first / averageTime)
+                         << ". Exceeds tolerance of " << tolerance << "x." << std::endl;
+    }
+    return isSlowerThanExpected;
+}
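The slow-first-deserialization heuristic above boils down to a single ratio test. A compilable distillation with hypothetical numbers (the 2x tolerance mirrors the code; nothing here is part of the patch):

    // Illustration of the first-vs-average tolerance check in timeDeserialize.
    #include <cstdio>

    bool firstRunSuspicious(float firstMs, float totalMs, int iters, float tolerance = 2.0F)
    {
        float const averageMs = totalMs / iters;
        return firstMs > averageMs * tolerance;
    }

    int main()
    {
        // 20 steady-state runs totalling 400 ms -> 20 ms average; 120 ms first run exceeds 2x.
        std::printf("suspicious: %d\n", firstRunSuspicious(120.F, 400.F, 20));
        return 0;
    }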
+
+std::string getLayerInformation(
+    nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format)
+{
+    auto runtime = std::unique_ptr<nvinfer1::IRuntime>{createRuntime()};
+    auto inspector = std::unique_ptr<nvinfer1::IEngineInspector>(engine->createEngineInspector());
+    if (context != nullptr)
+    {
+        inspector->setExecutionContext(context);
+    }
+    std::string result = inspector->getEngineInformation(format);
+    return result;
+}
+
+void Binding::fill(std::string const& fileName)
+{
+    loadFromFile(fileName, static_cast<char*>(buffer->getHostBuffer()), buffer->getSize());
+}
+
+void Binding::fill()
+{
+    switch (dataType)
+    {
+    case nvinfer1::DataType::kBOOL:
+    {
+        fillBuffer<bool>(buffer->getHostBuffer(), volume, 0, 1);
+        break;
+    }
+    case nvinfer1::DataType::kINT32:
+    {
+        fillBuffer<int32_t>(buffer->getHostBuffer(), volume, -128, 127);
+        break;
+    }
+    case nvinfer1::DataType::kINT64:
+    {
+        fillBuffer<int64_t>(buffer->getHostBuffer(), volume, -128, 127);
+        break;
+    }
+    case nvinfer1::DataType::kINT8:
+    {
+        fillBuffer<int8_t>(buffer->getHostBuffer(), volume, -128, 127);
+        break;
+    }
+    case nvinfer1::DataType::kFLOAT:
+    {
+        fillBuffer<float>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+        break;
+    }
+    case nvinfer1::DataType::kHALF:
+    {
+        fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+        break;
+    }
+    case nvinfer1::DataType::kBF16:
+    {
+        fillBuffer<BFloat16>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+        break;
+    }
+    case nvinfer1::DataType::kUINT8:
+    {
+        fillBuffer<uint8_t>(buffer->getHostBuffer(), volume, 0, 255);
+        break;
+    }
+    case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported");
+    case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported");
+    }
+}
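The `fillBuffer` template used above lives in sampleUtils.h and is not shown in this patch. The sketch below is only an illustration of how such a type-dispatched random fill can be written under that assumption, not the sample's actual implementation:

    // Minimal sketch of a typed random fill, assuming a host buffer of
    // `volume` elements of type T (names are illustrative only).
    #include <cstdint>
    #include <random>
    #include <type_traits>
    #include <vector>

    template <typename T>
    void fillRandom(void* buffer, int64_t volume, T min, T max)
    {
        auto* typed = static_cast<T*>(buffer);
        std::default_random_engine engine(42); // fixed seed for reproducible inputs
        if constexpr (std::is_integral_v<T>)
        {
            std::uniform_int_distribution<int64_t> dist(min, max);
            for (int64_t i = 0; i < volume; ++i) typed[i] = static_cast<T>(dist(engine));
        }
        else
        {
            std::uniform_real_distribution<float> dist(min, max);
            for (int64_t i = 0; i < volume; ++i) typed[i] = static_cast<T>(dist(engine));
        }
    }

    int main()
    {
        std::vector<float> host(16);
        fillRandom<float>(host.data(), host.size(), -1.0F, 1.0F);
        return 0;
    }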
+
+void Binding::dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, int32_t spv,
+    std::string const separator /*= " "*/) const
+{
+    void* outputBuffer{};
+    if (outputAllocator != nullptr)
+    {
+        outputBuffer = outputAllocator->getBuffer()->getHostBuffer();
+        // Overwrite dimensions with those reported by the output allocator.
+        dims = outputAllocator->getFinalDims();
+        os << "Final shape is " << dims << " reported by the output allocator." << std::endl;
+    }
+    else
+    {
+        outputBuffer = buffer->getHostBuffer();
+    }
+    switch (dataType)
+    {
+    case nvinfer1::DataType::kBOOL:
+    {
+        dumpBuffer<bool>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kINT32:
+    {
+        dumpBuffer<int32_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kINT8:
+    {
+        dumpBuffer<int8_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kFLOAT:
+    {
+        dumpBuffer<float>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kHALF:
+    {
+        dumpBuffer<__half>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kBF16:
+    {
+        dumpBuffer<BFloat16>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kUINT8:
+    {
+        dumpBuffer<uint8_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kINT64:
+    {
+        dumpBuffer<int64_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
+        break;
+    }
+    case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported");
+    case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported");
+    }
+}
+
+void Bindings::addBinding(TensorInfo const& tensorInfo, std::string const& fileName /*= ""*/)
+{
+    auto const b = tensorInfo.bindingIndex;
+    while (mBindings.size() <= static_cast<size_t>(b))
+    {
+        mBindings.emplace_back();
+        mDevicePointers.emplace_back();
+    }
+    mNames[tensorInfo.name] = b;
+    mBindings[b].isInput = tensorInfo.isInput;
+    mBindings[b].volume = tensorInfo.vol;
+    mBindings[b].dataType = tensorInfo.dataType;
+    if (tensorInfo.isDynamic)
+    {
+        ASSERT(!tensorInfo.isInput); // Only output shape can be possibly unknown because of DDS.
+        if (mBindings[b].outputAllocator == nullptr)
+        {
+            if (mUseManaged)
+            {
+                mBindings[b].outputAllocator.reset(new OutputAllocator(new UnifiedMirroredBuffer));
+            }
+            else
+            {
+                mBindings[b].outputAllocator.reset(new OutputAllocator(new DiscreteMirroredBuffer));
+            }
+        }
+    }
+    else
+    {
+        if (mBindings[b].buffer == nullptr)
+        {
+            if (mUseManaged)
+            {
+                mBindings[b].buffer.reset(new UnifiedMirroredBuffer);
+            }
+            else
+            {
+                mBindings[b].buffer.reset(new DiscreteMirroredBuffer);
+            }
+        }
+        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
+        // even for empty tensors, so allocate a dummy byte.
+        if (tensorInfo.vol == 0)
+        {
+            mBindings[b].buffer->allocate(1);
+        }
+        else
+        {
+            mBindings[b].buffer->allocate(
+                static_cast<size_t>(tensorInfo.vol) * static_cast<size_t>(dataTypeSize(tensorInfo.dataType)));
+        }
+        mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer();
+    }
+    if (tensorInfo.isInput)
+    {
+        if (fileName.empty())
+        {
+            fill(b);
+        }
+        else
+        {
+            fill(b, fileName);
+        }
+    }
+}
+
+void** Bindings::getDeviceBuffers()
+{
+    return mDevicePointers.data();
+}
+
+void Bindings::transferInputToDevice(TrtCudaStream& stream)
+{
+    for (auto& b : mNames)
+    {
+        if (mBindings[b.second].isInput)
+        {
+            mBindings[b.second].buffer->hostToDevice(stream);
+        }
+    }
+}
+
+void Bindings::transferOutputToHost(TrtCudaStream& stream)
+{
+    for (auto& b : mNames)
+    {
+        if (!mBindings[b.second].isInput)
+        {
+            if (mBindings[b.second].outputAllocator != nullptr)
+            {
+                mBindings[b.second].outputAllocator->getBuffer()->deviceToHost(stream);
+            }
+            else
+            {
+                mBindings[b.second].buffer->deviceToHost(stream);
+            }
+        }
+    }
+}
+
+void Bindings::dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os,
+    std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const
+{
+    auto const tensorName = context.getEngine().getIOTensorName(binding);
+    Dims dims = context.getTensorShape(tensorName);
+    Dims strides = context.getTensorStrides(tensorName);
+    int32_t vectorDim = context.getEngine().getTensorVectorizedDim(tensorName);
+    int32_t const spv = context.getEngine().getTensorComponentsPerElement(tensorName);
+
+    mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator);
+}
+
+namespace
+{
+
+std::string genFilenameSafeString(std::string const& s)
+{
+    std::string res = s;
+    static std::string const allowedSpecialChars{"._-,"};
+    for (auto& c : res)
+    {
+        if (!isalnum(c) && allowedSpecialChars.find(c) == std::string::npos)
+        {
+            c = '_';
+        }
+    }
+    return res;
+}
+
+Dims getBindingDimensions(nvinfer1::IExecutionContext const& context, std::string const& name)
+{
+    return context.getTensorShape(name.c_str());
+}
+} // namespace
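The `IMirroredBuffer` implementations (`DiscreteMirroredBuffer`, `UnifiedMirroredBuffer`) used by addBinding and the transfer helpers above are defined elsewhere in the patch (common/buffers.h and sampleUtils). As a rough illustration of the pattern only, with hypothetical names and error handling elided:

    // Sketch of the host/device mirrored-buffer pattern: one host copy, one
    // device copy, with async copies enqueued on the caller's stream.
    #include <cuda_runtime_api.h>
    #include <cstdlib>

    struct MirroredBufferSketch
    {
        void* host{nullptr};
        void* device{nullptr};
        size_t bytes{0};

        bool allocate(size_t n)
        {
            bytes = n;
            host = std::malloc(n); // real code would prefer pinned memory for true async copies
            return host != nullptr && cudaMalloc(&device, n) == cudaSuccess;
        }
        void hostToDevice(cudaStream_t stream)
        {
            cudaMemcpyAsync(device, host, bytes, cudaMemcpyHostToDevice, stream);
        }
        void deviceToHost(cudaStream_t stream)
        {
            cudaMemcpyAsync(host, device, bytes, cudaMemcpyDeviceToHost, stream);
        }
    };

    int main()
    {
        MirroredBufferSketch buf;
        if (!buf.allocate(4096)) return 1;
        buf.hostToDevice(nullptr); // default stream
        buf.deviceToHost(nullptr);
        cudaStreamSynchronize(nullptr); // wait for both copies
        cudaFree(buf.device);
        std::free(buf.host);
        return 0;
    }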
+
+void Bindings::dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const
+{
+    os << "Dumping I/O Bindings to RAW Files:" << std::endl;
+    for (auto const& n : mNames)
+    {
+        auto name = n.first;
+        auto bIndex = n.second;
+        auto const& binding = mBindings[bIndex];
+        void* outputBuffer{};
+        if (binding.outputAllocator != nullptr)
+        {
+            outputBuffer = binding.outputAllocator->getBuffer()->getHostBuffer();
+        }
+        else
+        {
+            outputBuffer = binding.buffer->getHostBuffer();
+        }
+
+        Dims dims = getBindingDimensions(context, name);
+        std::string dimsStr;
+        std::string dotStr;
+
+        for (int32_t i = 0; i < dims.nbDims; i++)
+        {
+            dimsStr += dotStr + std::to_string(dims.d[i]);
+            dotStr = ".";
+        }
+
+        std::string const bindingTypeStr = (binding.isInput ? "input" : "output");
+
+        std::stringstream fileName;
+        fileName << genFilenameSafeString(name) << "." << bindingTypeStr << "." << dimsStr << "." << binding.dataType
+                 << ".raw";
+
+        os << "Writing file for " << bindingTypeStr << " binding " << name << " (with datatype " << binding.dataType
+           << " and dimensions " << dimsStr << ") to " << fileName.str() << std::endl;
+
+        std::ofstream f(fileName.str(), std::ios::out | std::ios::binary);
+        ASSERT(f && "Cannot open file for write");
+        f.write(static_cast<char const*>(outputBuffer), binding.volume * samplesCommon::elementSize(binding.dataType));
+        f.close();
+    }
+}
+
+void Bindings::dumpBindingDimensions(
+    std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const
+{
+    auto const dims = context.getTensorShape(name.c_str());
+    // Do not add a newline terminator, because the caller may be outputting a JSON string.
+    os << dims;
+}
+
+std::unordered_map<std::string, int32_t> Bindings::getBindings(std::function<bool(Binding const&)> predicate) const
+{
+    std::unordered_map<std::string, int32_t> bindings;
+    for (auto const& n : mNames)
+    {
+        auto const binding = n.second;
+        if (predicate(mBindings[binding]))
+        {
+            bindings.insert(n);
+        }
+    }
+    return bindings;
+}
+
+bool Bindings::setTensorAddresses(nvinfer1::IExecutionContext& context) const
+{
+    for (auto const& b : mNames)
+    {
+        auto const name = b.first.c_str();
+        auto const location = context.getEngine().getTensorLocation(name);
+        if (location == TensorLocation::kDEVICE)
+        {
+            if (mBindings[b.second].outputAllocator != nullptr)
+            {
+                if (!context.setOutputAllocator(name, mBindings[b.second].outputAllocator.get()))
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                if (!context.setTensorAddress(name, mDevicePointers[b.second]))
+                {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+bool DebugTensorWriter::processDebugTensor(void const* addr, nvinfer1::TensorLocation location, nvinfer1::DataType type,
+    nvinfer1::Dims const& shape, char const* name, cudaStream_t stream)
+{
+    CHECK(cudaStreamSynchronize(stream));
+    // Store data from callback.
+    int64_t size = std::accumulate(shape.d, shape.d + shape.nbDims, 1LL, std::multiplies<int64_t>{})
+        * samplesCommon::elementSize(type);
+    std::vector<char> hostDataOut(size, 0);
+    CHECK(cudaMemcpy(hostDataOut.data(), addr, size, cudaMemcpyDeviceToHost));
+
+    auto it = mDebugTensorFileNames.find(name);
+    ASSERT(it != mDebugTensorFileNames.end());
+    std::string fileName = it->second;
+
+    std::ofstream f(fileName, std::ios::out | std::ios::binary);
+    ASSERT(f && "Cannot open file for write");
+    sample::gLogInfo << "Writing to file " << fileName << " for debug tensor " << name << std::endl;
+    f.write(hostDataOut.data(), size);
+    f.close();
+
+    CHECK(cudaStreamSynchronize(stream));
+    return true;
+}
+
+} // namespace sample
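Hedged usage sketch, not part of the patch: how a DebugTensorWriter like the one above would be attached to an execution context. This assumes a TensorRT 10 engine built with debug tensors marked; the method names (setDebugListener, setTensorDebugState) follow the TensorRT 10 debug-tensor API as I understand it, so verify them against NvInferRuntime.h before relying on this:

    // Assumed wiring for the debug-tensor callback (names to be verified).
    #include <string>
    #include <unordered_map>

    void attachDebugWriter(nvinfer1::IExecutionContext& context)
    {
        std::unordered_map<std::string, std::string> files{{"conv1_out", "conv1_out.raw"}};
        static sample::DebugTensorWriter writer(files); // must outlive the context
        context.setDebugListener(&writer);              // invoked once per marked tensor
        context.setTensorDebugState("conv1_out", true); // enable this debug tensor
    }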
diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.h b/src/Detector/tensorrt_yolo/common/sampleInference.h
index 1c21f592..d9ebed92 100644
--- a/src/Detector/tensorrt_yolo/common/sampleInference.h
+++ b/src/Detector/tensorrt_yolo/common/sampleInference.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,76 +18,243 @@
 #ifndef TRT_SAMPLE_INFERENCE_H
 #define TRT_SAMPLE_INFERENCE_H
 
+#include "sampleDevice.h"
+#include "sampleEngines.h"
 #include "sampleReporting.h"
 #include "sampleUtils.h"
 
+#include <functional>
 #include <iostream>
+#include <list>
 #include <memory>
 #include <string>
 #include <vector>
 
-#include "NvInfer.h"
+namespace sample
+{
 
-#if (NV_TENSORRT_MAJOR > 7)
+// IDebugListener class for writing debug tensors to output file.
+class DebugTensorWriter : public nvinfer1::IDebugListener
+{
+public:
+    DebugTensorWriter(std::unordered_map<std::string, std::string> fileNames)
+        : mDebugTensorFileNames(fileNames)
+    {
+    }
 
-#include "NvInferSafeRuntime.h"
+    bool processDebugTensor(void const* addr, nvinfer1::TensorLocation location, nvinfer1::DataType type,
+        nvinfer1::Dims const& shape, char const* name, cudaStream_t stream) override;
 
-namespace sample
-{
+private:
+    std::unordered_map<std::string, std::string> mDebugTensorFileNames;
+};
 
 struct InferenceEnvironment
 {
-    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    InferenceEnvironment() = delete;
+    InferenceEnvironment(InferenceEnvironment const& other) = delete;
+    InferenceEnvironment(InferenceEnvironment&& other) = delete;
+    InferenceEnvironment(BuildEnvironment& bEnv) : engine(std::move(bEnv.engine)), safe(bEnv.engine.isSafe())
+    {
+    }
+
+    LazilyDeserializedEngine engine;
     std::unique_ptr<Profiler> profiler;
-    std::vector<TrtUniquePtr<nvinfer1::IExecutionContext>> context;
+    std::vector<std::unique_ptr<nvinfer1::IExecutionContext>> contexts;
+    std::vector<TrtDeviceBuffer>
+        deviceMemory; //< Device memory used for inference when the allocation strategy is not static.
     std::vector<std::unique_ptr<Bindings>> bindings;
+    std::unique_ptr<DebugTensorWriter> listener;
     bool error{false};
 
-    std::vector<uint8_t> engineBlob;
-
     bool safe{false};
-    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
-    std::vector<std::unique_ptr<nvinfer1::safe::IExecutionContext>> safeContext;
 
-    template <class ContextType>
-    inline ContextType* getContext(int32_t streamIdx);
+    inline nvinfer1::IExecutionContext* getContext(int32_t streamIdx);
+
+    //! Storage for input shape tensors.
+    //!
+    //! It's important that the addresses of the data do not change between the calls to
+    //! setTensorAddress/setInputShape (which tells TensorRT where the input shape tensor is)
+    //! and enqueueV3 (when TensorRT might use the input shape tensor).
+    //!
+    //! The input shape tensors could alternatively be handled via member bindings,
+    //! but it simplifies control-flow to store the data here since it's shared across
+    //! the bindings.
+    std::list<std::vector<int64_t>> inputShapeTensorValues;
 };
 
-template <>
 inline nvinfer1::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
 {
-    return context[streamIdx].get();
-}
-
-template <>
-inline nvinfer1::safe::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
-{
-    return safeContext[streamIdx].get();
+    return contexts[streamIdx].get();
 }
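As the comment above notes, inputShapeTensorValues is deliberately a std::list rather than a std::vector: appending to a vector can reallocate and invalidate references to its existing elements, while list elements never move. A minimal standalone demonstration of that guarantee (illustration only):

    // Why std::list is safe to hand pointers out of while it keeps growing.
    #include <cstdint>
    #include <list>
    #include <vector>

    int main()
    {
        std::list<std::vector<int64_t>> shapes;
        shapes.push_back({1, 3, 224, 224});
        std::vector<int64_t> const& first = shapes.front(); // address handed to TensorRT
        for (int i = 0; i < 1000; ++i)
        {
            shapes.push_back({1}); // never moves or copies the existing elements
        }
        // Always 0 for std::list; a growing outer std::vector gives no such
        // guarantee for references to its elements.
        return &first == &shapes.front() ? 0 : 1;
    }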
 
 //!
 //! \brief Set up contexts and bindings for inference
 //!
-bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference);
+bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system);
 
 //!
 //! \brief Deserialize the engine and time how long it takes.
 //!
-bool timeDeserialize(InferenceEnvironment& iEnv);
+bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys);
 
 //!
 //! \brief Run inference and collect timing, return false if any error hit during inference
 //!
 bool runInference(
-    const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector<InferenceTrace>& trace);
+    InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector<InferenceTrace>& trace);
 
 //!
 //! \brief Get layer information of the engine.
 //!
-std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format);
+std::string getLayerInformation(
+    nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format);
 
-} // namespace sample
+struct Binding
+{
+    bool isInput{false};
+    std::unique_ptr<IMirroredBuffer> buffer;
+    std::unique_ptr<OutputAllocator> outputAllocator;
+    int64_t volume{0};
+    nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT};
+
+    void fill(std::string const& fileName);
+
+    void fill();
+
+    void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv,
+        std::string const separator = " ") const;
+};
+
+struct TensorInfo
+{
+    int32_t bindingIndex{-1};
+    char const* name{nullptr};
+    nvinfer1::Dims dims{};
+    bool isDynamic{};
+    int32_t comps{-1};
+    nvinfer1::Dims strides{};
+    int32_t vectorDimIndex{-1};
+    bool isInput{};
+    nvinfer1::DataType dataType{};
+    int64_t vol{-1};
+
+    void updateVolume(int32_t batch)
+    {
+        vol = volume(dims, strides, vectorDimIndex, comps, batch);
+    }
+};
+
+class Bindings
+{
+public:
+    Bindings() = delete;
+    explicit Bindings(bool useManaged)
+        : mUseManaged(useManaged)
+    {
+    }
+
+    void addBinding(TensorInfo const& tensorInfo, std::string const& fileName = "");
 
-#endif
+    void** getDeviceBuffers();
+
+    void transferInputToDevice(TrtCudaStream& stream);
+
+    void transferOutputToHost(TrtCudaStream& stream);
+
+    void fill(int binding, std::string const& fileName)
+    {
+        mBindings[binding].fill(fileName);
+    }
+
+    void fill(int binding)
+    {
+        mBindings[binding].fill();
+    }
+
+    void dumpBindingDimensions(
+        std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const;
+
+    void dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os,
+        std::string const& separator = " ", int32_t batch = 1) const;
+
+    void dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const;
+
+    void dumpInputs(nvinfer1::IExecutionContext const& context, std::ostream& os) const
+    {
+        auto isInput = [](Binding const& b) { return b.isInput; };
+        dumpBindings(context, isInput, os);
+    }
+
+    void dumpOutputs(nvinfer1::IExecutionContext const& context, std::ostream& os) const;
+
+    void dumpBindings(nvinfer1::IExecutionContext const& context, std::ostream& os) const
+    {
+        auto all = [](Binding const& b) { return true; };
+        dumpBindings(context, all, os);
+    }
+
+    void dumpBindings(nvinfer1::IExecutionContext const& context, std::function<bool(Binding const&)> predicate,
+        std::ostream& os) const
+    {
+        for (auto const& n : mNames)
+        {
+            auto const name = n.first;
+            auto const binding = n.second;
+            if (predicate(mBindings[binding]))
+            {
+                os << n.first << ": (";
+                dumpBindingDimensions(name, context, os);
+                os << ")" << std::endl;
+
+                dumpBindingValues(context, binding, os);
+                os << std::endl;
+            }
+        }
+    }
+
+    std::unordered_map<std::string, int32_t> getInputBindings() const
+    {
+        auto isInput = [](Binding const& b) { return b.isInput; };
+        return getBindings(isInput);
+    }
+
+    std::unordered_map<std::string, int32_t> getOutputBindings() const
+    {
+        auto isOutput = [](Binding const& b) { return !b.isInput; };
+        return getBindings(isOutput);
+    }
+    std::unordered_map<std::string, int32_t> getBindings() const
+    {
+        auto all = [](Binding const& b) { return true; };
+        return getBindings(all);
+    }
+
+    std::unordered_map<std::string, int32_t> getBindings(std::function<bool(Binding const&)> predicate) const;
+
+    bool setTensorAddresses(nvinfer1::IExecutionContext& context) const;
+
+private:
+    std::unordered_map<std::string, int32_t> mNames;
+    std::vector<Binding> mBindings;
+    std::vector<void*> mDevicePointers;
+    bool mUseManaged{false};
+};
+
+struct TaskInferenceEnvironment
+{
+    TaskInferenceEnvironment(std::string engineFile, InferenceOptions inference, int32_t deviceId = 0,
+        int32_t DLACore = -1, int32_t bs = batchNotProvided);
+    InferenceOptions iOptions{};
+    int32_t device{defaultDevice};
+    int32_t batch{batchNotProvided};
+    std::unique_ptr<InferenceEnvironment> iEnv;
+    std::vector<InferenceTrace> trace;
+};
+
+bool runMultiTasksInference(std::vector<std::unique_ptr<TaskInferenceEnvironment>>& tEnvList);
+
+} // namespace sample
 
 #endif // TRT_SAMPLE_INFERENCE_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleOptions.cpp b/src/Detector/tensorrt_yolo/common/sampleOptions.cpp
index 0afd163f..bdb1b21c 100644
--- a/src/Detector/tensorrt_yolo/common/sampleOptions.cpp
+++ b/src/Detector/tensorrt_yolo/common/sampleOptions.cpp
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
@@ -19,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -27,29 +29,64 @@
 
 #include "logger.h"
 #include "sampleOptions.h"
-
+#include "sampleUtils.h"
+using namespace nvinfer1;
 namespace sample
 {
 
 namespace
 {
 
-std::vector<std::string> splitToStringVec(const std::string& option, char separator)
+static std::map<char, std::pair<int64_t, std::string>> const kUNIT_MULTIPLIERS{
+    {'B', {1, "Bytes"}},
+    {'K', {1 << 10, "Kibibytes"}},
+    {'M', {1 << 20, "Mebibytes"}},
+    {'G', {1 << 30, "Gibibytes"}},
+};
+
+std::string addDefaultUnitSuffixIfNotSpecified(std::string const& option, char defaultUnit)
 {
-    std::vector<std::string> options;
+    char lastChar = option.at(option.size() - 1);
+    return std::isdigit(lastChar) ? option + defaultUnit : option;
+}
 
-    for (size_t start = 0; start < option.length();)
+// Returns "B (Bytes), K (Kibibytes), ..."
+std::string getAvailableUnitSuffixes()
+{
+    std::ostringstream ss;
+    for (auto it = kUNIT_MULTIPLIERS.begin(); it != kUNIT_MULTIPLIERS.end(); ++it)
     {
-        size_t separatorIndex = option.find(separator, start);
-        if (separatorIndex == std::string::npos)
+        if (it != kUNIT_MULTIPLIERS.begin())
         {
-            separatorIndex = option.length();
+            ss << ", ";
         }
-        options.emplace_back(option.substr(start, separatorIndex - start));
-        start = separatorIndex + 1;
+        ss << it->first << " (" << it->second.second << ")";
     }
-
-    return options;
+    return ss.str();
+}
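The unit-suffix convention implemented by kUNIT_MULTIPLIERS and the getUnitMultiplier helper just below is base-2, polygraphy-style: "20M" means 20 * 2^20 bytes, and a bare number means bytes. A standalone sketch of the same rule (illustration only, minimal error handling):

    // Base-2 unit-suffix parsing, distilled from the helpers in this file.
    #include <cctype>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    int64_t parseWithSuffix(std::string const& s)
    {
        char const last = static_cast<char>(std::toupper(static_cast<unsigned char>(s.back())));
        int64_t mult = 1;
        switch (last)
        {
        case 'K': mult = 1LL << 10; break;
        case 'M': mult = 1LL << 20; break;
        case 'G': mult = 1LL << 30; break;
        default: break; // 'B' or a trailing digit -> plain bytes
        }
        return std::stoll(s) * mult; // std::stoll stops parsing at the suffix
    }

    int main()
    {
        std::printf("%lld\n", static_cast<long long>(parseWithSuffix("20M"))); // 20971520
        return 0;
    }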
+// Numeric trtexec arguments can have unit specifiers in a similar way to polygraphy.
+// E.g. --weightStreamingBudget=20M would be 20 Mebibytes (base 2).
+int64_t getUnitMultiplier(std::string const& option)
+{
+    char lastChar = option.at(option.size() - 1);
+    if (!std::isdigit(lastChar))
+    {
+        char unit = std::toupper(lastChar);
+        auto found = kUNIT_MULTIPLIERS.find(unit);
+        if (found == kUNIT_MULTIPLIERS.end())
+        {
+            std::ostringstream ss;
+            ss << "Error parsing \"" << option << "\": invalid unit specifier '" << unit
+               << "'. Valid base-2 unit suffixes include: ";
+            ss << getAvailableUnitSuffixes() << ".";
+            throw std::invalid_argument(ss.str());
+        }
+        return found->second.first;
+    }
+
+    // Return bytes by default
+    return kUNIT_MULTIPLIERS.at('B').first;
+}
 
 template <typename T>
@@ -64,6 +101,12 @@ int32_t stringToValue<int32_t>(const std::string& option)
     return std::stoi(option);
 }
 
+template <>
+size_t stringToValue<size_t>(const std::string& option)
+{
+    return std::stoi(option) * getUnitMultiplier(option);
+}
+
 template <>
 float stringToValue<float>(const std::string& option)
 {
@@ -73,7 +116,7 @@ float stringToValue<float>(const std::string& option)
 template <>
 double stringToValue<double>(const std::string& option)
 {
-    return std::stod(option);
+    return std::stod(option) * getUnitMultiplier(option);
 }
 
 template <>
@@ -86,6 +129,10 @@ std::vector<int32_t> stringToValue<std::vector<int32_t>>(const std::string& option)
 {
     std::vector<int32_t> shape;
+    if (option == "scalar")
+    {
+        return shape;
+    }
     std::vector<std::string> dimsStrings = splitToStringVec(option, 'x');
     for (const auto& d : dimsStrings)
     {
@@ -98,8 +145,9 @@ template <>
 nvinfer1::DataType stringToValue<nvinfer1::DataType>(const std::string& option)
 {
     const std::unordered_map<std::string, nvinfer1::DataType> strToDT{{"fp32", nvinfer1::DataType::kFLOAT},
-        {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8},
-        {"int32", nvinfer1::DataType::kINT32}};
+        {"fp16", nvinfer1::DataType::kHALF}, {"bf16", nvinfer1::DataType::kBF16}, {"int8", nvinfer1::DataType::kINT8},
+        {"fp8", nvinfer1::DataType::kFP8}, {"int32", nvinfer1::DataType::kINT32}, {"int64", nvinfer1::DataType::kINT64},
+        {"bool", nvinfer1::DataType::kBOOL}, {"uint8", nvinfer1::DataType::kUINT8}, {"int4", nvinfer1::DataType::kINT4}};
     const auto& dt = strToDT.find(option);
     if (dt == strToDT.end())
     {
@@ -108,6 +156,21 @@
     return dt->second;
 }
 
+template <>
+nvinfer1::DeviceType stringToValue<nvinfer1::DeviceType>(std::string const& option)
+{
+    std::unordered_map<std::string, nvinfer1::DeviceType> const strToDevice = {
+        {"GPU", nvinfer1::DeviceType::kGPU},
+        {"DLA", nvinfer1::DeviceType::kDLA},
+    };
+    auto const& device = strToDevice.find(option);
+    if (device == strToDevice.end())
+    {
+        throw std::invalid_argument("Invalid Device Type " + option);
+    }
+    return device->second;
+}
+
 template <>
 nvinfer1::TensorFormats stringToValue<nvinfer1::TensorFormats>(const std::string& option)
 {
@@ -116,7 +179,8 @@
         {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4},
         {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16},
         {"chw32", nvinfer1::TensorFormat::kCHW32}, {"dhwc8", nvinfer1::TensorFormat::kDHWC8},
-        {"hwc", nvinfer1::TensorFormat::kHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR},
+        {"cdhw32", nvinfer1::TensorFormat::kCDHW32}, {"hwc", nvinfer1::TensorFormat::kHWC},
+        {"dhwc", nvinfer1::TensorFormat::kDHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR},
         {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}};
     nvinfer1::TensorFormats formats{};
     for (auto f : optionStrings)
@@ -149,11 +213,82 @@ IOFormat stringToValue<IOFormat>(const std::string& option)
     return ioFormat;
 }
+template <>
+SparsityFlag stringToValue<SparsityFlag>(std::string const& option)
+{
+    std::unordered_map<std::string, SparsityFlag> const table{
+        {"disable", SparsityFlag::kDISABLE}, {"enable", SparsityFlag::kENABLE}, {"force", SparsityFlag::kFORCE}};
+    auto search = table.find(option);
+    if (search == table.end())
+    {
+        throw std::invalid_argument(std::string("Unknown sparsity mode: ") + option);
+    }
+    if (search->second == SparsityFlag::kFORCE)
+    {
+        sample::gLogWarning << "--sparsity=force has been deprecated. "
+                            << "Please use <polygraphy surgeon prune> to rewrite the weights to a sparsity pattern "
+                            << "and then run with --sparsity=enable" << std::endl;
+    }
+
+    return search->second;
+}
+
+template <>
+WeightStreamingBudget stringToValue<WeightStreamingBudget>(std::string const& option)
+{
+    WeightStreamingBudget budget;
+    if (option.find('%') != std::string::npos)
+    {
+        double percent = std::stod(option);
+        if (!(percent >= 0 && percent <= 100.0))
+        {
+            std::ostringstream err;
+            err << "The weight streaming percent must be between 0 and 100.";
+            throw std::invalid_argument(err.str());
+        }
+        budget.percent = percent;
+    }
+    else
+    {
+        double bytes = stringToValue<double>(option);
+        if (!(bytes == WeightStreamingBudget::kAUTOMATIC || bytes == WeightStreamingBudget::kDISABLE || bytes >= 0))
+        {
+            std::ostringstream err;
+            err << "The weight streaming budget must be " << WeightStreamingBudget::kDISABLE << ", "
+                << WeightStreamingBudget::kAUTOMATIC << ", or at least 0.";
+            throw std::invalid_argument(err.str());
+        }
+        budget.bytes = static_cast<int64_t>(bytes);
+    }
+    return budget;
+}
+
 template <typename T>
 std::pair<std::string, T> splitNameAndValue(const std::string& s)
 {
     std::string tensorName;
     std::string valueString;
+
+    // Support 'inputName':Path format for --loadInputs flag when dealing with Windows paths.
+    // i.e. 'inputName':c:\inputData
+    std::vector<std::string> quoteNameRange{ splitToStringVec(s, '\'') };
+    // splitToStringVec returns the entire string when the delimiter is not found, so its size is always at least 1
+    if (quoteNameRange.size() != 1)
+    {
+        if (quoteNameRange.size() != 3)
+        {
+            std::string errorMsg = std::string("Found invalid number of \'s when parsing ") + s
+                + std::string(". Expected: 2, received: ") + std::to_string(quoteNameRange.size() - 1)
+                + ". Please ensure that a singular comma is used within each comma-separated key-value pair for "
+                  "options like --inputIOFormats, --optShapes, --optShapesCalib, --layerPrecisions, etc.";
+            throw std::invalid_argument(errorMsg);
+        }
+        // Everything before the second "'" is the name.
+        tensorName = quoteNameRange[0] + quoteNameRange[1];
+        // Path is the last string - ignoring leading ":" so slice it with [1:]
+        valueString = quoteNameRange[2].substr(1);
+        return std::pair<std::string, T>(tensorName, stringToValue<T>(valueString));
+    }
+
     // Split on the last :
     std::vector<std::string> nameRange{splitToStringVec(s, ':')};
     // Everything before the last : is the name
@@ -181,16 +316,71 @@ const char* boolToEnabled(bool enable)
     return enable ? "Enabled" : "Disabled";
 }
 
+//! A helper function similar to sep.join(list) in Python.
+template <typename T>
+std::string joinValuesToString(std::vector<T> const& list, std::string const& sep)
+{
+    std::ostringstream os;
+    for (int32_t i = 0, n = list.size(); i < n; ++i)
+    {
+        os << list[i];
+        if (i != n - 1)
+        {
+            os << sep;
+        }
+    }
+    return os.str();
+}
+
+template <typename T, size_t N>
+std::string joinValuesToString(std::array<T, N> const& list, std::string const& sep)
+{
+    return joinValuesToString(std::vector<T>(list.begin(), list.end()), sep);
+}
+
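splitNameAndValue's quoted-name branch above exists so that Windows paths containing drive colons survive options like --loadInputs. A distilled sketch of the splitting rule (illustration only; assumes well-formed input and omits the error paths):

    // 'name':value splits on the quotes; unquoted input splits on the last ':'.
    #include <cstdio>
    #include <string>
    #include <utility>

    std::pair<std::string, std::string> splitQuoted(std::string const& s)
    {
        auto const first = s.find('\'');
        auto const second = s.find('\'', first + 1);
        if (first == std::string::npos || second == std::string::npos)
        {
            auto const colon = s.rfind(':'); // unquoted form
            return {s.substr(0, colon), s.substr(colon + 1)};
        }
        // Skip the closing quote and the ':' that follows it.
        return {s.substr(first + 1, second - first - 1), s.substr(second + 2)};
    }

    int main()
    {
        auto const p = splitQuoted("'input':c:\\data\\input.bin");
        std::printf("%s -> %s\n", p.first.c_str(), p.second.c_str()); // input -> c:\data\input.bin
        return 0;
    }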
 //! Check if input option exists in input arguments.
-//! If it does: return its value, erase the argument and return true.
+//! If it does: set its value, and return true.
 //! If it does not: return false.
 template <typename T>
-bool getAndDelOption(Arguments& arguments, const std::string& option, T& value)
+bool getOption(Arguments& arguments, const std::string& option, T& value)
 {
-    const auto match = arguments.find(option);
+    auto const match = arguments.find(option);
     if (match != arguments.end())
     {
-        value = stringToValue<T>(match->second);
+        value = stringToValue<T>(match->second.first);
+        return true;
+    }
+
+    return false;
+}
+
+//! Check if input option exists in input arguments.
+//! If it does: set its value, erase the argument and return true.
+//! If it does not: return false.
+template <typename T_>
+bool getAndDelOption(Arguments& arguments, const std::string& option, T_& value)
+{
+    bool found = getOption(arguments, option, value);
+    if (found)
+    {
+        const auto match = arguments.find(option);
+        arguments.erase(match);
+    }
+
+    return found;
+}
+
+//! Check if input option exists in input arguments.
+//! If it does: set its value and position, erase the argument and return true.
+//! If it does not: return false.
+template <typename T_>
+bool getAndDelOptionWithPosition(Arguments& arguments, std::string const& option, T_& value, int32_t& pos)
+{
+    auto const match = arguments.find(option);
+    if (match != arguments.end())
+    {
+        value = stringToValue<T_>(match->second.first);
+        pos = match->second.second;
         arguments.erase(match);
         return true;
     }
@@ -198,8 +388,31 @@
     return false;
 }
 
+//! Check if input option exists in input arguments behind the position specified by pos.
+//! If it does: set its value, erase the argument and return true.
+//! If it does not: return false.
+template <typename T_>
+bool getAndDelOptionBehind(Arguments& arguments, std::string const& option, int32_t pos, T_& value)
+{
+    auto const match = arguments.equal_range(option);
+    if (match.first == match.second)
+    {
+        return false;
+    }
+    for (auto i = match.first; i != match.second; ++i)
+    {
+        if (i->second.second - pos == 1)
+        {
+            value = stringToValue<T_>(i->second.first);
+            arguments.erase(i);
+            return true;
+        }
+    }
+    return false;
+}
+
 //! Check if input option exists in input arguments.
-//! If it does: return false in value, erase the argument and return true.
+//! If it does: set false in value, erase the argument and return true.
 //! If it does not: return false.
bool getAndDelNegOption(Arguments& arguments, const std::string& option, bool& value) { @@ -224,34 +437,37 @@ bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, st return false; } - auto addToValues = [&values](Arguments::value_type& argValue) {values.emplace_back(stringToValue(argValue.second));}; + auto addToValues + = [&values](Arguments::value_type& argValue) { values.emplace_back(stringToValue(argValue.second.first)); }; std::for_each(match.first, match.second, addToValues); arguments.erase(match.first, match.second); return true; } -void insertShapesBuild(std::unordered_map& shapes, nvinfer1::OptProfileSelector selector, const std::string& name, const std::vector& dims) +void insertShapesBuild(BuildOptions::ShapeProfile& shapes, nvinfer1::OptProfileSelector selector, + const std::string& name, const std::vector& dims) { shapes[name][static_cast(selector)] = dims; } -void insertShapesInference(std::unordered_map>& shapes, const std::string& name, const std::vector& dims) +void insertShapesInference( + InferenceOptions::ShapeProfile& shapes, std::string const& name, std::vector const& dims) { shapes[name] = dims; } std::string removeSingleQuotationMarks(std::string& str) { - std::vector strList{splitToStringVec(str, '\'')}; - // Remove all the escaped single quotation marks - std::string retVal = ""; - // Do not really care about unterminated sequences - for (size_t i = 0; i < strList.size(); i++) - { - retVal += strList[i]; - } - return retVal; + std::vector strList{splitToStringVec(str, '\'')}; + // Remove all the escaped single quotation marks + std::string retVal; + // Do not really care about unterminated sequences + for (size_t i = 0; i < strList.size(); i++) + { + retVal += strList[i]; + } + return retVal; } void getLayerPrecisions(Arguments& arguments, char const* argument, LayerPrecisions& layerPrecisions) @@ -293,7 +509,41 @@ void getLayerOutputTypes(Arguments& arguments, char const* argument, LayerOutput } } -bool getShapesBuild(Arguments& arguments, std::unordered_map& shapes, char const* argument, +void getLayerDeviceTypes(Arguments& arguments, char const* argument, LayerDeviceTypes& layerDeviceTypes) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerDeviceTypes flag contains comma-separated layerName:deviceType pairs. + std::vector deviceList{splitToStringVec(list, ',')}; + for (auto const& s : deviceList) + { + auto nameDevicePair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(nameDevicePair.first); + layerDeviceTypes[layerName] = stringToValue(nameDevicePair.second); + } +} + +void getStringsSet(Arguments& arguments, char const* argument, StringSet& stringSet) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerPrecisions flag contains comma-separated layerName:precision pairs. 
+ std::vector strings{splitToStringVec(list, ',')}; + for (auto const& s : strings) + { + stringSet.insert(s); + } +} + +bool getShapesBuild(Arguments& arguments, BuildOptions::ShapeProfile& shapes, char const* argument, nvinfer1::OptProfileSelector selector) { std::string list; @@ -309,7 +559,7 @@ bool getShapesBuild(Arguments& arguments, std::unordered_map>& shapes, const char* argument) +bool getShapesInference(Arguments& arguments, InferenceOptions::ShapeProfile& shapes, const char* argument) { std::string list; bool retVal = getAndDelOption(arguments, argument, list); @@ -324,67 +574,195 @@ bool getShapesInference(Arguments& arguments, std::unordered_map& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +void fillShapes(BuildOptions::ShapeProfile& shapes, std::string const& name, ShapeRange const& sourceShapeRange, + nvinfer1::OptProfileSelector minDimsSource, nvinfer1::OptProfileSelector optDimsSource, + nvinfer1::OptProfileSelector maxDimsSource) { - // Only accept optShapes only or all three of minShapes, optShapes, maxShapes - if ( ((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMIN, name, sourceShapeRange[static_cast(minDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kOPT, name, sourceShapeRange[static_cast(optDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMAX, name, sourceShapeRange[static_cast(maxDimsSource)]); +} + +void processShapes(BuildOptions::ShapeProfile& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +{ + // Only accept optShapes only or all three of minShapes, optShapes, maxShapes when calib is set + if (((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes || (minShapes && !maxShapes && optShapes) // both minShapes and optShapes || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes { if (calib) { - throw std::invalid_argument("Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); - } - else - { - throw std::invalid_argument("Must specify only --optShapes or all of --minShapes, --optShapes, --maxShapes"); + throw std::invalid_argument( + "Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); } } - // If optShapes only, expand optShapes to minShapes and maxShapes - if (optShapes && !minShapes && !maxShapes) + if (!minShapes && !optShapes && !maxShapes) { - std::unordered_map newShapes; - for (auto& s : shapes) + return; + } + + BuildOptions::ShapeProfile newShapes; + for (auto& s : shapes) + { + nvinfer1::OptProfileSelector minDimsSource, optDimsSource, maxDimsSource; + minDimsSource = nvinfer1::OptProfileSelector::kMIN; + optDimsSource = nvinfer1::OptProfileSelector::kOPT; + maxDimsSource = nvinfer1::OptProfileSelector::kMAX; + + // Populate missing minShapes + if (!minShapes) + { + if (optShapes) + { + minDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + else + { + minDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing optShapes + if (!optShapes) { - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - 
insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + if (maxShapes) + { + optDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + else + { + optDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing maxShapes + if (!maxShapes) + { + if (optShapes) + { + maxDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to maxShapes for tensor " << s.first + << std::endl; + } + else + { + maxDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to maxShapes for tensor " << s.first + << std::endl; + } } - shapes = newShapes; + + fillShapes(newShapes, s.first, s.second, minDimsSource, optDimsSource, maxDimsSource); } + shapes = newShapes; } -template -void printShapes(std::ostream& os, const char* phase, const T& shapes) +bool getOptimizationProfiles( + Arguments& arguments, std::vector& optProfiles, char const* argument) { - if (shapes.empty()) + bool retValue{false}; + int32_t pos{}; + size_t profileIndex{}; + + auto getShapes + = [](BuildOptions::ShapeProfile& shapes, std::string const& list, nvinfer1::OptProfileSelector selector) { + std::vector shapeList{splitToStringVec(list, ',')}; + for (auto const& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesBuild(shapes, selector, tensorName, dims); + } + }; + + while (getAndDelOptionWithPosition(arguments, argument, profileIndex, pos)) { - os << "Input " << phase << " shapes: model" << std::endl; + BuildOptions::ShapeProfile optProfile{}; + bool minShapes{false}, maxShapes{false}, optShapes{false}; + for (int32_t i = 0; i < nvinfer1::EnumMax(); i++, pos++) + { + std::string value; + + if (!minShapes && getAndDelOptionBehind(arguments, "--minShapes", pos, value)) + { + minShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kMIN); + } + else if (!maxShapes && getAndDelOptionBehind(arguments, "--maxShapes", pos, value)) + { + maxShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kMAX); + } + else if (!optShapes && getAndDelOptionBehind(arguments, "--optShapes", pos, value)) + { + optShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kOPT); + } + else + { + break; + } + } + processShapes(optProfile, minShapes, optShapes, maxShapes, false); + if (profileIndex >= optProfiles.size()) + { + optProfiles.resize(profileIndex + 1); + } + if (!optProfiles[profileIndex].empty()) + { + throw std::invalid_argument("Optimization profile index cannot be the same."); + } + optProfiles[profileIndex] = optProfile; + retValue = true; } - else + + profileIndex = 0; + for (auto const& optProfile : optProfiles) { - for (const auto& s : shapes) + if (optProfile.empty()) { - os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl; + throw std::invalid_argument(std::string("Found invalid or missing shape spec at profile index ") + + std::to_string(profileIndex) + std::string(". 
")); } + ++profileIndex; } + return retValue; } -std::ostream& printBatch(std::ostream& os, int32_t maxBatch) +template +void printShapes(std::ostream& os, char const* phase, T const& shapes, int32_t profileIndex) { - if (maxBatch != maxBatchNotProvided) + if (shapes.empty()) { - os << maxBatch; + os << "Input " << phase << " shapes: model" << std::endl; } else { - os << "explicit batch"; + std::string profileString = (profileIndex != -1 && strcmp(phase, "build") == 0) + ? "(profile " + std::to_string(profileIndex) + ")" + : ""; + for (auto const& s : shapes) + { + os << "Input " << phase << " shape " << profileString << ": " << s.first << "=" << s.second << std::endl; + } } - return os; } -std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources) +std::ostream& printTacticSources( + std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources) { if (!enabledSources && !disabledSources) { @@ -405,24 +783,41 @@ std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabl addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS), "cublas"); addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT), "cublasLt"); -#if (NV_TENSORRT_MAJOR > 7) addSource(1U << static_cast(nvinfer1::TacticSource::kCUDNN), "cudnn"); -#endif + addSource(1U << static_cast(nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS), "edge mask convolutions"); + addSource(1U << static_cast(nvinfer1::TacticSource::kJIT_CONVOLUTIONS), "JIT convolutions"); } return os; } std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) { + if (options.stronglyTyped) + { + os << "Strongly Typed"; + return os; + } os << "FP32"; if (options.fp16) { os << "+FP16"; } + if (options.bf16) + { + os << "+BF16"; + } if (options.int8) { os << "+INT8"; } + if (options.fp8) + { + os << "+FP8"; + } + if (options.int4) + { + os << "+INT4"; + } if (options.precisionConstraints == PrecisionConstraints::kOBEY) { os << " (obey precision constraints)"; @@ -434,13 +829,27 @@ std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) return os; } -std::ostream& printTimingCache(std::ostream& os, BuildOptions const& options) +std::ostream& printTempfileControls(std::ostream& os, TempfileControlFlags const tempfileControls) +{ + auto getFlag = [&](TempfileControlFlag f) -> char const* { + bool allowed = !!(tempfileControls & (1U << static_cast(f))); + return allowed ? 
"allow" : "deny"; + }; + auto const inMemory = getFlag(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + auto const temporary = getFlag(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + + os << "{ in_memory: " << inMemory << ", temporary: " << temporary << " }"; + + return os; +} + +std::ostream& printTimingCache(std::ostream& os, TimingCacheMode const& timingCacheMode) { - switch (options.timingCacheMode) + switch (timingCacheMode) { - case TimingCacheMode::kGLOBAL: os << "global"; break; - case TimingCacheMode::kLOCAL: os << "local"; break; - case TimingCacheMode::kDISABLE: os << "disable"; break; + case TimingCacheMode::kGLOBAL: os << "global"; break; + case TimingCacheMode::kLOCAL: os << "local"; break; + case TimingCacheMode::kDISABLE: os << "disable"; break; } return os; } @@ -459,20 +868,67 @@ std::ostream& printSparsity(std::ostream& os, BuildOptions const& options) std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options) { - auto const printValueOrDefault = [&os](double const val) { + auto const printValueOrDefault = [&os](double const val, char const* unit = "MiB") { if (val >= 0) { - os << val << " MiB"; + os << val << " " << unit; } else { os << "default"; } }; - os << "workspace: "; printValueOrDefault(options.workspace); os << ", "; - os << "dlaSRAM: "; printValueOrDefault(options.dlaSRAM); os << ", "; - os << "dlaLocalDRAM: "; printValueOrDefault(options.dlaLocalDRAM); os << ", "; - os << "dlaGlobalDRAM: "; printValueOrDefault(options.dlaGlobalDRAM); + os << "workspace: "; + printValueOrDefault(options.workspace); + os << ", "; + os << "dlaSRAM: "; + printValueOrDefault(options.dlaSRAM); + os << ", "; + os << "dlaLocalDRAM: "; + printValueOrDefault(options.dlaLocalDRAM); + os << ", "; + os << "dlaGlobalDRAM: "; + printValueOrDefault(options.dlaGlobalDRAM); + os << ", "; + os << "tacticSharedMem: "; + printValueOrDefault(options.tacticSharedMem, "KiB"); + return os; +} + +std::string previewFeatureToString(PreviewFeature feature) +{ + // clang-format off + switch (feature) + { + case PreviewFeature::kPROFILE_SHARING_0806: + { + gLogWarning << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." << std::endl; + break; + } + case PreviewFeature::kALIASED_PLUGIN_IO_10_03: return "kALIASED_PLUGIN_IO_10_03"; + } + return "Invalid Preview Feature"; + // clang-format on +} + +std::ostream& printPreviewFlags(std::ostream& os, BuildOptions const& options) +{ + if (options.previewFeatures.empty()) + { + os << "Use default preview flags."; + return os; + } + + auto const addFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (options.previewFeatures.find(featVal) != options.previewFeatures.end()) + { + os << previewFeatureToString(feat) << (options.previewFeatures.at(featVal) ? 
" [ON], " : " [OFF], "); + } + }; + + addFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); + return os; } @@ -487,51 +943,41 @@ Arguments argsToArgumentsMap(int32_t argc, char* argv[]) if (valuePtr) { std::string value{valuePtr + 1}; - arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); + arguments.emplace(std::string(argv[i], valuePtr - argv[i]), std::make_pair(value, i)); } else { - arguments.emplace(argv[i], ""); + arguments.emplace(argv[i], std::make_pair(std::string(""), i)); } } return arguments; } -void BaseModelOptions::parse(Arguments& arguments) +namespace { - if (getAndDelOption(arguments, "--onnx", model)) - { - format = ModelFormat::kONNX; - } - else if (getAndDelOption(arguments, "--uff", model)) - { - format = ModelFormat::kUFF; - } - else if (getAndDelOption(arguments, "--model", model)) +std::string resolveHomeDirectoryOnLinux(std::string const& model) +{ + std::string filePath{model}; +#ifndef _WIN32 + if (filePath[0] == '~') { - format = ModelFormat::kCAFFE; + char const* home = std::getenv("HOME"); + if (home) + { + filePath.replace(0, 1, home); + } } +#endif + return filePath; } +} // namespace -void UffInput::parse(Arguments& arguments) +void BaseModelOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--uffNHWC", NHWC); - std::vector args; - if (getAndDelRepeatedOption(arguments, "--uffInput", args)) + if (getAndDelOption(arguments, "--onnx", model)) { - for (const auto& i : args) - { - std::vector values{splitToStringVec(i, ',')}; - if (values.size() == 4) - { - nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])}; - inputs.emplace_back(values[0], dims); - } - else - { - throw std::invalid_argument(std::string("Invalid uffInput ") + i); - } - } + format = ModelFormat::kONNX; + model = resolveHomeDirectoryOnLinux(model); } } @@ -541,56 +987,66 @@ void ModelOptions::parse(Arguments& arguments) switch (baseModel.format) { - case ModelFormat::kCAFFE: + case ModelFormat::kONNX: + case ModelFormat::kANY: { - getAndDelOption(arguments, "--deploy", prototxt); break; } - case ModelFormat::kUFF: - { - uffInputs.parse(arguments); - if (uffInputs.inputs.empty()) - { - throw std::invalid_argument("Uff models require at least one input"); - } - break; } - case ModelFormat::kONNX: - break; - case ModelFormat::kANY: + + if (baseModel.format == ModelFormat::kONNX) { - if (getAndDelOption(arguments, "--deploy", prototxt)) + if (!outputs.empty()) { - baseModel.format = ModelFormat::kCAFFE; + throw std::invalid_argument("The --output flag should not be used with ONNX models."); } - break; } +} + +void getTempfileControls(Arguments& arguments, char const* argument, TempfileControlFlags& tempfileControls) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; } - // The --output flag should only be used with Caffe and UFF. It has no effect on ONNX. 
- std::vector outArgs; - if (getAndDelRepeatedOption(arguments, "--output", outArgs)) + std::vector controlList{splitToStringVec(list, ',')}; + for (auto const& s : controlList) { - for (const auto& o : outArgs) + auto controlAllowPair = splitNameAndValue(s); + bool allowed{false}; + int32_t offset{-1}; + + if (controlAllowPair.second.compare("allow") == 0) { - for (auto& v : splitToStringVec(o, ',')) - { - outputs.emplace_back(std::move(v)); - } + allowed = true; } - } - if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF) - { - if (outputs.empty()) + else if (controlAllowPair.second.compare("deny") != 0) { - throw std::invalid_argument("Caffe and Uff models require at least one output"); + throw std::invalid_argument("--tempfileControls value should be `deny` or `allow`"); } - } - else if (baseModel.format == ModelFormat::kONNX) - { - if (!outputs.empty()) + + if (controlAllowPair.first.compare("in_memory") == 0) { - throw std::invalid_argument("The --output flag should not be used with ONNX models."); + offset = static_cast(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + } + else if (controlAllowPair.first.compare("temporary") == 0) + { + offset = static_cast(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + } + else + { + throw std::invalid_argument(std::string{"Unknown --tempfileControls key "} + controlAllowPair.first); + } + + if (allowed) + { + tempfileControls |= (1U << offset); + } + else + { + tempfileControls &= ~(1U << offset); } } } @@ -610,38 +1066,59 @@ void BuildOptions::parse(Arguments& arguments) getFormats(inputFormats, "--inputIOFormats"); getFormats(outputFormats, "--outputIOFormats"); - bool addedExplicitBatchFlag{false}; - getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag); - if (addedExplicitBatchFlag) - { - sample::gLogWarning << "--explicitBatch flag has been deprecated and has no effect!" << std::endl; - sample::gLogWarning << "Explicit batch dim is automatically enabled if input model is ONNX or if dynamic " - << "shapes are provided when the engine is built." 
<< std::endl; - } - - bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); - bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); - bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); - processShapes(shapes, minShapes, optShapes, maxShapes, false); - bool minShapesCalib - = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); - bool optShapesCalib - = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); - bool maxShapesCalib - = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); - processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); + bool getCalibProfile = getAndDelOption(arguments, "--calibProfile", calibProfile); + if (!getOptimizationProfiles(arguments, optProfiles, "--profile")) + { + ShapeProfile shapes; + bool minShapes{false}, optShapes{false}, maxShapes{false}; + try + { + minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string(" conversion failure: failed to parse minShapes/optShapes/maxShapes. Please double check " + "your input string.")); + } - bool addedExplicitPrecisionFlag{false}; - getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag); - if (addedExplicitPrecisionFlag) + processShapes(shapes, minShapes, optShapes, maxShapes, false); + optProfiles.emplace_back(shapes); + } + + if (calibProfile >= optProfiles.size()) + { + throw std::invalid_argument( + std::string("--calibProfile shouldn't greater than the size of optimization profile.")); + } + + BuildOptions::ShapeProfile dummyShapes; + + bool remainingMinShapes = getShapesBuild(arguments, dummyShapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + bool remainingOptShapes = getShapesBuild(arguments, dummyShapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + bool remainingMaxShapes = getShapesBuild(arguments, dummyShapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + if (remainingMinShapes || remainingOptShapes || remainingMaxShapes) { - sample::gLogWarning << "--explicitPrecision flag has been deprecated and has no effect!" << std::endl; + throw std::invalid_argument("Multiple --minShapes/--optShapes/--maxShapes without --profile are not allowed. "); } - if (getAndDelOption(arguments, "--workspace", workspace)) + bool minShapesCalib{false}, optShapesCalib{false}, maxShapesCalib{false}; + try { - sample::gLogWarning << "--workspace flag has been deprecated by --memPoolSize flag." << std::endl; + minShapesCalib = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); + optShapesCalib = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); + maxShapesCalib = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string(" conversion failure: failed to parse minShapesCalib/optShapesCalib/maxShapesCalib. 
Please " "double check your input string.")); + } + + processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); std::string memPoolSizes; getAndDelOption(arguments, "--memPoolSize", memPoolSizes); @@ -650,26 +1127,47 @@ void BuildOptions::parse(Arguments& arguments) { std::string memPoolName; double memPoolSize; - std::tie(memPoolName, memPoolSize) = splitNameAndValue(memPoolSpec); + try + { + std::string strPoolSize; + std::tie(memPoolName, strPoolSize) = splitNameAndValue(memPoolSpec); + memPoolSize = stringToValue<double>(addDefaultUnitSuffixIfNotSpecified(strPoolSize, 'M')); + } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string( + " conversion failure: failed to parse --memPoolSize. Please double check your input string.")); + } + if (memPoolSize < 0) { throw std::invalid_argument(std::string("Negative memory pool size: ") + std::to_string(memPoolSize)); } if (memPoolName == "workspace") { - workspace = memPoolSize; + // use unit in MB. + workspace = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaSRAM") { - dlaSRAM = memPoolSize; + // use unit in MB. + dlaSRAM = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaLocalDRAM") { - dlaLocalDRAM = memPoolSize; + // use unit in MB. + dlaLocalDRAM = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaGlobalDRAM") { - dlaGlobalDRAM = memPoolSize; + // use unit in MB. + dlaGlobalDRAM = memPoolSize / 1.0_MiB; + } + else if (memPoolName == "tacticSharedMem") + { + // use unit in KB. + tacticSharedMem = memPoolSize / 1.0_KiB; } else if (!memPoolName.empty()) { @@ -677,8 +1175,6 @@ void BuildOptions::parse(Arguments& arguments) } } - getAndDelOption(arguments, "--maxBatch", maxBatch); - getAndDelOption(arguments, "--minTiming", minTiming); getAndDelOption(arguments, "--avgTiming", avgTiming); bool best{false}; @@ -687,16 +1183,79 @@ void BuildOptions::parse(Arguments& arguments) { int8 = true; fp16 = true; + + // BF16 only supported on Ampere+ + if (samplesCommon::getSMVersion() >= 0x0800) + { + bf16 = true; + } } getAndDelOption(arguments, "--refit", refittable); + + getAndDelOption(arguments, "--weightless", stripWeights); + getAndDelOption(arguments, "--stripWeights", stripWeights); + + bool stripAllWeights{}; + getAndDelOption(arguments, "--stripAllWeights", stripAllWeights); + if (stripAllWeights) + { + refittable = true; + stripWeights = true; + } + + // --vc and --versionCompatible are synonyms + getAndDelOption(arguments, "--vc", versionCompatible); + if (!versionCompatible) + { + getAndDelOption(arguments, "--versionCompatible", versionCompatible); + } + +#if !TRT_WINML + // --pi and --pluginInstanceNorm are synonyms + getAndDelOption(arguments, "--pi", pluginInstanceNorm); + if (!pluginInstanceNorm) + { + getAndDelOption(arguments, "--pluginInstanceNorm", pluginInstanceNorm); + } +#endif + + getAndDelOption(arguments, "--excludeLeanRuntime", excludeLeanRuntime); + getAndDelOption(arguments, "--noCompilationCache", disableCompilationCache); getAndDelNegOption(arguments, "--noTF32", tf32); getAndDelOption(arguments, "--fp16", fp16); + getAndDelOption(arguments, "--bf16", bf16); getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--fp8", fp8); + getAndDelOption(arguments, "--int4", int4); + getAndDelOption(arguments, "--stronglyTyped", stronglyTyped); + if (stronglyTyped) + { + auto disableAndLog = [](bool& flag, std::string mode, std::string type) { + if (flag) + { + flag = false; + sample::gLogWarning << "Invalid usage, setting " << mode
+ << " mode is not allowed if graph is strongly typed. Disabling BuilderFlag::" + << type << "." << std::endl; + } + }; + disableAndLog(fp16, "fp16", "kFP16"); + disableAndLog(int8, "int8", "kINT8"); + disableAndLog(bf16, "bf16", "kBF16"); + disableAndLog(fp8, "fp8", "kFP8"); + disableAndLog(int4, "int4", "kINT4"); + } + + if (fp8 && int8) + { + throw std::invalid_argument("Invalid usage, fp8 and int8 aren't allowed to be enabled together."); + } getAndDelOption(arguments, "--safe", safe); - getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--buildDLAStandalone", buildDLAStandalone); + getAndDelOption(arguments, "--allowGPUFallback", allowGPUFallback); getAndDelOption(arguments, "--restricted", restricted); - + getAndDelOption(arguments, "--skipInference", skipInference); getAndDelOption(arguments, "--directIO", directIO); std::string precisionConstraintsString; @@ -720,10 +1279,11 @@ void BuildOptions::parse(Arguments& arguments) getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions); getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes); + getLayerDeviceTypes(arguments, "--layerDeviceTypes", layerDeviceTypes); if (layerPrecisions.empty() && layerOutputTypes.empty() && precisionConstraints != PrecisionConstraints::kNONE) { - sample::gLogWarning << "When --precisionConstraints flag is set to \"obey\" or \"prefer\", please add " + sample::gLogWarning << R"(When --precisionConstraints flag is set to "obey" or "prefer", please add )" << "--layerPrecision/--layerOutputTypes flags to set layer-wise precisions and output " << "types." << std::endl; } @@ -731,79 +1291,52 @@ void BuildOptions::parse(Arguments& arguments) && precisionConstraints == PrecisionConstraints::kNONE) { sample::gLogWarning << "--layerPrecision/--layerOutputTypes flags have no effect when --precisionConstraints " - << "flag is set to \"none\"." << std::endl; + << R"(flag is set to "none".)" << std::endl; } - std::string sparsityString; - getAndDelOption(arguments, "--sparsity", sparsityString); - if (sparsityString == "disable") - { - sparsity = SparsityFlag::kDISABLE; - } - else if (sparsityString == "enable") - { - sparsity = SparsityFlag::kENABLE; - } - else if (sparsityString == "force") - { - sparsity = SparsityFlag::kFORCE; - } - else if (!sparsityString.empty()) - { - throw std::invalid_argument(std::string("Unknown sparsity mode: ") + sparsityString); - } + getStringsSet(arguments, "--markDebug", debugTensors); + + getAndDelOption(arguments, "--sparsity", sparsity); bool calibCheck = getAndDelOption(arguments, "--calib", calibration); - if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty()) + if (int8 && calibCheck && !optProfiles[calibProfile].empty() && shapesCalib.empty()) { - shapesCalib = shapes; + shapesCalib = optProfiles[calibProfile]; } - - std::string profilingVerbosityString; - if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString)) + else if (!shapesCalib.empty() && getCalibProfile) { - sample::gLogWarning << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." << std::endl; + sample::gLogWarning + << "--calibProfile has no effect when --minShapesCalib/--optShapesCalib/--maxShapesCalib is set."
+ << std::endl; } + std::string profilingVerbosityString; + getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString); if (profilingVerbosityString == "layer_names_only") { -#if (NV_TENSORRT_MAJOR > 7) profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (profilingVerbosityString == "none") { profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE; } -#if (NV_TENSORRT_MAJOR > 7) else if (profilingVerbosityString == "detailed") { profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; } -#endif else if (profilingVerbosityString == "default") { -#if (NV_TENSORRT_MAJOR > 7) sample::gLogWarning << "--profilingVerbosity=default has been deprecated by " "--profilingVerbosity=layer_names_only." << std::endl; profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (profilingVerbosityString == "verbose") { -#if (NV_TENSORRT_MAJOR > 7) sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated by --profilingVerbosity=detailed." << std::endl; profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (!profilingVerbosityString.empty()) { @@ -814,6 +1347,8 @@ void BuildOptions::parse(Arguments& arguments) { load = true; } + getAndDelOption(arguments, "--getPlanVersionOnly", getPlanVersionOnly); + if (getAndDelOption(arguments, "--saveEngine", engine)) { save = true; @@ -858,12 +1393,18 @@ void BuildOptions::parse(Arguments& arguments) { source = nvinfer1::TacticSource::kCUBLAS_LT; } -#if (NV_TENSORRT_MAJOR > 7) else if (t == "CUDNN") { source = nvinfer1::TacticSource::kCUDNN; } -#endif + else if (t == "EDGE_MASK_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS; + } + else if (t == "JIT_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kJIT_CONVOLUTIONS; + } else { throw std::invalid_argument(std::string("Unknown tactic source: ") + t); @@ -887,38 +1428,179 @@ void BuildOptions::parse(Arguments& arguments) } } - bool noBuilderCache{false}; - getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); - getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); - if (noBuilderCache) + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--errorOnTimingCacheMiss", errorOnTimingCacheMiss); + getAndDelOption(arguments, "--builderOptimizationLevel", builderOptimizationLevel); + getAndDelOption(arguments, "--maxTactics", maxTactics); + + std::string runtimePlatformArgs; + getAndDelOption(arguments, "--runtimePlatform", runtimePlatformArgs); + if (runtimePlatformArgs == "SameAsBuild" || runtimePlatformArgs.empty()) + { + runtimePlatform = RuntimePlatform::kSAME_AS_BUILD; + } + else if (runtimePlatformArgs == "WindowsAMD64") + { + runtimePlatform = RuntimePlatform::kWINDOWS_AMD64; + } + else + { + throw std::invalid_argument(std::string("Unknown runtime platform: ") + runtimePlatformArgs + + ". 
Valid options: SameAsBuild, WindowsAMD64."); + } + + std::string hardwareCompatibleArgs; + getAndDelOption(arguments, "--hardwareCompatibilityLevel", hardwareCompatibleArgs); + if (hardwareCompatibleArgs == "none" || hardwareCompatibleArgs.empty()) + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kNONE; + } + else if (samplesCommon::toLower(hardwareCompatibleArgs) == "ampere+") + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kAMPERE_PLUS; + } + else + { + throw std::invalid_argument(std::string("Unknown hardwareCompatibilityLevel: ") + hardwareCompatibleArgs + + ". Valid options: none, ampere+."); + } + + if (pluginInstanceNorm && (versionCompatible || hardwareCompatibilityLevel == HardwareCompatibilityLevel::kAMPERE_PLUS)) + { + throw std::invalid_argument("Plugin InstanceNorm cannot be used with version compatible or hardware compatible engines!"); + } + + getAndDelOption(arguments, "--maxAuxStreams", maxAuxStreams); + + std::string previewFeaturesBuf; + getAndDelOption(arguments, "--preview", previewFeaturesBuf); + std::vector<std::string> previewFeaturesVec{splitToStringVec(previewFeaturesBuf, ',')}; + for (auto featureName : previewFeaturesVec) + { + bool enable{false}; + if (featureName.front() == '+') + { + enable = true; + } + else if (featureName.front() != '-') + { + throw std::invalid_argument( + "Preview features must be prefixed with + or -, indicating whether it should be enabled or disabled " + "respectively."); + } + featureName.erase(0, 1); + + PreviewFeature feat{}; + if (featureName == "profileSharing0806") + { + sample::gLogWarning + << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." + << std::endl; + } + else if (featureName == "aliasedPluginIO1003") + { + feat = PreviewFeature::kALIASED_PLUGIN_IO_10_03; + } + else + { + throw std::invalid_argument(std::string("Unknown preview feature: ") + featureName); + } + previewFeatures[static_cast<int32_t>(feat)] = enable; + } + + getAndDelOption(arguments, "--tempdir", tempdir); + getTempfileControls(arguments, "--tempfileControls", tempfileControls); + + std::string runtimeMode; + getAndDelOption(arguments, "--useRuntime", runtimeMode); + if (runtimeMode == "full") { - timingCacheMode = TimingCacheMode::kDISABLE; + useRuntime = RuntimeMode::kFULL; } - else if (!timingCacheFile.empty()) + else if (runtimeMode == "dispatch") { - timingCacheMode = TimingCacheMode::kGLOBAL; + useRuntime = RuntimeMode::kDISPATCH; } - else + else if (runtimeMode == "lean") { - timingCacheMode = TimingCacheMode::kLOCAL; + useRuntime = RuntimeMode::kLEAN; + } + else if (!runtimeMode.empty()) + { + throw std::invalid_argument(std::string("Unknown useRuntime: ") + runtimeMode); } + + if ((useRuntime == RuntimeMode::kDISPATCH || useRuntime == RuntimeMode::kLEAN) && !versionCompatible) + { + versionCompatible = true; + sample::gLogWarning << "Implicitly enabling --versionCompatible since --useRuntime=" << runtimeMode + << " is set."
<< std::endl; + } + + if (useRuntime != RuntimeMode::kFULL && !load) + { + throw std::invalid_argument(std::string("Building a TensorRT engine requires --useRuntime=full.")); + } + + getAndDelOption(arguments, "--leanDLLPath", leanDLLPath); + + // Don't delete the option because the inference option parser requires it + getOption(arguments, "--allowWeightStreaming", allowWeightStreaming); } void SystemOptions::parse(Arguments& arguments) { getAndDelOption(arguments, "--device", device); getAndDelOption(arguments, "--useDLACore", DLACore); - getAndDelOption(arguments, "--allowGPUFallback", fallback); +#if !TRT_WINML std::string pluginName; while (getAndDelOption(arguments, "--plugins", pluginName)) { + sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead." << std::endl; plugins.emplace_back(pluginName); } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--setPluginsToSerialize", pluginName)) + { + setPluginsToSerialize.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--dynamicPlugins", pluginName)) + { + dynamicPlugins.emplace_back(pluginName); + } + getAndDelOption(arguments, "--ignoreParsedPluginLibs", ignoreParsedPluginLibs); +#endif } +constexpr int64_t WeightStreamingBudget::kDISABLE; +constexpr int64_t WeightStreamingBudget::kAUTOMATIC; + void InferenceOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--streams", streams); + + if (getAndDelOption(arguments, "--streams", infStreams)) + { + sample::gLogWarning << "--streams flag has been deprecated, use --infStreams flag instead." << std::endl; + } + getAndDelOption(arguments, "--infStreams", infStreams); + getAndDelOption(arguments, "--iterations", iterations); getAndDelOption(arguments, "--duration", duration); getAndDelOption(arguments, "--warmUp", warmup); @@ -935,9 +1617,9 @@ void InferenceOptions::parse(Arguments& arguments) getAndDelOption(arguments, "--threads", threads); getAndDelOption(arguments, "--useCudaGraph", graph); getAndDelOption(arguments, "--separateProfileRun", rerun); - getAndDelOption(arguments, "--buildOnly", skip); getAndDelOption(arguments, "--timeDeserialize", timeDeserialize); getAndDelOption(arguments, "--timeRefit", timeRefit); + getAndDelOption(arguments, "--persistentCacheRatio", persistentCacheRatio); std::string list; getAndDelOption(arguments, "--loadInputs", list); @@ -945,25 +1627,81 @@ void InferenceOptions::parse(Arguments& arguments) splitInsertKeyValue(inputsList, inputs); getShapesInference(arguments, shapes, "--shapes"); - getAndDelOption(arguments, "--batch", batch); + setOptProfile = getAndDelOption(arguments, "--useProfile", optProfileIndex); + + std::string allocationStrategyString; + getAndDelOption(arguments, "--allocationStrategy", allocationStrategyString); + if (allocationStrategyString == "static") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kSTATIC; + } + else if (allocationStrategyString == "profile") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kPROFILE; + } + else if (allocationStrategyString == "runtime") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kRUNTIME; + } + else if (!allocationStrategyString.empty()) + { + throw std::invalid_argument(std::string("Unknown allocationStrategy: ") + allocationStrategyString); + } + + bool allowWs{false}; + getAndDelOption(arguments, "--allowWeightStreaming", allowWs); + bool wsBudgetFound = getAndDelOption(arguments, 
"--weightStreamingBudget", weightStreamingBudget); + if (wsBudgetFound && !allowWs) + { + throw std::invalid_argument( + "The weight streaming budget can only be set with --allowWeightStreaming specified."); + } + if (allowWs && weightStreamingBudget.isDisabled()) + { + sample::gLogWarning << "The engine can stream its weights but it will not at runtime because " + "--weightStreamingBudget unset or set to " + << WeightStreamingBudget::kDISABLE << "." << std::endl; + } + + std::string debugTensorList; + getAndDelOption(arguments, "--saveDebugTensors", debugTensorList); + std::vector fileNames{splitToStringVec(debugTensorList, ',')}; + splitInsertKeyValue(fileNames, debugTensorFileNames); } void ReportingOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--percentile", percentile); getAndDelOption(arguments, "--avgRuns", avgs); getAndDelOption(arguments, "--verbose", verbose); getAndDelOption(arguments, "--dumpRefit", refit); getAndDelOption(arguments, "--dumpOutput", output); + getAndDelOption(arguments, "--dumpRawBindingsToFile", dumpRawBindings); getAndDelOption(arguments, "--dumpProfile", profile); getAndDelOption(arguments, "--dumpLayerInfo", layerInfo); + getAndDelOption(arguments, "--dumpOptimizationProfile", optProfileInfo); getAndDelOption(arguments, "--exportTimes", exportTimes); getAndDelOption(arguments, "--exportOutput", exportOutput); getAndDelOption(arguments, "--exportProfile", exportProfile); getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo); - if (percentile < 0 || percentile > 100) + + std::string percentileString; + getAndDelOption(arguments, "--percentile", percentileString); + std::vector percentileStrings = splitToStringVec(percentileString, ','); + if (!percentileStrings.empty()) + { + percentiles.clear(); + } + for (const auto& p : percentileStrings) { - throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + percentiles.push_back(stringToValue(p)); + } + + for (auto percentile : percentiles) + { + if (percentile < 0.F || percentile > 100.F) + { + throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + } } } @@ -983,61 +1721,40 @@ void AllOptions::parse(Arguments& arguments) system.parse(arguments); inference.parse(arguments); - // Use explicitBatch when input model is ONNX or when dynamic shapes are used. - const bool isOnnx{model.baseModel.format == ModelFormat::kONNX}; - const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()}; - const bool detectedExplicitBatch = isOnnx || hasDynamicShapes; - - // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim. - const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided}; - const bool batchWasSet{inference.batch != batchNotProvided}; - if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet)) + if (build.useRuntime != RuntimeMode::kFULL && inference.timeRefit) { - throw std::invalid_argument( - "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes " - "are provided. Please use --optShapes and --shapes to set input shapes instead."); + throw std::invalid_argument("--timeRefit requires --useRuntime=full."); } - // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values. 
- if (!detectedExplicitBatch) + if (inference.optProfileIndex < static_cast<int32_t>(build.optProfiles.size())) { - // If batch is not set, set it to default value. - if (!batchWasSet) - { - inference.batch = defaultBatch; - } - // If maxBatch is not set, set it to be equal to batch. - if (!maxBatchWasSet) + // Propagate shape profile between builder and inference + for (auto const& s : build.optProfiles[inference.optProfileIndex]) { - build.maxBatch = inference.batch; + if (inference.shapes.find(s.first) == inference.shapes.end()) + { + insertShapesInference( + inference.shapes, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]); + } } - // MaxBatch should not be less than batch. - if (build.maxBatch < inference.batch) + for (auto const& s : inference.shapes) { - throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch) - + " is less than inference batch " + std::to_string(inference.batch)); + if (build.optProfiles[inference.optProfileIndex].find(s.first) + == build.optProfiles[inference.optProfileIndex].end()) + { + // assume min/opt/max all the same + insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kMIN, + s.first, s.second); + insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kOPT, + s.first, s.second); + insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kMAX, + s.first, s.second); + } } } - if (build.shapes.empty() && !inference.shapes.empty()) - { - // If --shapes are provided but --optShapes are not, assume that optShapes is the same as shapes. - for (auto& s : inference.shapes) - { - insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second); - insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second); - insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second); - } - } - else if (!build.shapes.empty() && inference.shapes.empty()) - { - // If --optShapes are provided but --shapes are not, assume that shapes is the same as optShapes. - for (auto& s : build.shapes) - { - insertShapesInference( - inference.shapes, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]); - } - } + // Set nvtxVerbosity to be the same as build-time profilingVerbosity. + inference.nvtxVerbosity = build.profilingVerbosity; reporting.parse(arguments); helps = parseHelp(arguments); @@ -1050,31 +1767,56 @@ void AllOptions::parse(Arguments& arguments) } if (build.safe && system.DLACore >= 0) { - auto checkSafeDLAFormats = [](std::vector<IOFormat> const& fmt) { - return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [](IOFormat const& pair) { + build.buildDLAStandalone = true; + } + if (build.runtimePlatform != nvinfer1::RuntimePlatform::kSAME_AS_BUILD) + { + build.skipInference = true; + } + if (build.buildDLAStandalone) + { + build.skipInference = true; + auto checkSafeDLAFormats = [](std::vector<IOFormat> const& fmt, bool isInput) { + return fmt.empty() ?
false : std::all_of(fmt.begin(), fmt.end(), [&](IOFormat const& pair) { bool supported{false}; - bool const isLINEAR{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kLINEAR)}; - bool const isCHW4{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW4)}; + bool const isDLA_LINEAR{ + pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kDLA_LINEAR)}; + bool const isHWC4{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW4) + || pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kDLA_HWC4)}; bool const isCHW32{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW32)}; bool const isCHW16{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW16)}; - supported |= pair.first == nvinfer1::DataType::kINT8 && (isLINEAR || isCHW4 || isCHW32); - supported |= pair.first == nvinfer1::DataType::kHALF && (isLINEAR || isCHW4 || isCHW16); + supported |= pair.first == nvinfer1::DataType::kINT8 + && (isDLA_LINEAR || (isInput ? isHWC4 : false) || isCHW32); + supported |= pair.first == nvinfer1::DataType::kHALF + && (isDLA_LINEAR || (isInput ? isHWC4 : false) || isCHW16); return supported; }); }; - if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.outputFormats)) + if (!checkSafeDLAFormats(build.inputFormats, true) || !checkSafeDLAFormats(build.outputFormats, false)) { throw std::invalid_argument( - "I/O formats for safe DLA capability are restricted to fp16/int8:linear, fp16:chw16 or int8:chw32"); + "I/O formats for safe DLA capability are restricted to fp16/int8:dla_linear, fp16/int8:hwc4, " + "fp16:chw16 or " + "int8:chw32"); } - if (system.fallback) + if (build.allowGPUFallback) { - throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability"); + throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for DLA standalone mode"); } } } } +void TaskInferenceOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "engine", engine); + getAndDelOption(arguments, "device", device); + getAndDelOption(arguments, "batch", batch); + getAndDelOption(arguments, "DLACore", DLACore); + getAndDelOption(arguments, "graph", graph); + getAndDelOption(arguments, "persistentCacheRatio", persistentCacheRatio); +} + void SafeBuilderOptions::parse(Arguments& arguments) { auto getFormats = [&arguments](std::vector<IOFormat>& formatsVector, const char* argument) { @@ -1097,13 +1839,36 @@ void SafeBuilderOptions::parse(Arguments& arguments) getFormats(outputFormats, "--outputIOFormats"); getAndDelOption(arguments, "--int8", int8); getAndDelOption(arguments, "--calib", calibFile); - getAndDelOption(arguments, "--consistency", consistency); getAndDelOption(arguments, "--std", standard); +#if !TRT_WINML std::string pluginName; while (getAndDelOption(arguments, "--plugins", pluginName)) { + sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead."
<< std::endl; plugins.emplace_back(pluginName); } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +#endif + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + getAndDelOption(arguments, "--avgTiming", avgTiming); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--sparsity", sparsity); } std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) @@ -1113,59 +1878,25 @@ std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) os << "Format: "; switch (options.format) { - case ModelFormat::kCAFFE: - { - os << "Caffe"; - break; - } case ModelFormat::kONNX: { os << "ONNX"; break; } - case ModelFormat::kUFF: - { - os << "UFF"; - break; - } - case ModelFormat::kANY: - os << "*"; - break; + case ModelFormat::kANY: os << "*"; break; } os << std::endl << "Model: " << options.model << std::endl; return os; } -std::ostream& operator<<(std::ostream& os, const UffInput& input) -{ - os << "Uff Inputs Layout: " << (input.NHWC ? "NHWC" : "NCHW") << std::endl; - for (const auto& i : input.inputs) - { - os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; - } - - return os; -} - std::ostream& operator<<(std::ostream& os, const ModelOptions& options) { os << options.baseModel; switch (options.baseModel.format) { - case ModelFormat::kCAFFE: - { - os << "Prototxt: " << options.prototxt << std::endl; - break; - } - case ModelFormat::kUFF: - { - os << options.uffInputs; - break; - } case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case - case ModelFormat::kANY: - break; + case ModelFormat::kANY: break; } os << "Output:"; @@ -1192,6 +1923,11 @@ std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) os << "fp16"; break; } + case nvinfer1::DataType::kBF16: + { + os << "bf16"; + break; + } case nvinfer1::DataType::kINT8: { os << "int8"; @@ -1207,6 +1943,26 @@ std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) os << "bool"; break; } + case nvinfer1::DataType::kUINT8: + { + os << "uint8"; + break; + } + case nvinfer1::DataType::kFP8: + { + os << "fp8"; + break; + } + case nvinfer1::DataType::kINT64: + { + os << "int64"; + break; + } + case nvinfer1::DataType::kINT4: + { + os << "int4"; + break; + } } return os; } @@ -1240,13 +1996,11 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) os << "hwc8"; break; } -#if (NV_TENSORRT_MAJOR > 7) case nvinfer1::TensorFormat::kHWC16: { os << "hwc16"; break; } -#endif case nvinfer1::TensorFormat::kCHW4: { os << "chw4"; @@ -1277,6 +2031,11 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) os << "hwc"; break; } + case nvinfer1::TensorFormat::kDHWC: + { + os << "dhwc"; + break; + } case nvinfer1::TensorFormat::kDLA_LINEAR: { os << "dla_linear"; @@ -1293,6 +2052,42 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) return os; } +std::ostream& operator<<(std::ostream& os, nvinfer1::DeviceType devType) +{ + switch (devType) + { + case nvinfer1::DeviceType::kGPU: + { + os << "GPU"; + break; + } + case nvinfer1::DeviceType::kDLA: + { + os << "DLA"; + break; + } + } + return os; +} + 
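The stream inserters added in this region (`nvinfer1::DeviceType` above and `nvinfer1::RuntimePlatform` just below) all use the same exhaustive-switch idiom, and returning the stream is what lets them chain inside the larger `operator<<` overloads for the option structs. A self-contained sketch of the idiom with a hypothetical enum, not a type from this patch:

```cpp
#include <iostream>

// Hypothetical enum; stands in for nvinfer1::DeviceType and similar types.
enum class Backend
{
    kGPU,
    kDLA,
};

// Exhaustive switch over the enumerators; no default case, so a compiler
// warning flags any enumerator added later but not handled here.
std::ostream& operator<<(std::ostream& os, Backend b)
{
    switch (b)
    {
    case Backend::kGPU: os << "GPU"; break;
    case Backend::kDLA: os << "DLA"; break;
    }
    return os; // returning the stream enables chaining: os << a << b
}

int main()
{
    std::cout << "Device type: " << Backend::kDLA << std::endl; // prints "Device type: DLA"
}
```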
+std::ostream& operator<<(std::ostream& os, nvinfer1::RuntimePlatform platform) +{ + switch (platform) + { + case nvinfer1::RuntimePlatform::kSAME_AS_BUILD: + { + os << "Same As Build"; + break; + } + case nvinfer1::RuntimePlatform::kWINDOWS_AMD64: + { + os << "Windows AMD64"; + break; + } + } + return os; +} + std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) { int32_t i = 0; @@ -1319,29 +2114,76 @@ std::ostream& operator<<(std::ostream& os, LayerPrecisions const& layerPrecision return os; } +std::ostream& operator<<(std::ostream& os, LayerDeviceTypes const& layerDeviceTypes) +{ + int32_t i = 0; + for (auto const& layerDevicePair : layerDeviceTypes) + { + os << (i++ ? ", " : "") << layerDevicePair.first << ":" << layerDevicePair.second; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, StringSet const& stringSet) +{ + int64_t i = 0; + for (auto const& s : stringSet) + { + os << (i ? "," : "") << s; + ++i; + } + return os; +} + std::ostream& operator<<(std::ostream& os, const BuildOptions& options) { + // if loadEngine is specified, BuildOptions are N/A + if (options.load) + { + os << std::endl; + return os; + } // clang-format off os << "=== Build Options ===" << std::endl << - - "Max batch: "; printBatch(os, options.maxBatch) << std::endl << "Memory Pools: "; printMemoryPools(os, options) << std::endl << - "minTiming: " << options.minTiming << std::endl << "avgTiming: " << options.avgTiming << std::endl << "Precision: "; printPrecision(os, options) << std::endl << "LayerPrecisions: " << options.layerPrecisions << std::endl << + "Layer Device Types: " << options.layerDeviceTypes << std::endl << "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << "Refit: " << boolToEnabled(options.refittable) << std::endl << + "Strip weights: " << boolToEnabled(options.stripWeights) << std::endl << + "Version Compatible: " << boolToEnabled(options.versionCompatible) << std::endl << +#if !TRT_WINML + "ONNX Plugin InstanceNorm: " << boolToEnabled(options.pluginInstanceNorm) << std::endl << +#endif + "TensorRT runtime: " << options.useRuntime << std::endl << + "Lean DLL Path: " << options.leanDLLPath << std::endl << + "Tempfile Controls: "; printTempfileControls(os, options.tempfileControls) << std::endl << + "Exclude Lean Runtime: " << boolToEnabled(options.excludeLeanRuntime) << std::endl << "Sparsity: "; printSparsity(os, options) << std::endl << "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "Build DLA standalone loadable: " << boolToEnabled(options.buildDLAStandalone) << std::endl << + "Allow GPU fallback for DLA: " << boolToEnabled(options.allowGPUFallback) << std::endl << "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << + "Skip inference: " << boolToEnabled(options.skipInference) << std::endl << "Save engine: " << (options.save ? options.engine : "") << std::endl << "Load engine: " << (options.load ? 
options.engine : "") << std::endl << "Profiling verbosity: " << static_cast<int32_t>(options.profilingVerbosity) << std::endl << "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << - "timingCacheMode: "; printTimingCache(os, options) << std::endl << - "timingCacheFile: " << options.timingCacheFile << std::endl; + "timingCacheMode: "; printTimingCache(os, options.timingCacheMode) << std::endl << + "timingCacheFile: " << options.timingCacheFile << std::endl << + "Enable Compilation Cache: " << boolToEnabled(!options.disableCompilationCache) << std::endl << + "errorOnTimingCacheMiss: " << boolToEnabled(options.errorOnTimingCacheMiss) << std::endl << + "Preview Features: "; printPreviewFlags(os, options) << std::endl << + "MaxAuxStreams: " << options.maxAuxStreams << std::endl << + "BuilderOptimizationLevel: " << options.builderOptimizationLevel << std::endl << + "MaxTactics: " << options.maxTactics << std::endl << + "Calibration Profile Index: " << options.calibProfile << std::endl << + "Weight Streaming: " << boolToEnabled(options.allowWeightStreaming) << std::endl << + "Runtime Platform: " << options.runtimePlatform << std::endl << + "Debug Tensors: " << options.debugTensors << std::endl; // clang-format on auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector<IOFormat> formats) { @@ -1351,7 +2193,7 @@ std::ostream& operator<<(std::ostream& os, const BuildOptions& options) } else { - for(const auto& f : formats) + for (const auto& f : formats) { os << direction << ": " << f << std::endl; } @@ -1360,8 +2202,11 @@ std::ostream& operator<<(std::ostream& os, const BuildOptions& options) printIOFormats(os, "Input(s)", options.inputFormats); printIOFormats(os, "Output(s)", options.outputFormats); - printShapes(os, "build", options.shapes); - printShapes(os, "calibration", options.shapesCalib); + for (size_t i = 0; i < options.optProfiles.size(); i++) + { + printShapes(os, "build", options.optProfiles[i], i); + } + printShapes(os, "calibration", options.shapesCalib, -1); return os; } @@ -1372,8 +2217,8 @@ std::ostream& operator<<(std::ostream& os, const SystemOptions& options) os << "=== System Options ===" << std::endl << "Device: " << options.device << std::endl << - "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << - (options.DLACore != -1 && options.fallback ? "(With GPU fallback)" : "") << std::endl; + "DLACore: " << (options.DLACore != -1 ?
std::to_string(options.DLACore) : "") << std::endl; +#if !TRT_WINML os << "Plugins:"; for (const auto& p : options.plugins) @@ -1382,13 +2227,32 @@ std::ostream& operator<<(std::ostream& os, const SystemOptions& options) } os << std::endl; + os << "setPluginsToSerialize:"; + + for (const auto& p : options.setPluginsToSerialize) + { + os << " " << p; + } + os << std::endl; + + os << "dynamicPlugins:"; + + for (const auto& p : options.dynamicPlugins) + { + os << " " << p; + } + os << std::endl; + + os << "ignoreParsedPluginLibs: " << options.ignoreParsedPluginLibs << std::endl; + os << std::endl; +#endif return os; // clang-format on } std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { -// clang-format off + // clang-format off os << "=== Inference Options ===" << std::endl << "Batch: "; @@ -1400,48 +2264,71 @@ std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { os << "Explicit" << std::endl; } - printShapes(os, "inference", options.shapes); - os << "Iterations: " << options.iterations << std::endl << - "Duration: " << options.duration << "s (+ " - << options.warmup << "ms warm up)" << std::endl << - "Sleep time: " << options.sleep << "ms" << std::endl << - "Idle time: " << options.idle << "ms" << std::endl << - "Streams: " << options.streams << std::endl << - "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << - "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << - "Spin-wait: " << boolToEnabled(options.spin) << std::endl << - "Multithreading: " << boolToEnabled(options.threads) << std::endl << - "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << - "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << - "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << - "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << - "Skip inference: " << boolToEnabled(options.skip) << std::endl; - -// clang-format on + printShapes(os, "inference", options.shapes, options.optProfileIndex); + + std::string wsBudget{"Disabled"}; + if (options.weightStreamingBudget.bytes == WeightStreamingBudget::kAUTOMATIC) + { + wsBudget = "Automatic"; + } + else if (options.weightStreamingBudget.bytes != WeightStreamingBudget::kDISABLE) + { + wsBudget = std::to_string(options.weightStreamingBudget.bytes) + " bytes"; + } + else if (options.weightStreamingBudget.percent != WeightStreamingBudget::kDISABLE) + { + wsBudget = std::to_string(options.weightStreamingBudget.percent) + "%"; + } + + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Idle time: " << options.idle << "ms" << std::endl << + "Inference Streams: " << options.infStreams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << + "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << + "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << + "NVTX verbosity: " << static_cast<int32_t>(options.nvtxVerbosity) << std::endl << + "Persistent Cache Ratio: "
<< static_cast<double>(options.persistentCacheRatio) << std::endl << + "Optimization Profile Index: " << options.optProfileIndex << std::endl << + "Weight Streaming Budget: " << wsBudget << std::endl; + // clang-format on + os << "Inputs:" << std::endl; for (const auto& input : options.inputs) { os << input.first << "<-" << input.second << std::endl; } + os << "Debug Tensor Save Destinations:" << std::endl; + for (auto const& fileName : options.debugTensorFileNames) + { + os << fileName.first << ": " << fileName.second << std::endl; + } + return os; } std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) { -// clang-format off - os << "=== Reporting Options ===" << std::endl << - - "Verbose: " << boolToEnabled(options.verbose) << std::endl << - "Averages: " << options.avgs << " inferences" << std::endl << - "Percentile: " << options.percentile << std::endl << - "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << - "Dump output: " << boolToEnabled(options.output) << std::endl << - "Profile: " << boolToEnabled(options.profile) << std::endl << - "Export timing to JSON file: " << options.exportTimes << std::endl << - "Export output to JSON file: " << options.exportOutput << std::endl << - "Export profile to JSON file: " << options.exportProfile << std::endl; -// clang-format on + // clang-format off + os << "=== Reporting Options ===" << std::endl << + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentiles: " << joinValuesToString(options.percentiles, ",") << std::endl << + "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export output to JSON file: " << options.exportOutput << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; + // clang-format on return os; } @@ -1461,7 +2348,7 @@ std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) } else { - for(const auto& f : formats) + for (const auto& f : formats) { os << direction << ": " << f << std::endl; } @@ -1476,197 +2363,288 @@ std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) { os << " + INT8"; } + if (options.fp8) + { + os << " + FP8"; + } + if (options.int4) + { + os << " + INT4"; + } os << std::endl; os << "Calibration file: " << options.calibFile << std::endl; os << "Serialized Network: " << options.serialized << std::endl; printIOFormats(os, "Input(s)", options.inputFormats); printIOFormats(os, "Output(s)", options.outputFormats); - +#if !TRT_WINML os << "Plugins:"; for (const auto& p : options.plugins) { os << " " << p; } +#endif + os << "timingCacheMode: "; + printTimingCache(os, options.timingCacheMode) << std::endl; + os << "timingCacheFile: " << options.timingCacheFile << std::endl; os << std::endl; return os; } void BaseModelOptions::help(std::ostream& os) { -// clang-format off - os << " --uff=<file> UFF model" << std::endl << - " --onnx=<file> ONNX model" << std::endl << - " --model=<file> Caffe model (default = no model, random weights used)" << std::endl; -// clang-format on -} - -void UffInput::help(std::ostream& os) -{ -// clang-format off - os << " --uffInput=<name>,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " "multiple times; at least one is required for UFF models" << std::endl <<
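The `1.0_MiB` and `1.0_KiB` divisions used earlier in the `--memPoolSize` handling rely on user-defined literals for base-2 sizes. A minimal sketch of how such literals can be defined; this is assumed to approximate what the sample's common headers provide, and the definitions here are illustrative rather than the ones shipped with TensorRT:

```cpp
#include <iostream>

// Minimal user-defined literals for base-2 sizes. The sample's own headers
// define equivalents; these are stand-ins for illustration only.
constexpr long double operator""_KiB(long double v)
{
    return v * (1 << 10);
}
constexpr long double operator""_MiB(long double v)
{
    return v * (1 << 20);
}

int main()
{
    // A pool size parsed into bytes is converted back to MiB for storage,
    // mirroring `workspace = memPoolSize / 1.0_MiB;` in the patch.
    double const poolBytes = 268435456.0; // 256 MiB
    std::cout << poolBytes / 1.0_MiB << " MiB" << std::endl; // prints 256
}
```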
- " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << - "X,Y,Z=H,W,C order in --uffInput)" << std::endl; -// clang-format on + // clang-format off + os << " --onnx=<file> ONNX model" << std::endl; + // clang-format on } void ModelOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Model Options ===" << std::endl; BaseModelOptions::help(os); - os << " --deploy=<file> Caffe prototxt file" << std::endl << - " --output=<name>[,<name>]* Output names (it can be specified multiple times); at least one output " "is required for UFF and Caffe" << std::endl; - UffInput::help(os); -// clang-format on + // clang-format on } void BuildOptions::help(std::ostream& os) { -// clang-format off - os << "=== Build Options ===" "\n" - " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" - " This option should not be used when the input model is ONNX or when dynamic shapes are provided." "\n" - " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" - " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" - " Note: All three of min, opt and max shapes must be supplied." "\n" - " However, if only opt shapes is supplied then it will be expanded so" "\n" - " that min shapes and max shapes are set to the same values as opt shapes." "\n" - " Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." "\n" - " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" - " Each input shape is supplied as a key-value pair where key is the input name and" "\n" - " value is the dimensions (including the batch dimension) to be used for that input." "\n" - " Each key-value pair has the key and value separated using a colon (:)." "\n" - " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" - " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" - " See --outputIOFormats help for the grammar of type and format list." "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " inputs following the same order as network inputs ID (even if only one input" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." "\n" - " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " outputs following the same order as network outputs ID (even if only one output" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." "\n" - " IO Formats: spec ::= IOfmt[\",\"spec]" "\n" - " IOfmt ::= type:fmt" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" "\n" - " --workspace=N Set workspace size in MiB."
"\n" - " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" - " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" - " Pool constraint: poolspec ::= poolfmt[\",\"poolspec]" "\n" - " poolfmt ::= pool:sizeInMiB" "\n" - " pool ::= \"workspace\"|\"dlaSRAM\"|\"dlaLocalDRAM\"|\"dlaGlobalDRAM\"" "\n" - " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" - " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " - << defaultMinTiming << ")" "\n" - " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " - << defaultAvgTiming << ")" "\n" - " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" - " and weights within the engine." "\n" - " --sparsity=spec Control sparsity (default = disabled). " "\n" - " Sparsity: spec ::= \"disable\", \"enable\", \"force\"" "\n" - " Note: Description about each of these options is as below" "\n" - " disable = do not enable sparse tactics in the builder (this is the default)" "\n" - " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" - " considered if the weights have the right sparsity pattern)" "\n" - " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" - " a sparsity pattern (even if you loaded a model yourself)" "\n" - " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" - " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" - " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" - " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" - " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" - " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" - " Precision Constaints: spec ::= \"none\" | \"obey\" | \"prefer\"" "\n" - " none = no constraints" "\n" - " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" - " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" - " otherwise" "\n" - " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers." "\n" - " Per-layer precision spec ::= layerPrecision[\",\"spec]" "\n" - " layerPrecision ::= layerName\":\"precision" "\n" - " precision ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" - " one output, then multiple types separated by \"+\" can be provided for this layer." 
"\n" - " Per-layer output type spec ::= layerOutputTypes[\",\"spec]" "\n" - " layerOutputTypes ::= layerName\":\"type" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"[\"+\"type]" "\n" - " --calib= Read INT8 calibration cache file" "\n" - " --safe Enable build safety certified engine" "\n" - " --consistency Perform consistency checking on safety certified engine" "\n" - " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" - " --saveEngine= Save the serialized engine" "\n" - " --loadEngine= Load a serialized engine" "\n" - " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" - " tactic sources (default = all available tactics)." "\n" - " Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics." "\n" - " Tactic Sources: tactics ::= [\",\"tactic]" "\n" - " tactic ::= (+|-)lib" "\n" - " lib ::= \"CUBLAS\"|\"CUBLAS_LT\"|\"CUDNN\"" "\n" - " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" - " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" - " --timingCacheFile= Save/load the serialized global timing cache" "\n" + // clang-format off + os << "=== Build Options ===" "\n" + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" + " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" + " Note: All three of min, opt and max shapes must be supplied." "\n" + " However, if only opt shapes is supplied then it will be expanded so" "\n" + " that min shapes and max shapes are set to the same values as opt shapes." "\n" + " Input names can be wrapped with escaped single quotes (ex: 'Input:0')." "\n" + " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" + " For scalars (0-D shapes), use input0:scalar or simply input0: with nothing after the colon." "\n" + " Each input shape is supplied as a key-value pair where key is the input name and" "\n" + " value is the dimensions (including the batch dimension) to be used for that input." "\n" + " Each key-value pair has the key and value separated using a colon (:)." "\n" + " Multiple input shapes can be provided via comma-separated key-value pairs, and each input name can" "\n" + " contain at most one wildcard ('*') character." "\n" + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" + " See --outputIOFormats help for the grammar of type and format list." "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " inputs following the same order as network inputs ID (even if only one input" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." 
"\n" + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " outputs following the same order as network outputs ID (even if only one output" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + R"( IO Formats: spec ::= IOfmt[","spec])" "\n" + " IOfmt ::= type:fmt" "\n" + R"( type ::= "fp32"|"fp16"|"bf16"|"int32"|"int64"|"int8"|"uint8"|"bool")" "\n" + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" "\n" + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" "\n" + " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s)" "\n" + " Supports the following base-2 suffixes: " << getAvailableUnitSuffixes() << "." "\n" + " If none of suffixes is appended, the defualt unit is in MiB." "\n" + " Note: Also accepts decimal sizes, e.g. 0.25M. Will be rounded down to the nearest integer bytes." "\n" + " In particular, for dlaSRAM the bytes will be rounded down to the nearest power of 2." "\n" + R"( Pool constraint: poolspec ::= poolfmt[","poolspec])" "\n" + " poolfmt ::= pool:size" "\n" + R"( pool ::= "workspace"|"dlaSRAM"|"dlaLocalDRAM"|"dlaGlobalDRAM"|"tacticSharedMem")" "\n" + " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)." "\n" + " Please only assign once." "\n" + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" "\n" + " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" + " and weights within the engine." "\n" + " --stripWeights Strip weights from plan. This flag works with either refit or refit with identical weights. Default""\n" + " to latter, but you can switch to the former by enabling both --stripWeights and --refit at the same""\n" + " time." "\n" + " --stripAllWeights Alias for combining the --refit and --stripWeights options. It marks all weights as refittable," "\n" + " disregarding any performance impact. Additionally, it strips all refittable weights after the " "\n" + " engine is built." "\n" + " --weightless [Deprecated] this knob has been deprecated. Please use --stripWeights" "\n" + " --versionCompatible, --vc Mark the engine as version compatible. This allows the engine to be used with newer versions" "\n" + " of TensorRT on the same host OS, as well as TensorRT's dispatch and lean runtimes." "\n" +#if !TRT_WINML + " --pluginInstanceNorm, --pi Set `kNATIVE_INSTANCENORM` to false in the ONNX parser. This will cause the ONNX parser to use" "\n" + " a plugin InstanceNorm implementation over the native implementation when parsing." "\n" +#endif + R"( --useRuntime=runtime TensorRT runtime to execute engine. "lean" and "dispatch" require loading VC engine and do)" "\n" + " not support building an engine." "\n" + R"( runtime::= "full"|"lean"|"dispatch")" "\n" + " --leanDLLPath= External lean runtime DLL to use in version compatiable mode." "\n" + " --excludeLeanRuntime When --versionCompatible is enabled, this flag indicates that the generated engine should" "\n" + " not include an embedded lean runtime. If this is set, the user must explicitly specify a" "\n" + " valid lean runtime to use when loading the engine." "\n" + " --sparsity=spec Control sparsity (default = disabled). 
" "\n" + R"( Sparsity: spec ::= "disable", "enable", "force")" "\n" + " Note: Description about each of these options is as below" "\n" + " disable = do not enable sparse tactics in the builder (this is the default)" "\n" + " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" + " considered if the weights have the right sparsity pattern)" "\n" + " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" + " a sparsity pattern (even if you loaded a model yourself)" "\n" + " [Deprecated] this knob has been deprecated." "\n" + " Please use to rewrite the weights." "\n" + " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" + " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" + " --bf16 Enable bf16 precision, in addition to fp32 (default = disabled)" "\n" + " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" + " --fp8 Enable fp8 precision, in addition to fp32 (default = disabled)" "\n" + " --int4 Enable int4 precision, in addition to fp32 (default = disabled)" "\n" + " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" + " --stronglyTyped Create a strongly typed network. (default = disabled)" "\n" + " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" + R"( Precision Constraints: spec ::= "none" | "obey" | "prefer")" "\n" + " none = no constraints" "\n" + " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" + " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" + " otherwise" "\n" + " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none))" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. Each layer name can)" "\n" + " contain at most one wildcard ('*') character." "\n" + R"( Per-layer precision spec ::= layerPrecision[","spec])" "\n" + R"( layerPrecision ::= layerName":"precision)" "\n" + R"( precision ::= "fp32"|"fp16"|"bf16"|"int32"|"int8")" "\n" + " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none)" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. Each layer name can)" "\n" + " contain at most one wildcard ('*') character. If a layer has more than" "\n" + R"( one output, then multiple types separated by "+" can be provided for this layer.)" "\n" + R"( Per-layer output type spec ::= layerOutputTypes[","spec])" "\n" + R"( layerOutputTypes ::= layerName":"type)" "\n" + R"( type ::= "fp32"|"fp16"|"bf16"|"int32"|"int8"["+"type])" "\n" + " --layerDeviceTypes=spec Specify layer-specific device type." "\n" + " The specs are read left-to-right, and later ones override earlier ones. If a layer does not have" "\n" + " a device type specified, the layer will opt for the default device type." 
"\n" + R"( Per-layer device type spec ::= layerDeviceTypePair[","spec])" "\n" + R"( layerDeviceTypePair ::= layerName":"deviceType)" "\n" + R"( deviceType ::= "GPU"|"DLA")" "\n" + " --calib= Read INT8 calibration cache file" "\n" + " --safe Enable build safety certified engine, if DLA is enable, --buildDLAStandalone will be specified" "\n" + " automatically (default = disabled)" "\n" + " --buildDLAStandalone Enable build DLA standalone loadable which can be loaded by cuDLA, when this option is enabled, " "\n" + " --allowGPUFallback is disallowed and --skipInference is enabled by default. Additionally, " "\n" + " specifying --inputIOFormats and --outputIOFormats restricts I/O data type and memory layout" "\n" + " (default = disabled)" "\n" + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers (default = disabled)" "\n" + " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" + " --saveEngine= Save the serialized engine" "\n" + " --loadEngine= Load a serialized engine" "\n" + " --getPlanVersionOnly Print TensorRT version when loaded plan was created. Works without deserialization of the plan." "\n" + " Use together with --loadEngine. Supported only for engines created with 8.6 and forward." "\n" + " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" + " tactic sources (default = all available tactics)." "\n" + " Note: Currently only cuDNN, cuBLAS, cuBLAS-LT, and edge mask convolutions are listed as optional" "\n" + " tactics." "\n" + R"( Tactic Sources: tactics ::= [","tactic])" "\n" + " tactic ::= (+|-)lib" "\n" + R"( lib ::= "CUBLAS"|"CUBLAS_LT"|"CUDNN"|"EDGE_MASK_CONVOLUTIONS")" "\n" + R"( |"JIT_CONVOLUTIONS")" "\n" + " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" + " --noCompilationCache Disable Compilation cache in builder, and the cache is part of timing cache (default is to enable compilation cache)" "\n" + " --errorOnTimingCacheMiss Emit error when a tactic being timed is not present in the timing cache (default = false)" "\n" + " --timingCacheFile= Save/load the serialized global timing cache" "\n" + " --preview=features Specify preview feature to be used by adding (+) or removing (-) preview features from the default" "\n" + R"( Preview Features: features ::= [","feature])" "\n" + " feature ::= (+|-)flag" "\n" + R"( flag ::= "aliasedPluginIO1003")" "\n" + R"( |"profileSharing0806")" "\n" + " --builderOptimizationLevel Set the builder optimization level. (default is 3)" "\n" + " Higher level allows TensorRT to spend more building time for more optimization options." "\n" + " Valid values include integers from 0 to the maximum optimization level, which is currently 5." "\n" + " --maxTactics Set the maximum number of tactics to time when there is a choice of tactics. (default is -1)" "\n" + " Larger number of tactics allow TensorRT to spend more building time on evaluating tactics." "\n" + " Default value -1 means TensorRT can decide the number of tactics based on its own heuristic." "\n" + " --hardwareCompatibilityLevel=mode Make the engine file compatible with other GPU architectures. 
(default = none)" "\n" + R"( Hardware Compatibility Level: mode ::= "none" | "ampere+")" "\n" + " none = no compatibility" "\n" + " ampere+ = compatible with Ampere and newer GPUs" "\n" + " --runtimePlatform=platform Set the target platform for runtime execution. (default = SameAsBuild)" "\n" + " When this option is enabled, --skipInference is enabled by default." "\n" + R"( RuntimePlatfrom: platform ::= "SameAsBuild" | "WindowsAMD64")" "\n" + " SameAsBuild = no requirement for cross-platform compatibility." "\n" + " WindowsAMD64 = set the target platform for engine execution as Windows AMD64 system" "\n" + " --tempdir= Overrides the default temporary directory TensorRT will use when creating temporary files." "\n" + " See IRuntime::setTemporaryDirectory API documentation for more information." "\n" + " --tempfileControls=controls Controls what TensorRT is allowed to use when creating temporary executable files." "\n" + " Should be a comma-separated list with entries in the format (in_memory|temporary):(allow|deny)." "\n" + " in_memory: Controls whether TensorRT is allowed to create temporary in-memory executable files." "\n" + " temporary: Controls whether TensorRT is allowed to create temporary executable files in the" "\n" + " filesystem (in the directory given by --tempdir)." "\n" + " For example, to allow in-memory files and disallow temporary files:" "\n" + " --tempfileControls=in_memory:allow,temporary:deny" "\n" + R"( If a flag is unspecified, the default behavior is "allow".)" "\n" + " --maxAuxStreams=N Set maximum number of auxiliary streams per inference stream that TRT is allowed to use to run " "\n" + " kernels in parallel if the network contains ops that can run in parallel, with the cost of more " "\n" + " memory usage. Set this to 0 for optimal memory usage. (default = using heuristics)" "\n" + " --profile Build with dynamic shapes using a profile with the min/max/opt shapes provided. Can be specified" "\n" + " multiple times to create multiple profiles with contiguous index." "\n" + " (ex: --profile=0 --minShapes= --optShapes= --maxShapes= --profile=1 ...)" "\n" + " --calibProfile Select the optimization profile to calibrate by index. (default = " + << defaultOptProfileIndex << ")" "\n" + " --allowWeightStreaming Enable a weight streaming engine. Must be specified with --stronglyTyped. TensorRT will disable" "\n" + " weight streaming at runtime unless --weightStreamingBudget is specified." "\n" + " --markDebug Specify list of names of tensors to be marked as debug tensors. 
Separate names with a comma" "\n" ; -// clang-format on + // clang-format on os << std::flush; } void SystemOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== System Options ===" << std::endl << " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << - " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " - "(default = disabled)" << std::endl; - os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; -// clang-format on +#if TRT_WINML + std::endl; +#else + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << + " --dynamicPlugins Plugin library (.so) to load dynamically and may be serialized with the engine if they are included in --setPluginsToSerialize (can be specified multiple times)" << std::endl << + " --setPluginsToSerialize Plugin library (.so) to be serialized with the engine (can be specified multiple times)" << std::endl << + " --ignoreParsedPluginLibs By default, when building a version-compatible engine, plugin libraries specified by the ONNX parser " << std::endl << + " are implicitly serialized with the engine (unless --excludeLeanRuntime is specified) and loaded dynamically. " << std::endl << + " Enable this flag to ignore these plugin libraries instead." << std::endl; +#endif + // clang-format on } void InferenceOptions::help(std::ostream& os) { // clang-format off os << "=== Inference Options ===" << std::endl << - " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << - " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << - " shapes are provided when the engine is built." << std::endl << " --shapes=spec Set input shapes for dynamic shapes inference inputs." << std::endl << - " Note: Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." << std::endl << + R"( Note: Input names can be wrapped with escaped single quotes (ex: 'Input:0').)" << std::endl << " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << + " For scalars (0-D shapes), use input0:scalar or simply input0: with nothing after the colon."<< std::endl << " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << " Each key-value pair has the key and value separated using a colon (:)." << std::endl << - " Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl << + " Multiple input shapes can be provided via comma-separated key-value pairs, and each input " << std::endl << + " name can contain at most one wildcard ('*') character." << std::endl << " --loadInputs=spec Load input values from files (default = generate random inputs). Input names can be " "wrapped with single quotes (ex: 'Input:0')" << std::endl << - " Input values spec ::= Ival[\",\"spec]" << std::endl << - " Ival ::= name\":\"file" << std::endl << + R"( Input values spec ::= Ival[","spec])" << std::endl << + R"( Ival ::= name":"file)" << std::endl << + " Consult the README for more information on generating files for custom inputs." 
<< std::endl << " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = " << defaultWarmUp << ")" << std::endl << " --duration=N Run performance measurements for at least N seconds wallclock time (default = " << defaultDuration << ")" << std::endl << + " If -1 is specified, inference will keep running unless stopped manually" << std::endl << " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute " "(default = " << defaultSleep << ")" << std::endl << " --idleTime=N Sleep N milliseconds between two continuous iterations" "(default = " << defaultIdle << ")" << std::endl << - " --streams=N Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl << + " --infStreams=N Instantiate N execution contexts to run inference concurrently " + "(default = " << defaultStreams << ")" << std::endl << " --exposeDMA Serialize DMA transfers to and from device (default = disabled)." << std::endl << " --noDataTransfers Disable DMA transfers to and from device (default = enabled)." << std::endl << - " --useManagedMemory Use managed memory instead of seperate host and device allocations (default = disabled)." << std::endl << + " --useManagedMemory Use managed memory instead of separate host and device allocations (default = disabled)." << std::endl << " --useSpinWait Actively synchronize on GPU events. This option may decrease synchronization time but " "increase CPU usage and power (default = disabled)" << std::endl << " --threads Enable multithreading to drive engines with independent threads" @@ -1677,42 +2655,84 @@ void InferenceOptions::help(std::ostream& os) " --timeRefit Time the amount of time it takes to refit the engine before inference." << std::endl << " --separateProfileRun Do not attach the profiler in the benchmark run; if profiling is enabled, a second " "profile run will be executed (default = disabled)" << std::endl << - " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; + " --skipInference Exit after the engine has been built and skip inference perf measurement " + "(default = disabled)" << std::endl << + " --persistentCacheRatio Set the persistentCacheLimit in ratio, 0.5 represent half of max persistent L2 size " + "(default = 0)" << std::endl << + " --useProfile Set the optimization profile for the inference context " + "(default = " << defaultOptProfileIndex << " )." << std::endl << + " --allocationStrategy=spec Specify how the internal device memory for inference is allocated." << std::endl << + R"( Strategy: spec ::= "static", "profile", "runtime")" << std::endl << + " static = Allocate device memory based on max size across all profiles." << std::endl << + " profile = Allocate device memory based on max size of the current profile." << std::endl << + " runtime = Allocate device memory based on the actual input shapes." << std::endl << + " --saveDebugTensors Specify list of names of tensors to turn on the debug state" << std::endl << + " and filename to save raw outputs to." << std::endl << + " These tensors must be specified as debug tensors during build time." << std::endl << + R"( Input values spec ::= Ival[","spec])" << std::endl << + R"( Ival ::= name":"file)" << std::endl << + " --weightStreamingBudget Set the maximum amount of GPU memory TensorRT is allowed to use for weights." 
<< std::endl << + " It can take on the following values:" << std::endl << + " -2: (default) Disable weight streaming at runtime." << std::endl << + " -1: TensorRT will automatically decide the budget." << std::endl << + " 0-100%: Percentage of streamable weights that reside on the GPU." << std::endl << + " 0% saves the most memory but will have the worst performance." << std::endl << + " Requires the % character." << std::endl << + " >=0B: The exact amount of streamable weights that reside on the GPU. Supports the " << std::endl << + " following base-2 suffixes: " << getAvailableUnitSuffixes() << "." << std::endl; // clang-format on } void ReportingOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Reporting Options ===" << std::endl << " --verbose Use verbose logging (default = false)" << std::endl << " --avgRuns=N Report performance measurements averaged over N consecutive " "iterations (default = " << defaultAvgRuns << ")" << std::endl << - " --percentile=P Report performance for the P percentage (0<=P<=100, 0 " + " --percentile=P1,P2,P3,... Report performance for the P1,P2,P3,... percentages (0<=P_i<=100, 0 " "representing max perf, and 100 representing min perf; (default" - " = " << defaultPercentile << "%)" << std::endl << + " = " << joinValuesToString(defaultPercentiles, ",") << "%)" << std::endl << " --dumpRefit Print the refittable layers and weights from a refittable " "engine" << std::endl << " --dumpOutput Print the output tensor(s) of the last inference iteration " "(default = disabled)" << std::endl << + " --dumpRawBindingsToFile Print the input/output tensor(s) of the last inference iteration to file" + "(default = disabled)" << std::endl << " --dumpProfile Print profile information per layer (default = disabled)" << std::endl << " --dumpLayerInfo Print layer information of the engine to console " "(default = disabled)" << std::endl << + " --dumpOptimizationProfile Print the optimization profile(s) information " + "(default = disabled)" << std::endl << " --exportTimes= Write the timing results in a json file (default = disabled)" << std::endl << " --exportOutput= Write the output tensors to a json file (default = disabled)" << std::endl << " --exportProfile= Write the profile information per layer in a json file " "(default = disabled)" << std::endl << " --exportLayerInfo= Write the layer information of the engine in a json file " "(default = disabled)" << std::endl; -// clang-format on + // clang-format on +} + +void TaskInferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Task Inference Options ===" << std::endl << + " engine= Specify a serialized engine for this task" << std::endl << + " device=N Specify a GPU device for this task" << std::endl << + " DLACore=N Specify a DLACore for this task" << std::endl << + " batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used for explicit batch engines" << std::endl << + " graph=1 Use cuda graph for this task" << std::endl << + " persistentCacheRatio=[0-1] Set the persistentCacheLimit ratio for this task (default = 0)" << std::endl; + // clang-format on } void helpHelp(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Help ===" << std::endl << " --help, -h Print this message" << std::endl; -// clang-format on + // clang-format on } void AllOptions::help(std::ostream& os) @@ -1723,19 +2743,6 @@ void AllOptions::help(std::ostream& os) os << std::endl; 
InferenceOptions::help(os); os << std::endl; -// clang-format off - os << "=== Build and Inference Batch Options ===" << std::endl << - " When using implicit batch, the max batch size of the engine, if not given, " << std::endl << - " is set to the inference batch size;" << std::endl << - " when using explicit batch, if shapes are specified only for inference, they " << std::endl << - " will be used also as min/opt/max in the build profile; if shapes are " << std::endl << - " specified only for the build, the opt shapes will be used also for inference;" << std::endl << - " if both are specified, they must be compatible; and if explicit batch is " << std::endl << - " enabled but neither is specified, the model must provide complete static" << std::endl << - " dimensions, including batch size, for all inputs" << std::endl << - " Using ONNX models automatically forces explicit batch." << std::endl << - std::endl; - // clang-format on ReportingOptions::help(os); os << std::endl; SystemOptions::help(os); @@ -1745,7 +2752,7 @@ void AllOptions::help(std::ostream& os) void SafeBuilderOptions::printHelp(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Mandatory ===" << std::endl << " --onnx= ONNX model" << std::endl << " " << std::endl << @@ -1759,20 +2766,34 @@ void SafeBuilderOptions::printHelp(std::ostream& os) " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << " outputs following the same order as network outputs ID (even if only one output" << std::endl << " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << - " IO Formats: spec ::= IOfmt[\",\"spec]" << std::endl << + R"( IO Formats: spec ::= IOfmt[","spec])" << std::endl << " IOfmt ::= type:fmt" << std::endl << - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl << - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" << std::endl << + R"( type ::= "fp32"|"fp16"|"int32"|"int8")" << std::endl << + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" << std::endl << + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" << std::endl << " --int8 Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl << - " --consistency Enable consistency check for serialized engine, (default = disabled)" << std::endl << " --std Build standard serialized engine, (default = disabled)" << std::endl << " --calib= Read INT8 calibration cache file" << std::endl << " --serialized= Save the serialized network" << std::endl << - " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl << +#if !TRT_WINML + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << +#endif " --verbose or -v Use verbose logging (default = false)" << std::endl << " --help or -h Print this message" << std::endl << - " " << std::endl; -// clang-format on + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" << std::endl << + " --timingCacheFile= Save/load the serialized global timing cache" << std::endl << + " --sparsity=spec Control sparsity (default = disabled). 
" << std::endl << + R"( Sparsity: spec ::= "disable", "enable", "force")" << std::endl << + " Note: Description about each of these options is as below" << std::endl << + " disable = do not enable sparse tactics in the builder (this is the default)" << std::endl << + " enable = enable sparse tactics in the builder (but these tactics will only be" << std::endl << + " considered if the weights have the right sparsity pattern)" << std::endl << + " force = enable sparse tactics in the builder and force-overwrite the weights to have" << std::endl << + " a sparsity pattern" << std::endl << + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " << std::endl << + "" << defaultAvgTiming << ")" << std::endl << + "" << std::endl; + // clang-format on } } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleOptions.h b/src/Detector/tensorrt_yolo/common/sampleOptions.h index 8975e1ea..8ca0a655 100644 --- a/src/Detector/tensorrt_yolo/common/sampleOptions.h +++ b/src/Detector/tensorrt_yolo/common/sampleOptions.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -32,9 +34,10 @@ namespace sample { // Build default params -constexpr int32_t maxBatchNotProvided{0}; -constexpr int32_t defaultMinTiming{1}; constexpr int32_t defaultAvgTiming{8}; +constexpr int32_t defaultMaxAuxStreams{-1}; +constexpr int32_t defaultBuilderOptimizationLevel{-1}; +constexpr int32_t defaultMaxTactics{-1}; // System default params constexpr int32_t defaultDevice{0}; @@ -44,14 +47,16 @@ constexpr int32_t defaultBatch{1}; constexpr int32_t batchNotProvided{0}; constexpr int32_t defaultStreams{1}; constexpr int32_t defaultIterations{10}; +constexpr int32_t defaultOptProfileIndex{0}; constexpr float defaultWarmUp{200.F}; constexpr float defaultDuration{3.F}; constexpr float defaultSleep{}; constexpr float defaultIdle{}; +constexpr float defaultPersistentCacheRatio{0}; // Reporting default params constexpr int32_t defaultAvgRuns{10}; -constexpr float defaultPercentile{99}; +constexpr std::array defaultPercentiles{90, 95, 99}; enum class PrecisionConstraints { @@ -63,9 +68,7 @@ enum class PrecisionConstraints enum class ModelFormat { kANY, - kCAFFE, - kONNX, - kUFF + kONNX }; enum class SparsityFlag @@ -82,7 +85,55 @@ enum class TimingCacheMode kGLOBAL }; -using Arguments = std::unordered_multimap; +enum class MemoryAllocationStrategy +{ + kSTATIC, //< Allocate device memory based on max size across all profiles. + kPROFILE, //< Allocate device memory based on max size of the current profile. + kRUNTIME, //< Allocate device memory based on the current input shapes. +}; + +//! +//! \enum RuntimeMode +//! +//! \brief Used to dictate which TensorRT runtime library to dynamically load. +//! +enum class RuntimeMode +{ + //! Maps to libnvinfer.so or nvinfer.dll + kFULL, + + //! 
- static void help(std::ostream& out); +class Options +{ +public: + virtual ~Options() = default; + virtual void parse(Arguments& arguments) = 0; }; -struct UffInput : public Options +class BaseModelOptions : public Options { - std::vector<std::pair<std::string, nvinfer1::Dims>> inputs; - bool NHWC{false}; +public: + ModelFormat format{ModelFormat::kANY}; + std::string model; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct ModelOptions : public Options +class ModelOptions : public Options { +public: BaseModelOptions baseModel; std::string prototxt; std::vector<std::string> outputs; - UffInput uffInputs; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct BuildOptions : public Options +constexpr nvinfer1::TempfileControlFlags getTempfileControlDefaults() { - int32_t maxBatch{maxBatchNotProvided}; + using F = nvinfer1::TempfileControlFlag; + return (1U << static_cast<uint32_t>(F::kALLOW_TEMPORARY_FILES)) + | (1U << static_cast<uint32_t>(F::kALLOW_IN_MEMORY_FILES)); +} +
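getTempfileControlDefaults() above packs the two TempfileControlFlag bits into a mask with everything allowed. As a quick illustration of how such a mask is manipulated (a sketch, not part of the patch; `makeControls` is a hypothetical helper), denying one class of temporary files is just clearing its bit:

```cpp
#include <cstdint>
#include "NvInferRuntime.h"

// Sketch: build a tempfile-control mask equivalent to
// --tempfileControls=in_memory:deny,temporary:allow.
nvinfer1::TempfileControlFlags makeControls()
{
    using F = nvinfer1::TempfileControlFlag;
    // Start from "everything allowed" (what getTempfileControlDefaults() returns)...
    nvinfer1::TempfileControlFlags controls = (1U << static_cast<uint32_t>(F::kALLOW_TEMPORARY_FILES))
        | (1U << static_cast<uint32_t>(F::kALLOW_IN_MEMORY_FILES));
    // ...then clear the in-memory bit to deny in-memory executable files.
    controls &= ~(1U << static_cast<uint32_t>(F::kALLOW_IN_MEMORY_FILES));
    return controls; // e.g. passed to IRuntime::setTempfileControlFlags()
}
```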
+class BuildOptions : public Options +{ +public: + // Unit in MB. double workspace{-1.0}; + // Unit in MB. double dlaSRAM{-1.0}; + // Unit in MB. double dlaLocalDRAM{-1.0}; + // Unit in MB. double dlaGlobalDRAM{-1.0}; - int32_t minTiming{defaultMinTiming}; + // Unit in KB. + double tacticSharedMem{-1.0}; int32_t avgTiming{defaultAvgTiming}; + size_t calibProfile{defaultOptProfileIndex}; bool tf32{true}; bool fp16{false}; + bool bf16{false}; bool int8{false}; + bool fp8{false}; + bool int4{false}; + bool stronglyTyped{false}; bool directIO{false}; PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE}; LayerPrecisions layerPrecisions; LayerOutputTypes layerOutputTypes; + LayerDeviceTypes layerDeviceTypes; + StringSet debugTensors; + StringSet debugTensorStates; bool safe{false}; - bool consistency{false}; + bool buildDLAStandalone{false}; + bool allowGPUFallback{false}; bool restricted{false}; + bool skipInference{false}; bool save{false}; bool load{false}; bool refittable{false}; + bool stripWeights{false}; + bool versionCompatible{false}; + bool pluginInstanceNorm{false}; + bool excludeLeanRuntime{false}; + bool disableCompilationCache{false}; + int32_t builderOptimizationLevel{defaultBuilderOptimizationLevel}; + int32_t maxTactics{defaultMaxTactics}; SparsityFlag sparsity{SparsityFlag::kDISABLE}; -#if (NV_TENSORRT_MAJOR > 7) - nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; -#else - nvinfer1::ProfilingVerbosity profilingVerbosity{ nvinfer1::ProfilingVerbosity::kDEFAULT }; -#endif + nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; std::string engine; std::string calibration; - std::unordered_map<std::string, ShapeRange> shapes; - std::unordered_map<std::string, ShapeRange> shapesCalib; + using ShapeProfile = std::unordered_map<std::string, ShapeRange>; + std::vector<ShapeProfile> optProfiles; + ShapeProfile shapesCalib; std::vector<IOFormat> inputFormats; std::vector<IOFormat> outputFormats; nvinfer1::TacticSources enabledTactics{0}; nvinfer1::TacticSources disabledTactics{0}; TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; std::string timingCacheFile{}; + bool errorOnTimingCacheMiss{false}; + // C++11 does not automatically generate a hash function for enum class. + // Use int32_t to support C++11 compilers. + std::unordered_map<int32_t, bool> previewFeatures; + nvinfer1::HardwareCompatibilityLevel hardwareCompatibilityLevel{nvinfer1::HardwareCompatibilityLevel::kNONE}; + nvinfer1::RuntimePlatform runtimePlatform{nvinfer1::RuntimePlatform::kSAME_AS_BUILD}; + std::string tempdir{}; + nvinfer1::TempfileControlFlags tempfileControls{getTempfileControlDefaults()}; + RuntimeMode useRuntime{RuntimeMode::kFULL}; + std::string leanDLLPath{}; + int32_t maxAuxStreams{defaultMaxAuxStreams}; + bool getPlanVersionOnly{false}; + + bool allowWeightStreaming{false}; + void parse(Arguments& arguments) override; static void help(std::ostream& out); };
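The comment inside BuildOptions above explains why previewFeatures is keyed by int32_t rather than by the enum itself: C++11 standard libraries did not guarantee a std::hash specialization for enum classes (that defect was fixed for C++14). A small self-contained illustration of the workaround, using a stand-in enum rather than the real nvinfer1 type:

```cpp
#include <cstdint>
#include <unordered_map>

enum class Feature : int32_t { kA, kB }; // stand-in for e.g. a preview-feature enum

int main()
{
    // std::unordered_map<Feature, bool> may fail to compile on strict C++11
    // libraries (no std::hash<Feature>), so the key is stored as int32_t.
    std::unordered_map<int32_t, bool> features;
    features[static_cast<int32_t>(Feature::kA)] = true;
    return static_cast<int>(features.count(static_cast<int32_t>(Feature::kB)));
}
```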
-struct SystemOptions : public Options +class SystemOptions : public Options { +public: int32_t device{defaultDevice}; int32_t DLACore{-1}; - bool fallback{false}; + bool ignoreParsedPluginLibs{false}; std::vector<std::string> plugins; + std::vector<std::string> setPluginsToSerialize; + std::vector<std::string> dynamicPlugins; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct InferenceOptions : public Options +class InferenceOptions : public Options { +public: int32_t batch{batchNotProvided}; int32_t iterations{defaultIterations}; - int32_t streams{defaultStreams}; + int32_t infStreams{defaultStreams}; + int32_t optProfileIndex{defaultOptProfileIndex}; float warmup{defaultWarmUp}; float duration{defaultDuration}; float sleep{defaultSleep}; float idle{defaultIdle}; + float persistentCacheRatio{defaultPersistentCacheRatio}; bool overlap{true}; bool skipTransfers{false}; bool useManaged{false}; bool spin{false}; bool threads{false}; bool graph{false}; - bool skip{false}; bool rerun{false}; bool timeDeserialize{false}; bool timeRefit{false}; + bool setOptProfile{false}; std::unordered_map<std::string, std::string> inputs; - std::unordered_map<std::string, std::vector<int32_t>> shapes; + using ShapeProfile = std::unordered_map<std::string, std::vector<int32_t>>; + ShapeProfile shapes; + nvinfer1::ProfilingVerbosity nvtxVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; + MemoryAllocationStrategy memoryAllocationStrategy{MemoryAllocationStrategy::kSTATIC}; + std::unordered_map<std::string, std::string> debugTensorFileNames; + + WeightStreamingBudget weightStreamingBudget; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct ReportingOptions : public Options +class ReportingOptions : public Options { +public: bool verbose{false}; int32_t avgs{defaultAvgRuns}; - float percentile{defaultPercentile}; + std::vector<float> percentiles{defaultPercentiles.begin(), defaultPercentiles.end()}; bool refit{false}; bool output{false}; + bool dumpRawBindings{false}; bool profile{false}; bool layerInfo{false}; + bool optProfileInfo{false}; std::string exportTimes; std::string exportOutput; std::string exportProfile; @@ -229,8 +346,9 @@ struct ReportingOptions : public Options static void help(std::ostream& out); }; -struct SafeBuilderOptions : public Options +class SafeBuilderOptions : public Options { +public: std::string serialized{}; std::string onnxModelFile{}; bool help{false}; @@ -238,18 +356,24 @@ struct SafeBuilderOptions : public Options std::vector<IOFormat> inputFormats; std::vector<IOFormat> outputFormats; bool int8{false}; + bool fp8{false}; + bool int4{false}; std::string calibFile{}; std::vector<std::string> plugins; - bool consistency{false}; bool standard{false}; + TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; + std::string timingCacheFile{}; + SparsityFlag sparsity{SparsityFlag::kDISABLE}; + int32_t avgTiming{defaultAvgTiming}; void parse(Arguments& arguments) override; static void printHelp(std::ostream& out); }; -struct AllOptions : public Options +class AllOptions : public
Options { +public: ModelOptions model; BuildOptions build; SystemOptions system; @@ -262,6 +386,20 @@ struct AllOptions : public Options static void help(std::ostream& out); }; +class TaskInferenceOptions : public Options +{ +public: + std::string engine; + int32_t device{defaultDevice}; + int32_t DLACore{-1}; + int32_t batch{batchNotProvided}; + bool graph{false}; + float persistentCacheRatio{defaultPersistentCacheRatio}; + void parse(Arguments& arguments) override; + static void help(std::ostream& out); +}; + + Arguments argsToArgumentsMap(int32_t argc, char* argv[]); bool parseHelp(Arguments& arguments); @@ -272,8 +410,6 @@ void helpHelp(std::ostream& out); std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options); -std::ostream& operator<<(std::ostream& os, const UffInput& input); - std::ostream& operator<<(std::ostream& os, const IOFormat& format); std::ostream& operator<<(std::ostream& os, const ShapeRange& dims); @@ -292,6 +428,10 @@ std::ostream& operator<<(std::ostream& os, const AllOptions& options); std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options); +std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype); + +std::ostream& operator<<(std::ostream& os, nvinfer1::DeviceType devType); + inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) { for (int32_t i = 0; i < dims.nbDims; ++i) @@ -329,13 +469,11 @@ inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole ro os << "Constant"; break; } -#if (NV_TENSORRT_MAJOR > 7) case nvinfer1::WeightsRole::kANY: { os << "Any"; break; } -#endif } return os; diff --git a/src/Detector/tensorrt_yolo/common/sampleReporting.cpp b/src/Detector/tensorrt_yolo/common/sampleReporting.cpp index a92938c5..e9dda6e0 100644 --- a/src/Detector/tensorrt_yolo/common/sampleReporting.cpp +++ b/src/Detector/tensorrt_yolo/common/sampleReporting.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -26,6 +27,8 @@ #include "sampleOptions.h" #include "sampleReporting.h" +using namespace nvinfer1; + namespace sample { @@ -45,7 +48,7 @@ float findPercentile(float percentile, std::vector<InferenceTime> const& timings { return std::numeric_limits<float>::infinity(); } - if (percentile < 0.0f || percentile > 100.0f) + if (percentile < 0.F || percentile > 100.F) { throw std::runtime_error("percentile is not in [0, 100]!"); } @@ -99,8 +102,26 @@ float findCoeffOfVariance(std::vector<InferenceTime> const& timings, T const& to inline InferenceTime traceToTiming(const InferenceTrace& a) { - return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), - (a.d2hEnd - a.d2hStart), (a.d2hEnd - a.h2dStart)); + return InferenceTime( + (a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), (a.d2hEnd - a.d2hStart)); +} + +inline std::string dimsToString(Dims const& shape) +{ + std::stringstream ss; + + if (shape.nbDims == 0) + { + ss << "scalar"; + } + else + { + for (int32_t i = 0; i < shape.nbDims; i++) + { + ss << shape.d[i] << (i != shape.nbDims - 1 ? "x" : ""); + } + } + return ss.str(); } } // namespace @@ -113,29 +134,40 @@ void printProlog(int32_t warmups, int32_t timings, float warmupMs, float benchTi void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os) { - int32_t count = 0; + int64_t count = 0; InferenceTime sum; os << std::endl; os << "=== Trace details ===" << std::endl; os << "Trace averages of " << runsPerAvg << " runs:" << std::endl; - for (auto const& t : timings) + + // Show only the first N lines and the last N lines, where N = kTIMING_PRINT_THRESHOLD. + constexpr int64_t kTIMING_PRINT_THRESHOLD{200}; + int64_t const maxNbTimings{kTIMING_PRINT_THRESHOLD * runsPerAvg}; + + for (int64_t idx = 0, size = timings.size(); idx < size; ++idx) { - sum += t; + // Omit some latency printing to avoid very long logs. + if (size > 2 * maxNbTimings && idx == maxNbTimings) + { + os << "... Omitting " << (size - 2 * maxNbTimings) << " lines" << std::endl; + idx = size - kTIMING_PRINT_THRESHOLD * runsPerAvg - 1; + } + + sum += timings[idx]; if (++count == runsPerAvg) { // clang-format off os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg - << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " << sum.e2e / runsPerAvg - << " ms, enqueue " << sum.enq / runsPerAvg << " ms)" << std::endl; + << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (enqueue " << sum.enq / runsPerAvg + << " ms)" << std::endl; // clang-format on count = 0; sum.enq = 0; sum.h2d = 0; sum.compute = 0; sum.d2h = 0; - sum.e2e = 0; } } } @@ -166,14 +198,10 @@ void printMetricExplanations(std::ostream& os) os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a " "single query." << std::endl; - os << "End-to-End Host Latency: the duration from when the H2D of a query is called to when the D2H of the same " "query is completed, which includes the latency to wait for the completion of the previous query. This is " "the latency of a query if multiple queries are enqueued consecutively." - << std::endl; }
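findPercentile is only touched here for its range check; its body sits earlier in the file and does not appear in this hunk. For orientation, a nearest-rank percentile over pre-sorted samples looks roughly like the sketch below. This is an assumption-laden illustration over plain floats, not the verbatim implementation, which additionally goes through a metric-getter functor:

```cpp
#include <algorithm>
#include <cmath>
#include <stdexcept>
#include <vector>

// Sketch: nearest-rank percentile of samples sorted ascending.
float percentileOfSorted(float percentile, std::vector<float> const& sorted)
{
    if (percentile < 0.F || percentile > 100.F)
    {
        throw std::runtime_error("percentile is not in [0, 100]!");
    }
    if (sorted.empty())
    {
        return 0.F; // nothing measured
    }
    // rank = ceil(P/100 * N), clamped to a valid zero-based index.
    int const rank = static_cast<int>(std::ceil(percentile / 100.F * static_cast<float>(sorted.size())));
    int const index = std::max(rank - 1, 0);
    return sorted.at(index);
}
```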
PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings, - std::function<float(InferenceTime const&)> metricGetter, float percentile) + std::function<float(InferenceTime const&)> metricGetter, std::vector<float> const& percentiles) { auto const metricComparator = [metricGetter](InferenceTime const& a, InferenceTime const& b) { return metricGetter(a) < metricGetter(b); }; @@ -183,40 +211,44 @@ PerformanceResult result; result.min = metricGetter(newTimings.front()); result.max = metricGetter(newTimings.back()); - result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, metricAccumulator) / newTimings.size(); + result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0F, metricAccumulator) / newTimings.size(); result.median = findMedian(newTimings, metricGetter); - result.percentile = findPercentile(percentile, newTimings, metricGetter); + for (auto percentile : percentiles) + { + result.percentiles.emplace_back(findPercentile(percentile, newTimings, metricGetter)); + } result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean); return result; } -void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize, - std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, std::vector<float> const& percentiles, + int32_t batchSize, int32_t infStreams, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) { float const throughput = batchSize * timings.size() / walltimeMs * 1000; auto const getLatency = [](InferenceTime const& t) { return t.latency(); }; - auto const latencyResult = getPerformanceResult(timings, getLatency, percentile); - - auto const getEndToEnd = [](InferenceTime const& t) { return t.e2e; }; - auto const e2eLatencyResult = getPerformanceResult(timings, getEndToEnd, percentile); + auto const latencyResult = getPerformanceResult(timings, getLatency, percentiles); auto const getEnqueue = [](InferenceTime const& t) { return t.enq; }; - auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentile); + auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentiles); auto const getH2d = [](InferenceTime const& t) { return t.h2d; }; - auto const h2dResult = getPerformanceResult(timings, getH2d, percentile); + auto const h2dResult = getPerformanceResult(timings, getH2d, percentiles); auto const getCompute = [](InferenceTime const& t) { return t.compute; }; - auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentile); + auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentiles); auto const getD2h = [](InferenceTime const& t) { return t.d2h; }; - auto const d2hResult = getPerformanceResult(timings, getD2h, percentile); + auto const d2hResult = getPerformanceResult(timings, getD2h, percentiles); - auto const toPerfString = [percentile](const PerformanceResult& r) { + auto const toPerfString = [&](const PerformanceResult& r) { std::stringstream s; s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean << " ms, " - << "median = " << r.median << " ms, percentile(" << percentile << "%) = " << r.percentile << " ms"; + << "median = " << r.median << " ms"; + for (int32_t i = 0, n = percentiles.size(); i < n; ++i) + { + s << ", percentile(" << percentiles[i] << "%) = " << r.percentiles[i] << " ms"; + } return s.str(); };
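getPerformanceResult also fills coeffVar via findCoeffOfVariance, whose body is likewise outside this hunk. The metric is the usual standard-deviation-to-mean ratio; a minimal standalone version over plain floats (a sketch, not the verbatim implementation):

```cpp
#include <cmath>
#include <numeric>
#include <vector>

// Sketch: coefficient of variation (stddev / mean), the quantity reported
// as the "coeffVar" field of PerformanceResult.
float coeffOfVariance(std::vector<float> const& v)
{
    if (v.empty())
    {
        return 0.F;
    }
    float const mean = std::accumulate(v.begin(), v.end(), 0.F) / v.size();
    float var = 0.F;
    for (float x : v)
    {
        var += (x - mean) * (x - mean); // population variance accumulator
    }
    var /= v.size();
    return mean != 0.F ? std::sqrt(var) / mean : 0.F;
}
```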
@@ -224,7 +256,6 @@ void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, fl osInfo << "=== Performance summary ===" << std::endl; osInfo << "Throughput: " << throughput << " qps" << std::endl; osInfo << "Latency: " << toPerfString(latencyResult) << std::endl; - osInfo << "End-to-End Host Latency: " << toPerfString(e2eLatencyResult) << std::endl; osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl; osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl; osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl; @@ -268,6 +299,13 @@ << "stability." << std::endl; } + // Report warnings if multiple inference streams are used. + if (infStreams > 1) + { + osWarning << "* Multiple inference streams are used. Latencies may not be accurate since inferences may run in " + << "parallel. Please use \"Throughput\" as the performance metric instead." << std::endl; + } + // Explain what the metrics mean. osInfo << "Explanations of the performance metrics are printed in the verbose logs." << std::endl; printMetricExplanations(osVerbose); @@ -275,27 +313,28 @@ osInfo << std::endl; } -void printPerformanceReport(std::vector<InferenceTrace> const& trace, const ReportingOptions& reporting, float warmupMs, - int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +void printPerformanceReport(std::vector<InferenceTrace> const& trace, ReportingOptions const& reportingOpts, + InferenceOptions const& infOpts, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) { + int32_t batchSize = infOpts.batch; + float const warmupMs = infOpts.warmup; auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; }; auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup); int32_t const warmups = noWarmup - trace.begin(); float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart; - // when implicit batch used, batchSize = options.inference.batch, which is parsed through --batch - // when explicit batch used, batchSize = options.inference.batch = 0 // treat inference with explicit batch as a single query and report the throughput batchSize = batchSize ? batchSize : 1; printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, warmupMs, benchTime, osInfo); std::vector<InferenceTime> timings(trace.size() - warmups); std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming); - printTiming(timings, reporting.avgs, osInfo); - printEpilog(timings, benchTime, reporting.percentile, batchSize, osInfo, osWarning, osVerbose); + printTiming(timings, reportingOpts.avgs, osInfo); + printEpilog( + timings, benchTime, reportingOpts.percentiles, batchSize, infOpts.infStreams, osInfo, osWarning, osVerbose); - if (!reporting.exportTimes.empty()) + if (!reportingOpts.exportTimes.empty()) { - exportJSONTrace(trace, reporting.exportTimes); + exportJSONTrace(trace, reportingOpts.exportTimes, warmups); } } @@ -303,15 +342,16 @@ //! [ value, ...] //! value ::= { "start enq" : time, "end enq" : time, "start h2d" : time, "end h2d" : time, "start compute" : time, //! "end compute" : time, "start d2h" : time, "end d2h" : time, "h2d" : time, "compute" : time, -//! "d2h" : time, "latency" : time, "end to end" : time } +//! "d2h" : time, "latency" : time } //!
-void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName) +void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName, int32_t const nbWarmups) { std::ofstream os(fileName, std::ofstream::trunc); os << "[" << std::endl; char const* sep = " "; - for (auto const& t : trace) + for (auto iter = trace.begin() + nbWarmups; iter < trace.end(); ++iter) { + auto const& t = *iter; InferenceTime const it(traceToTiming(t)); os << sep << "{ "; sep = ", "; @@ -321,8 +361,8 @@ << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep << "\"startD2hMs\" : " << t.d2hStart << sep << "\"endD2hMs\" : " << t.d2hEnd << sep << "\"h2dMs\" : " << it.h2d << sep << "\"computeMs\" : " << it.compute << sep - << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << sep - << "\"endToEndMs\" : " << it.e2e << " }" << std::endl; + << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << " }" + << std::endl; // clang-format on } os << "]" << std::endl; @@ -346,42 +386,49 @@ void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept } } - mIterator->timeMs += timeMs; + mIterator->timeMs.push_back(timeMs); ++mIterator; } void Profiler::print(std::ostream& os) const noexcept { - std::string const nameHdr("Layer"); - std::string const timeHdr(" Time (ms)"); - std::string const avgHdr(" Avg. Time (ms)"); - std::string const percentageHdr(" Time %"); + std::string const nameHdr(" Layer"); + std::string const timeHdr(" Time(ms)"); + std::string const avgHdr(" Avg.(ms)"); + std::string const medHdr(" Median(ms)"); + std::string const percentageHdr(" Time(%)"); float const totalTimeMs = getTotalTime(); - auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { return a.name.size() < b.name.size(); }; - auto const longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer); - auto const nameLength = std::max(longestName->name.size() + 1, nameHdr.size()); auto const timeLength = timeHdr.size(); auto const avgLength = avgHdr.size(); + auto const medLength = medHdr.size(); auto const percentageLength = percentageHdr.size(); os << std::endl << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl - << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr << std::endl; + << timeHdr << avgHdr << medHdr << percentageHdr << nameHdr << std::endl; for (auto const& p : mLayers) { + if (p.timeMs.empty() || getTotalTime(p) == 0.F) + { + // There is no point in printing profiling data for a layer that didn't run at all. + continue; + } // clang-format off - os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs - << std::setw(avgLength) << std::fixed << std::setprecision(4) << p.timeMs / mUpdatesCount - << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100 - << std::endl; + os << std::setw(timeLength) << std::fixed << std::setprecision(2) << getTotalTime(p) + << std::setw(avgLength) << std::fixed << std::setprecision(4) << getAvgTime(p) + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime(p) + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << getTotalTime(p) / totalTimeMs * 100 + << " " << p.name << std::endl; } { - os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2) + os << std::setw(timeLength) <<
std::fixed << std::setprecision(2) << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount - << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl; + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime() + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 + << " Total" << std::endl; // clang-format on } os << std::endl; @@ -397,10 +444,11 @@ void Profiler::exportJSONProfile(std::string const& fileName) const noexcept for (auto const& l : mLayers) { // clang-format off - os << ", {" << " \"name\" : \"" << l.name << "\"" - ", \"timeMs\" : " << l.timeMs - << ", \"averageMs\" : " << l.timeMs / mUpdatesCount - << ", \"percentage\" : " << l.timeMs / totalTimeMs * 100 + os << ", {" << R"( "name" : ")" << l.name << R"(")" + R"(, "timeMs" : )" << getTotalTime(l) + << R"(, "averageMs" : )" << getAvgTime(l) + << R"(, "medianMs" : )" << getMedianTime(l) + << R"(, "percentage" : )" << getTotalTime(l) / totalTimeMs * 100 << " }" << std::endl; // clang-format on } @@ -415,8 +463,13 @@ void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bind void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) { - os << "Output Tensors:" << std::endl; - bindings.dumpOutputs(context, os); + auto isOutput = [](Binding const& b) { return !b.isInput; }; + bindings.dumpBindings(context, isOutput, os); +} + +void dumpRawBindingsToFiles(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) +{ + bindings.dumpRawBindingToFiles(context, os); } void exportJSONOutput( @@ -429,10 +482,10 @@ void exportJSONOutput( for (auto const& binding : output) { // clang-format off - os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl; + os << sep << R"({ "name" : ")" << binding.first << "\"" << std::endl; sep = ", "; - os << " " << sep << "\"dimensions\" : \""; - bindings.dumpBindingDimensions(binding.second, context, os); + os << " " << sep << R"("dimensions" : ")"; + bindings.dumpBindingDimensions(binding.first, context, os); os << "\"" << std::endl; os << " " << sep << "\"values\" : [ "; bindings.dumpBindingValues(context, binding.second, os, sep, batch); @@ -442,4 +495,115 @@ void exportJSONOutput( os << "]" << std::endl; } +void exportJSONOutput( + nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch); + +void printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context) +{ + if (reporting.layerInfo) + { + sample::gLogInfo << "Layer Information:" << std::endl; + sample::gLogInfo << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kONELINE) + << std::flush; + } + if (!reporting.exportLayerInfo.empty()) + { + std::ofstream os(reporting.exportLayerInfo, std::ofstream::trunc); + os << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kJSON) << std::flush; + } +} + +void printOptimizationProfileInfo(ReportingOptions const& reporting, nvinfer1::ICudaEngine const* engine) +{ + if (reporting.optProfileInfo) + { + sample::gLogInfo << "Optimization Profile Information:" << std::endl; + for (int32_t i = 0; i < engine->getNbOptimizationProfiles(); i++) + { + for (int32_t j = 0, e = engine->getNbIOTensors(); j < e; j++) + { + auto const tensorName = engine->getIOTensorName(j); + + if (engine->getTensorIOMode(tensorName) == 
nvinfer1::TensorIOMode::kINPUT) + { + auto tensorMinShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kMIN); + auto tensorOptShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kOPT); + auto tensorMaxShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kMAX); + + sample::gLogInfo << "Model input " << tensorName << " (profile " << i << "): " + << "min=" << dimsToString(tensorMinShape) + << ", opt=" << dimsToString(tensorOptShape) + << ", max=" << dimsToString(tensorMaxShape) << std::endl; + } + } + } + } +} + +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv) +{ + if (reporting.profile) + { + iEnv.profiler->print(sample::gLogInfo); + } + if (!reporting.exportProfile.empty()) + { + iEnv.profiler->exportJSONProfile(reporting.exportProfile); + } + + // Print a warning about total per-layer latency when auxiliary streams are used. + if (!iEnv.safe && (reporting.profile || !reporting.exportProfile.empty())) + { + int32_t const nbAuxStreams = iEnv.engine.get()->getNbAuxStreams(); + if (nbAuxStreams > 0) + { + sample::gLogWarning << "The engine uses " << nbAuxStreams << " auxiliary streams, so the \"Total\" latency " + << "may not be accurate because some layers may have run in parallel!" << std::endl; + } + } +} + +namespace details +{ +void dump(std::unique_ptr<nvinfer1::IExecutionContext> const& context, std::unique_ptr<Bindings> const& binding, + ReportingOptions const& reporting, int32_t batch) +{ + if (!context) + { + sample::gLogError << "Empty context! Skip printing outputs." << std::endl; + return; + } + if (reporting.output) + { + dumpOutputs(*context, *binding, sample::gLogInfo); + } + if (reporting.dumpRawBindings) + { + dumpRawBindingsToFiles(*context, *binding, sample::gLogInfo); + } + if (!reporting.exportOutput.empty()) + { + exportJSONOutput(*context, *binding, reporting.exportOutput, batch); + } +} +} // namespace details + +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch) +{ + auto const& binding = iEnv.bindings.at(0); + if (!binding) + { + sample::gLogError << "Empty bindings! Skip printing outputs." << std::endl; + return; + } + if (iEnv.safe) + { + sample::gLogError << "Safe inference is not supported!" << std::endl; + return; + } + auto const& context = iEnv.contexts.at(0); + details::dump(context, binding, reporting, batch); +} + } // namespace sample
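The Profiler extended in the header below implements nvinfer1::IProfiler, and TensorRT invokes reportLayerTime() once per layer per enqueue while a profiler is attached. Wiring any such profiler into an execution context follows the standard TensorRT pattern; engine and context creation are omitted here, so this is a sketch rather than code from the patch:

```cpp
#include <iostream>
#include "NvInfer.h"

// Minimal IProfiler: TensorRT calls reportLayerTime() with per-layer timings
// while this profiler is attached to an execution context.
struct StdoutProfiler : nvinfer1::IProfiler
{
    void reportLayerTime(char const* layerName, float ms) noexcept override
    {
        std::cout << layerName << ": " << ms << " ms\n";
    }
};

// Usage, assuming a valid context from an already-built engine:
//   StdoutProfiler profiler;
//   context->setProfiler(&profiler);
//   context->enqueueV3(stream); // timings are reported after synchronization
```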
diff --git a/src/Detector/tensorrt_yolo/common/sampleReporting.h b/src/Detector/tensorrt_yolo/common/sampleReporting.h index 5f730987..922ef3c8 100644 --- a/src/Detector/tensorrt_yolo/common/sampleReporting.h +++ b/src/Detector/tensorrt_yolo/common/sampleReporting.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -19,27 +20,26 @@ #include #include - -#include "NvInfer.h" +#include <NvInfer.h> #include "sampleOptions.h" -#include "sampleUtils.h" namespace sample { +class Bindings; + //! //! \struct InferenceTime //! \brief Measurement times in milliseconds //! struct InferenceTime { - InferenceTime(float q, float i, float c, float o, float e) + InferenceTime(float q, float i, float c, float o) : enq(q) , h2d(i) , compute(c) , d2h(o) - , e2e(e) { } @@ -54,7 +54,6 @@ float h2d{0}; // Host to Device float compute{0}; // Compute float d2h{0}; // Device to Host - float e2e{0}; // end to end // ideal latency float latency() const @@ -102,7 +101,7 @@ struct InferenceTrace inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b) { - return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h, a.e2e + b.e2e); + return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h); } inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b)
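The operator+ and operator+= overloads above make per-iteration timings additive, which is what lets printTiming() in the .cpp accumulate a window of runs and divide once. A toy, self-contained version of that accumulate-then-average pattern, using a stand-in struct rather than the real InferenceTime:

```cpp
#include <iostream>
#include <vector>

// Toy stand-in for InferenceTime: per-iteration timings that add component-wise.
struct T { float enq{0}, h2d{0}, compute{0}, d2h{0}; };
T operator+(T a, T b) { return {a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h}; }

int main()
{
    std::vector<T> runs{{0.1F, 0.2F, 1.5F, 0.2F}, {0.1F, 0.2F, 1.7F, 0.2F}};
    T sum;
    for (auto const& r : runs)
    {
        sum = sum + r; // component-wise accumulation over the window
    }
    // Host latency = h2d + compute + d2h, averaged over the window.
    float const latency = (sum.h2d + sum.compute + sum.d2h) / runs.size();
    std::cout << "avg latency: " << latency << " ms\n";
    return 0;
}
```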
@@ -208,8 +210,58 @@ class Profiler : public nvinfer1::IProfiler private: float getTotalTime() const noexcept { - auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs; }; - return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime); + auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { + return accumulator + std::accumulate(lp.timeMs.begin(), lp.timeMs.end(), 0.F, std::plus()); + }; + return std::accumulate(mLayers.begin(), mLayers.end(), 0.0F, plusLayerTime); + } + + float getMedianTime() const noexcept + { + if (mLayers.empty()) + { + return 0.F; + } + std::vector totalTime; + for (size_t run = 0; run < mLayers[0].timeMs.size(); ++run) + { + auto const layerTime + = [&run](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs[run]; }; + auto t = std::accumulate(mLayers.begin(), mLayers.end(), 0.F, layerTime); + totalTime.push_back(t); + } + return median(totalTime); + } + + float getMedianTime(LayerProfile const& p) const noexcept + { + return median(p.timeMs); + } + + static float median(std::vector vals) + { + if (vals.empty()) + { + return 0.F; + } + std::sort(vals.begin(), vals.end()); + if (vals.size() % 2U == 1U) + { + return vals[vals.size() / 2U]; + } + return (vals[vals.size() / 2U - 1U] + vals[vals.size() / 2U]) * 0.5F; + } + + //! return the total runtime of given layer profile + float getTotalTime(LayerProfile const& p) const noexcept + { + auto const& vals = p.timeMs; + return std::accumulate(vals.begin(), vals.end(), 0.F, std::plus()); + } + + float getAvgTime(LayerProfile const& p) const noexcept + { + return getTotalTime(p) / p.timeMs.size(); } std::vector mLayers; @@ -217,6 +269,30 @@ class Profiler : public nvinfer1::IProfiler int32_t mUpdatesCount{0}; }; +//! +//! \brief Print layer info to logger or export it to output JSON file. +//! +void printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context); + +//! +//! \brief Print optimization profile info to logger. +//! +void printOptimizationProfileInfo(ReportingOptions const& reporting, nvinfer1::ICudaEngine const* engine); + +//! Forward declaration. +struct InferenceEnvironment; + +//! +//! \brief Print per-layer perf profile data to logger or export it to output JSON file. +//! +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv); + +//! +//! \brief Print binding output values to logger or export them to output JSON file. +//! +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch); + } // namespace sample #endif // TRT_SAMPLE_REPORTING_H diff --git a/src/Detector/tensorrt_yolo/common/sampleUtils.cpp b/src/Detector/tensorrt_yolo/common/sampleUtils.cpp new file mode 100644 index 00000000..689e5857 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleUtils.cpp @@ -0,0 +1,587 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sampleUtils.h" +#include "bfloat16.h" +#include "half.h" + +using namespace nvinfer1; + +namespace sample +{ + +size_t dataTypeSize(nvinfer1::DataType dataType) +{ + switch (dataType) + { + case nvinfer1::DataType::kINT64: return 8U; + case nvinfer1::DataType::kINT32: + case nvinfer1::DataType::kFLOAT: return 4U; + case nvinfer1::DataType::kBF16: + case nvinfer1::DataType::kHALF: return 2U; + case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1U; + case nvinfer1::DataType::kINT4: + ASSERT(false && "Element size is not implemented for sub-byte data-types."); + } + return 0; +} + +int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch) +{ + int64_t maxNbElems = 1; + for (int32_t i = 0; i < dims.nbDims; ++i) + { + // Get effective length of axis. + int64_t d = dims.d[i]; + // Any dimension is 0, it is an empty tensor. + if (d == 0) + { + return 0; + } + if (i == vecDim) + { + d = samplesCommon::divUp(d, comps); + } + maxNbElems = std::max(maxNbElems, d * strides.d[i]); + } + return maxNbElems * batch * (vecDim < 0 ? 1 : comps); +} + +nvinfer1::Dims toDims(std::vector const& vec) +{ + int32_t limit = static_cast(nvinfer1::Dims::MAX_DIMS); + if (static_cast(vec.size()) > limit) + { + sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl; + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +void loadFromFile(std::string const& fileName, char* dst, size_t size) +{ + ASSERT(dst); + + std::ifstream file(fileName, std::ios::in | std::ios::binary); + if (file.is_open()) + { + file.seekg(0, std::ios::end); + int64_t fileSize = static_cast(file.tellg()); + // Due to change from int32_t to int64_t VC engines created with earlier versions + // may expect input of the half of the size + if (fileSize != static_cast(size) && fileSize != static_cast(size * 2)) + { + std::ostringstream msg; + msg << "Unexpected file size for input file: " << fileName << ". Note: Input binding size is: " << size + << " bytes but the file size is " << fileSize + << " bytes. Double check the size and datatype of the provided data."; + throw std::invalid_argument(msg.str()); + } + // Move file pointer back to the beginning after reading file size. + file.seekg(0, std::ios::beg); + file.read(dst, size); + size_t const nbBytesRead = file.gcount(); + file.close(); + if (nbBytesRead != size) + { + std::ostringstream msg; + msg << "Unexpected file size for input file: " << fileName << ". 
Note: Expected: " << size + << " bytes but only read: " << nbBytesRead << " bytes"; + throw std::invalid_argument(msg.str()); + } + } + else + { + std::ostringstream msg; + msg << "Cannot open file " << fileName << "!"; + throw std::invalid_argument(msg.str()); + } +} + +std::vector splitToStringVec(std::string const& s, char separator, int64_t maxSplit) +{ + std::vector splitted; + + for (size_t start = 0; start < s.length();) + { + // If maxSplit is specified and we have reached maxSplit, emplace back the rest of the string and break the + // loop. + if (maxSplit >= 0 && static_cast(splitted.size()) == maxSplit) + { + splitted.emplace_back(s.substr(start, s.length() - start)); + break; + } + + size_t separatorIndex = s.find(separator, start); + if (separatorIndex == std::string::npos) + { + separatorIndex = s.length(); + } + splitted.emplace_back(s.substr(start, separatorIndex - start)); + + // If the separator is the last character, then we should push an empty string at the end. + if (separatorIndex == s.length() - 1) + { + splitted.emplace_back(""); + } + + start = separatorIndex + 1; + } + + return splitted; +} + +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput /*= true*/) +{ + bool broadcast = formats.size() == 1; + bool validFormatsCount = broadcast || (formats.size() == nbBindings); + if (!formats.empty() && !validFormatsCount) + { + if (isInput) + { + throw std::invalid_argument( + "The number of inputIOFormats must match network's inputs or be one for broadcasting."); + } + + throw std::invalid_argument( + "The number of outputIOFormats must match network's outputs or be one for broadcasting."); + } + return broadcast; +} + +void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + using TensorToLayer = std::unordered_map; + using LayerToTensor = std::unordered_map; + + // 1. Collect layers and tensors information from the network. + TensorToLayer matmulI2L; + TensorToLayer constO2L; + TensorToLayer shuffleI2L; + LayerToTensor shuffleL2O; + auto collectMappingInfo = [&](int32_t const idx) + { + ILayer* l = network.getLayer(idx); + switch (l->getType()) + { + case nvinfer1::LayerType::kMATRIX_MULTIPLY: + { + // assume weights on the second input. + matmulI2L.insert({l->getInput(1), l}); + break; + } + case nvinfer1::LayerType::kCONSTANT: + { + DataType const dtype = static_cast(l)->getWeights().type; + if (dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF) + { + // Sparsify float only. + constO2L.insert({l->getOutput(0), l}); + } + break; + } + case nvinfer1::LayerType::kSHUFFLE: + { + shuffleI2L.insert({l->getInput(0), l}); + shuffleL2O.insert({l, l->getOutput(0)}); + break; + } + default: break; + } + }; + int32_t const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; ++i) + { + collectMappingInfo(i); + } + if (matmulI2L.size() == 0 || constO2L.size() == 0) + { + // No MatrixMultiply or Constant layer found, no weights to sparsify. 
+ return; + } + + // Helper for analysis + auto isTranspose + = [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); }; + auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; }; + auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool + { + for (int32_t i = 0; i < dims.nbDims; ++i) + { + if (dims.d[i] != i || dims.d[i] != -1) + { + return false; + } + } + return true; + }; + auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) -> ITensor* + { + while (shuffleI2L.find(t) != shuffleI2L.end()) + { + nvinfer1::IShuffleLayer* s = static_cast(shuffleI2L.at(t)); + if (!is2D(s->getInput(0)->getDimensions()) || !is2D(s->getReshapeDimensions()) + || !isIdenticalReshape(s->getReshapeDimensions())) + { + break; + } + + if (isTranspose(s->getFirstTranspose())) + { + needTranspose = !needTranspose; + } + if (isTranspose(s->getSecondTranspose())) + { + needTranspose = !needTranspose; + } + + t = shuffleL2O.at(s); + } + return t; + }; + + // 2. Forward analysis to collect the Constant layers connected to MatMul via Transpose + std::unordered_map constantLayerToSparse; + for (auto& o2l : constO2L) + { + // If need to transpose the weights of the Constant layer. + // Need to transpose by default due to semantic difference. + bool needTranspose{true}; + ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose); + if (matmulI2L.find(t) == matmulI2L.end()) + { + continue; + } + + // check MatMul params... + IMatrixMultiplyLayer* mm = static_cast(matmulI2L.at(t)); + bool const twoInputs = mm->getNbInputs() == 2; + bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions()); + bool const isSimple = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE + && mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR; + if (!(twoInputs && all2D && isSimple)) + { + continue; + } + if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE) + { + needTranspose = !needTranspose; + } + + constantLayerToSparse.insert({static_cast(o2l.second), needTranspose}); + } + + // 3. Finally, sparsify the weights + auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose) + { + Dims dims = layer->getOutput(0)->getDimensions(); + ASSERT(dims.nbDims == 2); + int32_t const idxN = needTranspose ? 1 : 0; + int32_t const n = dims.d[idxN]; + int32_t const k = dims.d[1 - idxN]; + sparseWeights.emplace_back(); + std::vector& spw = sparseWeights.back(); + Weights w = layer->getWeights(); + DataType const dtype = w.type; + ASSERT(dtype == nvinfer1::DataType::kFLOAT + || dtype == nvinfer1::DataType::kHALF); // non-float weights should have been ignored. 
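+        // sparsify() assumes one fixed 2-D layout, so weights that arrive in the
+        // opposite layout are transposed first, pruned, and transposed back below.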
+ + if (needTranspose) + { + if (dtype == nvinfer1::DataType::kFLOAT) + { + spw.resize(w.count * sizeof(float)); + transpose2DWeights(spw.data(), w.values, k, n); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + spw.resize(w.count * sizeof(half_float::half)); + transpose2DWeights(spw.data(), w.values, k, n); + } + + w.values = spw.data(); + std::vector tmpW; + sparsify(w, n, 1, tmpW); + + if (dtype == nvinfer1::DataType::kFLOAT) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + } + else + { + sparsify(w, n, 1, spw); + } + + w.values = spw.data(); + layer->setWeights(w); + }; + for (auto& l : constantLayerToSparse) + { + sparsifyConstantWeights(l.first, l.second); + } +} + +template +void setSparseWeights(L& l, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + auto weights = l.getKernelWeights(); + sparsify(weights, k, trs, sparseWeights); + weights.values = sparseWeights.data(); + l.setKernelWeights(weights); +} + +// Explicit instantiation +template void setSparseWeights( + IConvolutionLayer& l, int32_t k, int32_t trs, std::vector& sparseWeights); + +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + for (int32_t l = 0; l < network.getNbLayers(); ++l) + { + auto* layer = network.getLayer(l); + auto const t = layer->getType(); + if (t == nvinfer1::LayerType::kCONVOLUTION) + { + auto& conv = *static_cast(layer); + auto const& dims = conv.getKernelSizeNd(); + ASSERT(dims.nbDims == 2 || dims.nbDims == 3); + auto const k = conv.getNbOutputMaps(); + auto const trs = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); + sparseWeights.emplace_back(); + setSparseWeights(conv, k, trs, sparseWeights.back()); + } + } + + sparsifyMatMulKernelWeights(network, sparseWeights); + sample::gLogVerbose << "--sparsity=force pruned " << sparseWeights.size() << " weights to be sparsity pattern." << std::endl; + sample::gLogVerbose << "--sparsity=force has been deprecated. 
Please use <polygraphy surgeon prune> to rewrite the weights to a sparsity pattern and then run with --sparsity=enable" << std::endl;
+}
+
+void sparsify(Weights const& weights, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
+{
+    switch (weights.type)
+    {
+    case DataType::kFLOAT:
+        sparsify(static_cast<float const*>(weights.values), weights.count, k, trs, sparseWeights);
+        break;
+    case DataType::kHALF:
+        sparsify(static_cast<half_float::half const*>(weights.values), weights.count, k, trs, sparseWeights);
+        break;
+    case DataType::kBF16:
+        sparsify(static_cast<BFloat16 const*>(weights.values), weights.count, k, trs, sparseWeights);
+        break;
+    case DataType::kINT8:
+    case DataType::kINT32:
+    case DataType::kUINT8:
+    case DataType::kBOOL:
+    case DataType::kINT4:
+    case DataType::kFP8:
+    case DataType::kINT64:
+        ASSERT(false && "Unsupported data type");
+    }
+}
+
+template <typename T>
+void print(std::ostream& os, T v)
+{
+    os << v;
+}
+
+void print(std::ostream& os, int8_t v)
+{
+    os << static_cast<int32_t>(v);
+}
+
+void print(std::ostream& os, __half v)
+{
+    os << static_cast<float>(v);
+}
+
+template <typename T>
+void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv)
+{
+    auto const vol = volume(dims);
+    T const* typedBuffer = static_cast<T const*>(buffer);
+    std::string sep;
+    for (int64_t v = 0; v < vol; ++v)
+    {
+        int64_t curV = v;
+        int32_t dataOffset = 0;
+        for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex)
+        {
+            int32_t dimVal = curV % dims.d[dimIndex];
+            if (dimIndex == vectorDim)
+            {
+                dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv;
+            }
+            else
+            {
+                dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv);
+            }
+            curV /= dims.d[dimIndex];
+            ASSERT(curV >= 0);
+        }
+
+        os << sep;
+        sep = separator;
+        print(os, typedBuffer[dataOffset]);
+    }
+}
+
+// Explicit instantiation
+template void dumpBuffer<bool>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<int32_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<int8_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<float>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<__half>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<BFloat16>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<uint8_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+template void dumpBuffer<int64_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
+    Dims const& strides, int32_t vectorDim, int32_t spv);
+
+template <typename T>
+void sparsify(T const* values, int64_t count, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
+{
+    auto const c = count / (k * trs);
+    sparseWeights.resize(count * sizeof(T));
+    auto* sparseValues = reinterpret_cast<T*>(sparseWeights.data());
+
+    constexpr int32_t window = 4;
+    constexpr int32_t nonzeros = 2;
+
+    int32_t const crs = c * trs;
+    auto
const getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * trs + rsi; }; + + for (int64_t ki = 0; ki < k; ++ki) + { + for (int64_t rsi = 0; rsi < trs; ++rsi) + { + int32_t w = 0; + int32_t nz = 0; + for (int64_t ci = 0; ci < c; ++ci) + { + auto const index = getIndex(ki, ci, rsi); + if (nz < nonzeros) + { + sparseValues[index] = values[index]; + ++nz; + } + else + { + sparseValues[index] = 0; + } + if (++w == window) + { + w = 0; + nz = 0; + } + } + } + } +} + +// Explicit instantiation +template void sparsify( + float const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); +template void sparsify( + half_float::half const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); + +template +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n) +{ + ASSERT(dst != src); + T* tdst = reinterpret_cast(dst); + T const* tsrc = reinterpret_cast(src); + for (int32_t mi = 0; mi < m; ++mi) + { + for (int32_t ni = 0; ni < n; ++ni) + { + int32_t const isrc = mi * n + ni; + int32_t const idst = ni * m + mi; + tdst[idst] = tsrc[isrc]; + } + } +} + +// Explicit instantiation +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); + +template ::value, bool>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_int_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +template ::value, int32_t>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_real_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +// Explicit instantiation +template void fillBuffer(void* buffer, int64_t volume, bool min, bool max); +template void fillBuffer(void* buffer, int64_t volume, float min, float max); +template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); +template void fillBuffer(void* buffer, int64_t volume, int64_t min, int64_t max); +template void fillBuffer(void* buffer, int64_t volume, int8_t min, int8_t max); +template void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max); +template void fillBuffer(void* buffer, int64_t volume, BFloat16 min, BFloat16 max); +template void fillBuffer(void* buffer, int64_t volume, uint8_t min, uint8_t max); + +bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target) +{ + auto const splitPattern = splitToStringVec(pattern, '*', 1); + + // If there is no wildcard, return if the two strings match exactly. + if (splitPattern.size() == 1) + { + return pattern == target; + } + + // Otherwise, target must follow prefix+anything+postfix pattern. 
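+    // e.g. pattern "fc1.*" splits into prefix "fc1." and an empty postfix, so any
+    // target beginning with "fc1." matches; "*bias" likewise matches any target
+    // ending in "bias".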
+ return target.size() >= (splitPattern[0].size() + splitPattern[1].size()) && target.find(splitPattern[0]) == 0 + && target.rfind(splitPattern[1]) == (target.size() - splitPattern[1].size()); +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleUtils.h b/src/Detector/tensorrt_yolo/common/sampleUtils.h index 1509a7fc..6cd4280b 100644 --- a/src/Detector/tensorrt_yolo/common/sampleUtils.h +++ b/src/Detector/tensorrt_yolo/common/sampleUtils.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -22,6 +23,7 @@ #include #include #include +#include #include #include @@ -32,24 +34,20 @@ #include "common.h" #include "logger.h" -#include "sampleDevice.h" -#include "sampleOptions.h" + +#define SMP_RETVAL_IF_FALSE(condition, msg, retval, err) \ + { \ + if ((condition) == false) \ + { \ + (err) << (msg) << std::endl; \ + return retval; \ + } \ + } namespace sample { -inline int dataTypeSize(nvinfer1::DataType dataType) -{ - switch (dataType) - { - case nvinfer1::DataType::kINT32: - case nvinfer1::DataType::kFLOAT: return 4; - case nvinfer1::DataType::kHALF: return 2; - case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: return 1; - } - return 0; -} +size_t dataTypeSize(nvinfer1::DataType dataType); template inline T roundUp(T m, T n) @@ -57,485 +55,71 @@ inline T roundUp(T m, T n) return ((m + n - 1) / n) * n; } -inline int volume(const nvinfer1::Dims& d) -{ - return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); -} - //! comps is the number of components in a vector. Ignored if vecDim < 0. -inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch) -{ - int maxNbElems = 1; - for (int i = 0; i < dims.nbDims; ++i) - { - // Get effective length of axis. - int d = dims.d[i]; - // Any dimension is 0, it is an empty tensor. - if (d == 0) - { - return 0; - } - if (i == vecDim) - { - d = samplesCommon::divUp(d, comps); - } - maxNbElems = std::max(maxNbElems, d * strides.d[i]); - } - return static_cast(maxNbElems) * batch * (vecDim < 0 ? 1 : comps); -} +int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch); -inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch) -{ - if (vecDim != -1) - { - dims.d[vecDim] = roundUp(dims.d[vecDim], comps); - } - return volume(dims) * std::max(batch, 1); -} +using samplesCommon::volume; -inline nvinfer1::Dims toDims(const std::vector& vec) -{ - int limit = static_cast(nvinfer1::Dims::MAX_DIMS); - if (static_cast(vec.size()) > limit) - { - sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." 
<< std::endl; - } - // Pick first nvinfer1::Dims::MAX_DIMS elements - nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; - std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); - return dims; -} +nvinfer1::Dims toDims(std::vector const& vec); -template -inline void fillBuffer(void* buffer, int64_t volume, T min, T max) -{ - T* typedBuffer = static_cast(buffer); - std::default_random_engine engine; - if (std::is_integral::value) - { - std::uniform_int_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); - } - else - { - std::uniform_real_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); - } -} +template ::value, bool>::type = true> +void fillBuffer(void* buffer, int64_t volume, T min, T max); -// Specialization needed for custom type __half -template -inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max) -{ - H* typedBuffer = static_cast(buffer); - std::default_random_engine engine; - std::uniform_real_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); -} -template <> -inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max) -{ - fillBufferHalf(buffer, volume, min, max); -} +template ::value, int32_t>::type = 0> +void fillBuffer(void* buffer, int64_t volume, T min, T max); template -inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const nvinfer1::Dims& dims, - const nvinfer1::Dims& strides, int32_t vectorDim, int32_t spv) -{ - const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); - const T* typedBuffer = static_cast(buffer); - std::string sep; - for (int64_t v = 0; v < volume; ++v) - { - int64_t curV = v; - int32_t dataOffset = 0; - for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) - { - int32_t dimVal = curV % dims.d[dimIndex]; - if (dimIndex == vectorDim) - { - dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; - } - else - { - dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 
1 : spv); - } - curV /= dims.d[dimIndex]; - ASSERT(curV >= 0); - } - - os << sep << typedBuffer[dataOffset]; - sep = separator; - } -} - -inline void loadFromFile(std::string const& fileName, char* dst, size_t size) -{ - ASSERT(dst); - - std::ifstream file(fileName, std::ios::in | std::ios::binary); - if (file.is_open()) - { - file.read(dst, size); - file.close(); - } - else - { - std::stringstream msg; - msg << "Cannot open file " << fileName << "!"; - throw std::invalid_argument(msg.str()); - } -} - -struct Binding -{ - bool isInput{false}; - std::unique_ptr buffer; - int64_t volume{0}; - nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; - - void fill(const std::string& fileName) - { - loadFromFile(fileName, static_cast(buffer->getHostBuffer()), buffer->getSize()); - } - - void fill() - { - switch (dataType) - { - case nvinfer1::DataType::kBOOL: - { - fillBuffer(buffer->getHostBuffer(), volume, 0, 1); - break; - } - case nvinfer1::DataType::kINT32: - { - fillBuffer(buffer->getHostBuffer(), volume, -128, 127); - break; - } - case nvinfer1::DataType::kINT8: - { - fillBuffer(buffer->getHostBuffer(), volume, -128, 127); - break; - } - case nvinfer1::DataType::kFLOAT: - { - fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); - break; - } - case nvinfer1::DataType::kHALF: - { - fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); - break; - } - } - } - - void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv, - const std::string separator = " ") const - { - switch (dataType) - { - case nvinfer1::DataType::kBOOL: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kINT32: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kINT8: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kFLOAT: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kHALF: - { - dumpBuffer<__half>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - } - } -}; - -class Bindings -{ -public: - Bindings() = delete; - explicit Bindings(bool useManaged) - : mUseManaged(useManaged) - { - } - - void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType, - const std::string& fileName = "") - { - while (mBindings.size() <= static_cast(b)) - { - mBindings.emplace_back(); - mDevicePointers.emplace_back(); - } - mNames[name] = b; - if (mBindings[b].buffer == nullptr) - { - if (mUseManaged) - mBindings[b].buffer.reset(new UnifiedMirroredBuffer); - else - mBindings[b].buffer.reset(new DiscreteMirroredBuffer); - } - mBindings[b].isInput = isInput; - // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr - // even for empty tensors, so allocate a dummy byte. 
- if (volume == 0) - mBindings[b].buffer->allocate(1); - else - mBindings[b].buffer->allocate(static_cast(volume) * static_cast(dataTypeSize(dataType))); - - mBindings[b].volume = volume; - mBindings[b].dataType = dataType; - mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer(); - if (isInput) - { - if (fileName.empty()) - fill(b); - else - fill(b, fileName); - } - } - - void** getDeviceBuffers() - { - return mDevicePointers.data(); - } - - void transferInputToDevice(TrtCudaStream& stream) - { - for (auto& b : mNames) - { - if (mBindings[b.second].isInput) - mBindings[b.second].buffer->hostToDevice(stream); - } - } - - void transferOutputToHost(TrtCudaStream& stream) - { - for (auto& b : mNames) - { - if (!mBindings[b.second].isInput) - mBindings[b.second].buffer->deviceToHost(stream); - } - } - - void fill(int binding, const std::string& fileName) - { - mBindings[binding].fill(fileName); - } - - void fill(int binding) - { - mBindings[binding].fill(); - } +void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, nvinfer1::Dims const& dims, + nvinfer1::Dims const& strides, int32_t vectorDim, int32_t spv); - void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - const auto dims = context.getBindingDimensions(binding); - // Do not add a newline terminator, because the caller may be outputting a JSON string. - os << dims; - } - - void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os, - const std::string& separator = " ", int32_t batch = 1) const - { - nvinfer1::Dims dims = context.getBindingDimensions(binding); - nvinfer1::Dims strides = context.getStrides(binding); - int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding); - const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding); +void loadFromFile(std::string const& fileName, char* dst, size_t size); - if (context.getEngine().hasImplicitBatchDimension()) - { - auto insertN = [](nvinfer1::Dims& d, int32_t bs) { - const int32_t nbDims = d.nbDims; - ASSERT(nbDims < nvinfer1::Dims::MAX_DIMS); - std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]); - d.d[0] = bs; - d.nbDims = nbDims + 1; - }; - int32_t batchStride = 0; - for (int32_t i = 0; i < strides.nbDims; ++i) - { - if (strides.d[i] * dims.d[i] > batchStride) - { - batchStride = strides.d[i] * dims.d[i]; - } - } - insertN(dims, batch); - insertN(strides, batchStride); - vectorDim = (vectorDim == -1) ? 
-1 : vectorDim + 1; - } - - mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); - } - - void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto isInput = [](const Binding& b) { return b.isInput; }; - dumpBindings(context, isInput, os); - } +std::vector splitToStringVec(std::string const& option, char separator, int64_t maxSplit = -1); - void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto isOutput = [](const Binding& b) { return !b.isInput; }; - dumpBindings(context, isOutput, os); - } +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput = true); - void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto all = [](const Binding& /*b*/) { return true; }; - dumpBindings(context, all, os); - } +int32_t getCudaDriverVersion(); - void dumpBindings( - const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const - { - for (const auto& n : mNames) - { - const auto binding = n.second; - if (predicate(mBindings[binding])) - { - os << n.first << ": ("; - dumpBindingDimensions(binding, context, os); - os << ")" << std::endl; +int32_t getCudaRuntimeVersion(); - dumpBindingValues(context, binding, os); - os << std::endl; - } - } - } +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); +void sparsify(nvinfer1::Weights const& weights, int32_t k, int32_t rs, std::vector& sparseWeights); - std::unordered_map getInputBindings() const - { - auto isInput = [](const Binding& b) { return b.isInput; }; - return getBindings(isInput); - } - - std::unordered_map getOutputBindings() const - { - auto isOutput = [](const Binding& b) { return !b.isInput; }; - return getBindings(isOutput); - } - - std::unordered_map getBindings() const - { - auto all = [](const Binding& /*b*/) { return true; }; - return getBindings(all); - } +// Walk the weights elements and overwrite (at most) 2 out of 4 elements to 0. +template +void sparsify(T const* values, int64_t count, int32_t k, int32_t rs, std::vector& sparseWeights); - std::unordered_map getBindings(bool (*predicate)(const Binding& b)) const - { - std::unordered_map bindings; - for (const auto& n : mNames) - { - const auto binding = n.second; - if (predicate(mBindings[binding])) - bindings.insert(n); - } - return bindings; - } +template +void setSparseWeights(L& l, int32_t k, int32_t rs, std::vector& sparseWeights); -private: - std::unordered_map mNames; - std::vector mBindings; - std::vector mDevicePointers; - bool mUseManaged{false}; -}; +// Sparsify the weights of Constant layers that are fed to MatMul via Shuffle layers. +// Forward analysis on the API graph to determine which weights to sparsify. +void sparsifyMatMulKernelWeights( + nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); template -struct TrtDestroyer -{ - void operator()(T* t) - { - //t->destroy(); - delete t; - } -}; +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); -template -using TrtUniquePtr = std::unique_ptr>; +//! A helper function to match a target string with a pattern where the pattern can contain up to one wildcard ('*') +//! character that matches to any strings. 
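+//! e.g. pattern "attn_*_weight" matches "attn_layer3_weight" but not "attn_weight",
+//! because the prefix and postfix must both fit without overlapping.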
+bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target); -inline bool broadcastIOFormats(const std::vector& formats, size_t nbBindings, bool isInput = true) -{ - bool broadcast = formats.size() == 1; - bool validFormatsCount = broadcast || (formats.size() == nbBindings); - if (!formats.empty() && !validFormatsCount) - { - if (isInput) - { - throw std::invalid_argument( - "The number of inputIOFormats must match network's inputs or be one for broadcasting."); - } - else - { - throw std::invalid_argument( - "The number of outputIOFormats must match network's outputs or be one for broadcasting."); - } - } - return broadcast; -} - -inline std::vector loadTimingCacheFile(const std::string inFileName) -{ - std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); - if (!iFile) - { - sample::gLogWarning << "Could not read timing cache from: " << inFileName - << ". A new timing cache will be generated and written." << std::endl; - return std::vector(); - } - iFile.seekg(0, std::ifstream::end); - size_t fsize = iFile.tellg(); - iFile.seekg(0, std::ifstream::beg); - std::vector content(fsize); - iFile.read(content.data(), fsize); - iFile.close(); - sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl; - return content; -} - -inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob) +//! A helper method to find an item from an unordered_map. If the exact match exists, this is identical to +//! map.find(target). If the exact match does not exist, it returns the first plausible match, taking up to one wildcard +//! into account. If there is no plausible match, then it returns map.end(). +template +typename std::unordered_map::const_iterator findPlausible( + std::unordered_map const& map, std::string const& target) { - std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); - if (!oFile) + auto res = map.find(target); + if (res == map.end()) { - sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl; - return; + res = std::find_if( + map.begin(), map.end(), [&](typename std::unordered_map::value_type const& item) { + return matchStringWithOneWildcard(item.first, target); + }); } - oFile.write((char*) blob->data(), blob->size()); - oFile.close(); - sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl; -} - -inline int32_t getCudaDriverVersion() -{ - int32_t version{-1}; - cudaCheck(cudaDriverGetVersion(&version)); - return version; -} - -inline int32_t getCudaRuntimeVersion() -{ - int32_t version{-1}; - cudaCheck(cudaRuntimeGetVersion(&version)); - return version; + return res; } } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/streamReader.h b/src/Detector/tensorrt_yolo/common/streamReader.h new file mode 100644 index 00000000..7d4aa1c6 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/streamReader.h @@ -0,0 +1,78 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STREAM_READER_H +#define STREAM_READER_H + +#include "NvInferRuntime.h" +#include "sampleUtils.h" +#include + +namespace samplesCommon +{ + +//! Implements the TensorRT IStreamReader to allow deserializing an engine directly from the plan file. +class FileStreamReader final : public nvinfer1::IStreamReader +{ +public: + bool open(std::string filepath) + { + mFile.open(filepath, std::ios::binary); + return mFile.is_open(); + } + + void close() + { + if (mFile.is_open()) + { + mFile.close(); + } + } + + ~FileStreamReader() final + { + close(); + } + + int64_t read(void* dest, int64_t bytes) final + { + if (!mFile.good()) + { + return -1; + } + mFile.read(static_cast(dest), bytes); + return mFile.gcount(); + } + + void reset() + { + assert(mFile.good()); + mFile.seekg(0); + } + + bool isOpen() const + { + return mFile.is_open(); + } + +private: + std::ifstream mFile; +}; + +} // namespace samplesCommon + +#endif // STREAM_READER_H diff --git a/src/Detector/tensorrt_yolo/common/timingCache.cpp b/src/Detector/tensorrt_yolo/common/timingCache.cpp new file mode 100644 index 00000000..18e85ba4 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/timingCache.cpp @@ -0,0 +1,157 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "timingCache.h" +#include "NvInfer.h" +#include "fileLock.h" +#include "sampleUtils.h" +#include +#include +#include +#include +#include +#include +using namespace nvinfer1; +namespace nvinfer1 +{ +namespace utils +{ +std::vector loadTimingCacheFile(ILogger& logger, std::string const& inFileName) +{ + try + { + std::unique_ptr fileLock{new FileLock(logger, inFileName)}; + std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); + if (!iFile) + { + std::stringstream ss; + ss << "Could not read timing cache from: " << inFileName + << ". 
A new timing cache will be generated and written."; + logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return std::vector(); + } + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + std::stringstream ss; + ss << "Loaded " << fsize << " bytes of timing cache from " << inFileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + return content; + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } + return {}; +} + +std::unique_ptr buildTimingCacheFromFile( + ILogger& logger, IBuilderConfig& config, std::string const& timingCacheFile, std::ostream& err) +{ + std::unique_ptr timingCache{}; + auto timingCacheContents = loadTimingCacheFile(logger, timingCacheFile); + timingCache.reset(config.createTimingCache(timingCacheContents.data(), timingCacheContents.size())); + SMP_RETVAL_IF_FALSE(timingCache != nullptr, "TimingCache creation failed", nullptr, err); + config.clearFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + SMP_RETVAL_IF_FALSE( + config.setTimingCache(*timingCache, true), "IBuilderConfig setTimingCache failed", nullptr, err); + return timingCache; +} + +void saveTimingCacheFile(ILogger& logger, std::string const& outFileName, IHostMemory const* blob) +{ + try + { + std::unique_ptr fileLock{new FileLock(logger, outFileName)}; + std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); + if (!oFile) + { + std::stringstream ss; + ss << "Could not write timing cache to: " << outFileName; + logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return; + } + oFile.write(reinterpret_cast(blob->data()), blob->size()); + oFile.close(); + std::stringstream ss; + ss << "Saved " << blob->size() << " bytes of timing cache to " << outFileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } +} + +void updateTimingCacheFile(nvinfer1::ILogger& logger, std::string const& fileName, + nvinfer1::ITimingCache const* timingCache, nvinfer1::IBuilder& builder) +{ + try + { + // Prepare empty timingCache in case that there is no existing file to read + std::unique_ptr config{builder.createBuilderConfig()}; + std::unique_ptr fileTimingCache{config->createTimingCache(static_cast(nullptr), 0)}; + + std::unique_ptr fileLock{new FileLock(logger, fileName)}; + std::ifstream iFile(fileName, std::ios::in | std::ios::binary); + if (iFile) + { + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + std::stringstream ss; + ss << "Loaded " << fsize << " bytes of timing cache from " << fileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + fileTimingCache.reset(config->createTimingCache(static_cast(content.data()), content.size())); + if (!fileTimingCache) + { + throw std::runtime_error("Failed to create timingCache from " + fileName + "!"); + } + } + fileTimingCache->combine(*timingCache, false); + std::unique_ptr blob{fileTimingCache->serialize()}; + if (!blob) + { + throw std::runtime_error("Failed to serialize ITimingCache!"); + } + std::ofstream oFile(fileName, std::ios::out | std::ios::binary); + if (!oFile) + { + std::stringstream ss; + ss << "Could not write timing cache to: " << fileName; + 
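+            // The freshly combined cache is dropped at this point; callers only
+            // ever see the warning below.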
logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return; + } + oFile.write(reinterpret_cast(blob->data()), blob->size()); + oFile.close(); + std::stringstream ss; + ss << "Saved " << blob->size() << " bytes of timing cache to " << fileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } +} +} // namespace utils +} // namespace nvinfer1 diff --git a/src/Detector/tensorrt_yolo/common/timingCache.h b/src/Detector/tensorrt_yolo/common/timingCache.h new file mode 100644 index 00000000..c4c76e37 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/timingCache.h @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_ +#define TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_ +#include "NvInfer.h" +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace utils +{ +std::vector loadTimingCacheFile(nvinfer1::ILogger& logger, std::string const& inFileName); +std::unique_ptr buildTimingCacheFromFile( + ILogger& logger, IBuilderConfig& config, std::string const& timingCacheFile, std::ostream& err); +void saveTimingCacheFile(nvinfer1::ILogger& logger, std::string const& outFileName, nvinfer1::IHostMemory const* blob); +void updateTimingCacheFile(nvinfer1::ILogger& logger, std::string const& fileName, + nvinfer1::ITimingCache const* timingCache, nvinfer1::IBuilder& builder); +} // namespace utils +} // namespace nvinfer1 + +#endif // TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_ diff --git a/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h b/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h new file mode 100644 index 00000000..9eaac768 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h @@ -0,0 +1,388 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef BATCH_STREAM_H +#define BATCH_STREAM_H + +#include "NvInfer.h" +#include "common.h" +#include +#include +#include + +class IBatchStream +{ +public: + virtual void reset(int firstBatch) = 0; + virtual bool next() = 0; + virtual void skip(int skipCount) = 0; + virtual float* getBatch() = 0; + virtual float* getLabels() = 0; + virtual int getBatchesRead() const = 0; + virtual int getBatchSize() const = 0; + virtual nvinfer1::Dims getDims() const = 0; +}; + +class MNISTBatchStream : public IBatchStream +{ +public: + MNISTBatchStream(int batchSize, int maxBatches, const std::string& dataFile, const std::string& labelsFile, + const std::vector& directories) + : mBatchSize{batchSize} + , mMaxBatches{maxBatches} + , mDims{3, {1, 28, 28}} //!< We already know the dimensions of MNIST images. + { + readDataFile(locateFile(dataFile, directories)); + readLabelsFile(locateFile(labelsFile, directories)); + } + + void reset(int firstBatch) override + { + mBatchCount = firstBatch; + } + + bool next() override + { + if (mBatchCount >= mMaxBatches) + { + return false; + } + ++mBatchCount; + return true; + } + + void skip(int skipCount) override + { + mBatchCount += skipCount; + } + + float* getBatch() override + { + return mData.data() + (mBatchCount * mBatchSize * samplesCommon::volume(mDims)); + } + + float* getLabels() override + { + return mLabels.data() + (mBatchCount * mBatchSize); + } + + int getBatchesRead() const override + { + return mBatchCount; + } + + int getBatchSize() const override + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const override + { + return nvinfer1::Dims{4, {mBatchSize, mDims.d[0], mDims.d[1], mDims.d[2]}}; + } + +private: + void readDataFile(const std::string& dataFilePath) + { + std::ifstream file{dataFilePath.c_str(), std::ios::binary}; + + int magicNumber, numImages, imageH, imageW; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. + magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2051 && "Magic Number does not match the expected value for an MNIST image set"); + + // Read number of images and dimensions + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + file.read(reinterpret_cast(&imageH), sizeof(imageH)); + file.read(reinterpret_cast(&imageW), sizeof(imageW)); + + numImages = samplesCommon::swapEndianness(numImages); + imageH = samplesCommon::swapEndianness(imageH); + imageW = samplesCommon::swapEndianness(imageW); + + // The MNIST data is made up of unsigned bytes, so we need to cast to float and normalize. + int numElements = numImages * imageH * imageW; + std::vector rawData(numElements); + file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); + mData.resize(numElements); + std::transform( + rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); + } + + void readLabelsFile(const std::string& labelsFilePath) + { + std::ifstream file{labelsFilePath.c_str(), std::ios::binary}; + int magicNumber, numImages; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. 
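+        // e.g. the label-file magic 2049 (0x00000801) is stored as bytes 00 00 08 01,
+        // so a little-endian read yields 0x01080000 until swapEndianness() reverses it.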
+ magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2049 && "Magic Number does not match the expected value for an MNIST labels file"); + + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + numImages = samplesCommon::swapEndianness(numImages); + + std::vector rawLabels(numImages); + file.read(reinterpret_cast(rawLabels.data()), numImages * sizeof(uint8_t)); + mLabels.resize(numImages); + std::transform( + rawLabels.begin(), rawLabels.end(), mLabels.begin(), [](uint8_t val) { return static_cast(val); }); + } + + int mBatchSize{0}; + int mBatchCount{0}; //!< The batch that will be read on the next invocation of next() + int mMaxBatches{0}; + nvinfer1::Dims mDims{}; + std::vector mData{}; + std::vector mLabels{}; +}; + +class BatchStream : public IBatchStream +{ +public: + BatchStream( + int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mPrefix(prefix) + , mSuffix(suffix) + , mDataDir(directories) + { + FILE* file = fopen(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), "rb"); + ASSERT(file != nullptr); + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + ASSERT(readSize == 4); + mDims.nbDims = 4; // The number of dimensions. + mDims.d[0] = d[0]; // Batch Size + mDims.d[1] = d[1]; // Channels + mDims.d[2] = d[2]; // Height + mDims.d[3] = d[3]; // Width + ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); + fclose(file); + + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + BatchStream(int batchSize, int maxBatches, std::string prefix, std::vector directories) + : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) + { + } + + BatchStream( + int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mDims(dims) + , mListFile(listFile) + , mDataDir(directories) + { + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + // Resets data members + void reset(int firstBatch) override + { + mBatchCount = 0; + mFileCount = 0; + mFileBatchPos = mDims.d[0]; + skip(firstBatch); + } + + // Advance to next batch and return true, or return false if there is no batch left. + bool next() override + { + if (mBatchCount == mMaxBatches) + { + return false; + } + + for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) + { + ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); + if (mFileBatchPos == mDims.d[0] && !update()) + { + return false; + } + + // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. 
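+            // e.g. with mBatchSize = 10, a file batch of mDims.d[0] = 4, batchPos = 8
+            // and mFileBatchPos = 0, csize = min(10 - 8, 4 - 0) = 2.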
+ csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + std::copy_n( + getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); + std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); + } + mBatchCount++; + return true; + } + + // Skips the batches + void skip(int skipCount) override + { + if (mBatchSize >= mDims.d[0] && mBatchSize % mDims.d[0] == 0 && mFileBatchPos == mDims.d[0]) + { + mFileCount += skipCount * mBatchSize / mDims.d[0]; + return; + } + + int x = mBatchCount; + for (int i = 0; i < skipCount; i++) + { + next(); + } + mBatchCount = x; + } + + float* getBatch() override + { + return mBatch.data(); + } + + float* getLabels() override + { + return mLabels.data(); + } + + int getBatchesRead() const override + { + return mBatchCount; + } + + int getBatchSize() const override + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const override + { + return mDims; + } + +private: + float* getFileBatch() + { + return mFileBatch.data(); + } + + float* getFileLabels() + { + return mFileLabels.data(); + } + + bool update() + { + if (mListFile.empty()) + { + std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); + FILE* file = fopen(inputFileName.c_str(), "rb"); + if (!file) + { + return false; + } + + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + ASSERT(readSize == 4); + ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); + size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); + ASSERT(readInputCount == size_t(mDims.d[0] * mImageSize)); + size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file); + ASSERT(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0])); + + fclose(file); + } + else + { + std::vector fNames; + std::ifstream file(locateFile(mListFile, mDataDir), std::ios::binary); + if (!file) + { + return false; + } + + sample::gLogInfo << "Batch #" << mFileCount << std::endl; + file.seekg(((mBatchCount * mBatchSize)) * 7); + + for (int i = 1; i <= mBatchSize; i++) + { + std::string sName; + std::getline(file, sName); + sName = sName + ".ppm"; + sample::gLogInfo << "Calibrating with file " << sName << std::endl; + fNames.emplace_back(sName); + } + + mFileCount++; + + const int imageC = 3; + const int imageH = 300; + const int imageW = 300; + std::vector> ppms(fNames.size()); + for (uint32_t i = 0; i < fNames.size(); ++i) + { + readPPMFile(locateFile(fNames[i], mDataDir), ppms[i]); + } + + std::vector data(samplesCommon::volume(mDims)); + const float scale = 2.0 / 255.0; + const float bias = 1.0; + long int volChl = mDims.d[2] * mDims.d[3]; + + // Normalize input data + for (int i = 0, volImg = mDims.d[1] * mDims.d[2] * mDims.d[3]; i < mBatchSize; ++i) + { + for (int c = 0; c < mDims.d[1]; ++c) + { + for (int j = 0; j < volChl; ++j) + { + data[i * volImg + c * volChl + j] = scale * float(ppms[i].buffer[j * mDims.d[1] + c]) - bias; + } + } + } + + std::copy_n(data.data(), mDims.d[0] * mImageSize, getFileBatch()); + } + + mFileBatchPos = 0; + return true; + } + + int mBatchSize{0}; + int mMaxBatches{0}; + int mBatchCount{0}; + int mFileCount{0}; + int mFileBatchPos{0}; + int mImageSize{0}; + std::vector mBatch; //!< Data for the batch + std::vector mLabels; //!< Labels for the batch + std::vector mFileBatch; //!< List of image files + std::vector mFileLabels; //!< List of label files + std::string mPrefix; //!< Batch file name 
+    int mBatchSize{0};
+    int mMaxBatches{0};
+    int mBatchCount{0};
+    int mFileCount{0};
+    int mFileBatchPos{0};
+    int mImageSize{0};
+    std::vector<float> mBatch;         //!< Data for the batch
+    std::vector<float> mLabels;        //!< Labels for the batch
+    std::vector<float> mFileBatch;     //!< List of image files
+    std::vector<float> mFileLabels;    //!< List of label files
+    std::string mPrefix;               //!< Batch file name prefix
+    std::string mSuffix;               //!< Batch file name suffix
+    nvinfer1::Dims mDims;              //!< Input dimensions
+    std::string mListFile;             //!< File name of the list of image names
+    std::vector<std::string> mDataDir; //!< Directories where the files can be found
+};
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h b/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h
new file mode 100644
index 00000000..f31789bf
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ENTROPY_CALIBRATOR_H
+#define ENTROPY_CALIBRATOR_H
+
+#include "BatchStream.h"
+#include "NvInfer.h"
+
+//! \class EntropyCalibratorImpl
+//!
+//! \brief Implements common functionality for Entropy calibrators.
+//!
+template <typename TBatchStream>
+class EntropyCalibratorImpl
+{
+public:
+    EntropyCalibratorImpl(
+        TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true)
+        : mStream{stream}
+        , mCalibrationTableName("CalibrationTable" + networkName)
+        , mInputBlobName(inputBlobName)
+        , mReadCache(readCache)
+    {
+        nvinfer1::Dims dims = mStream.getDims();
+        mInputCount = samplesCommon::volume(dims);
+        CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
+        mStream.reset(firstBatch);
+    }
+
+    virtual ~EntropyCalibratorImpl()
+    {
+        CHECK(cudaFree(mDeviceInput));
+    }
+
+    int getBatchSize() const noexcept
+    {
+        return mStream.getBatchSize();
+    }
+
+    bool getBatch(void* bindings[], const char* names[], int /*nbBindings*/) noexcept
+    {
+        if (!mStream.next())
+            return false;
+
+        CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
+        ASSERT(!strcmp(names[0], mInputBlobName));
+        bindings[0] = mDeviceInput;
+        return true;
+    }
+
+    const void* readCalibrationCache(size_t& length) noexcept
+    {
+        mCalibrationCache.clear();
+        std::ifstream input(mCalibrationTableName, std::ios::binary);
+        input >> std::noskipws;
+        if (mReadCache && input.good())
+        {
+            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
+                std::back_inserter(mCalibrationCache));
+        }
+        length = mCalibrationCache.size();
+        return length ? mCalibrationCache.data() : nullptr;
+    }
+
+    void writeCalibrationCache(const void* cache, size_t length) noexcept
+    {
+        std::ofstream output(mCalibrationTableName, std::ios::binary);
+        output.write(reinterpret_cast<const char*>(cache), length);
+    }
+
+private:
+    TBatchStream mStream;
+    size_t mInputCount;
+    std::string mCalibrationTableName;
+    const char* mInputBlobName;
+    bool mReadCache{true};
+    void* mDeviceInput{nullptr};
+    std::vector<char> mCalibrationCache;
+};
+
+//! \class Int8EntropyCalibrator2
+//!
+//! \brief Implements Entropy calibrator 2.
+//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
+//!
+template <typename TBatchStream>
+class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
+{
+public:
+    Int8EntropyCalibrator2(
+        TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true)
+        : mImpl(stream, firstBatch, networkName, inputBlobName, readCache)
+    {
+    }
+
+    int getBatchSize() const noexcept override
+    {
+        return mImpl.getBatchSize();
+    }
+
+    bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override
+    {
+        return mImpl.getBatch(bindings, names, nbBindings);
+    }
+
+    const void* readCalibrationCache(size_t& length) noexcept override
+    {
+        return mImpl.readCalibrationCache(length);
+    }
+
+    void writeCalibrationCache(const void* cache, size_t length) noexcept override
+    {
+        mImpl.writeCalibrationCache(cache, length);
+    }
+
+private:
+    EntropyCalibratorImpl<TBatchStream> mImpl;
+};
+
+#endif // ENTROPY_CALIBRATOR_H
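+
+// A minimal sketch of wiring this calibrator into an INT8 engine build. The BatchStream
+// constructor arguments below are assumptions that depend on the BatchStream variant
+// above; setInt8Calibrator() and BuilderFlag::kINT8 are standard TensorRT API. The
+// calibrator must outlive the build, since TensorRT calls back into getBatch() and the
+// cache hooks while building:
+//
+//     BatchStream stream(/*batchSize=*/8, /*maxBatches=*/100, "batch", ".batch", {"data/"}); // assumed ctor
+//     Int8EntropyCalibrator2<BatchStream> calibrator(stream, /*firstBatch=*/0, "yolo", "images");
+//     config->setFlag(nvinfer1::BuilderFlag::kINT8);
+//     config->setInt8Calibrator(&calibrator);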
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h b/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h
new file mode 100644
index 00000000..40b35fb5
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ERROR_RECORDER_H
+#define ERROR_RECORDER_H
+#include "NvInferRuntimeCommon.h"
+#include "logger.h"
+#include <atomic>
+#include <cstdint>
+#include <exception>
+#include <mutex>
+#include <vector>
+
+using nvinfer1::IErrorRecorder;
+using nvinfer1::ErrorCode;
+
+//!
+//! A simple implementation of the IErrorRecorder interface for
+//! use by samples. This interface also can be used as a reference
+//! implementation.
+//! The sample Error recorder is based on a vector that pairs the error
+//! code and the error string into a single element. It also uses
+//! standard mutexes and atomics in order to make sure that the code
+//! works in a multi-threaded environment.
+//!
+class SampleErrorRecorder : public IErrorRecorder
+{
+    using errorPair = std::pair<ErrorCode, std::string>;
+    using errorStack = std::vector<errorPair>;
+
+public:
+    SampleErrorRecorder() = default;
+
+    virtual ~SampleErrorRecorder() noexcept {}
+    int32_t getNbErrors() const noexcept final
+    {
+        return mErrorStack.size();
+    }
+    ErrorCode getErrorCode(int32_t errorIdx) const noexcept final
+    {
+        return invalidIndexCheck(errorIdx) ? ErrorCode::kINVALID_ARGUMENT : (*this)[errorIdx].first;
+    };
+    IErrorRecorder::ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept final
+    {
+        return invalidIndexCheck(errorIdx) ? "errorIdx out of range." : (*this)[errorIdx].second.c_str();
+    }
+    // This class can never overflow since we have dynamic resize via std::vector usage.
+    bool hasOverflowed() const noexcept final
+    {
+        return false;
+    }
+
+    // Empty the errorStack.
+    void clear() noexcept final
+    {
+        try
+        {
+            // grab a lock so that there is no addition while clearing.
+            std::lock_guard<std::mutex> guard(mStackLock);
+            mErrorStack.clear();
+        }
+        catch (const std::exception& e)
+        {
+            sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
+        }
+    };
+
+    //! Simple helper function that returns true if the error stack is empty.
+    bool empty() const noexcept
+    {
+        return mErrorStack.empty();
+    }
+
+    bool reportError(ErrorCode val, IErrorRecorder::ErrorDesc desc) noexcept final
+    {
+        try
+        {
+            std::lock_guard<std::mutex> guard(mStackLock);
+            sample::gLogError << "Error[" << static_cast<int32_t>(val) << "]: " << desc << std::endl;
+            mErrorStack.push_back(errorPair(val, desc));
+        }
+        catch (const std::exception& e)
+        {
+            sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
+        }
+        // All errors are considered fatal.
+        return true;
+    }
+
+    // Atomically increment or decrement the ref counter.
+    IErrorRecorder::RefCount incRefCount() noexcept final
+    {
+        return ++mRefCount;
+    }
+    IErrorRecorder::RefCount decRefCount() noexcept final
+    {
+        return --mRefCount;
+    }
+
+private:
+    // Simple helper functions.
+    const errorPair& operator[](size_t index) const noexcept
+    {
+        return mErrorStack[index];
+    }
+
+    bool invalidIndexCheck(int32_t index) const noexcept
+    {
+        // By converting signed to unsigned, we only need a single check since
+        // negative numbers turn into large positive greater than the size.
+        size_t sIndex = index;
+        return sIndex >= mErrorStack.size();
+    }
+    // Mutex to hold when locking mErrorStack.
+    std::mutex mStackLock;
+
+    // Reference count of the class. Destruction of the class when mRefCount
+    // is not zero causes undefined behavior.
+    std::atomic<int32_t> mRefCount{0};
+
+    // The error stack that holds the errors recorded by TensorRT.
+    errorStack mErrorStack;
+}; // class SampleErrorRecorder
+#endif // ERROR_RECORDER_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/buffers.h b/src/Detector/tensorrt_yolo/common_deprecated/buffers.h
new file mode 100644
index 00000000..ef673b2b
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/buffers.h
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef TENSORRT_BUFFERS_H
+#define TENSORRT_BUFFERS_H
+
+#include "NvInfer.h"
+#include "common.h"
+#include "half.h"
+#include <cassert>
+#include <cuda_runtime_api.h>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <string>
+#include <vector>
+
+namespace samplesCommon
+{
+
+//!
+//! \brief The GenericBuffer class is a templated class for buffers.
+//!
+//! \details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation,
+//! deallocation, querying of buffers on both the device and the host.
+//! It can handle data of arbitrary types because it stores byte buffers.
+//! The template parameters AllocFunc and FreeFunc are used for the
+//! allocation and deallocation of the buffer.
+//! AllocFunc must be a functor that takes in (void** ptr, size_t size)
+//! and returns bool. ptr is a pointer to where the allocated buffer address should be stored.
+//!
size is the amount of memory in bytes to allocate. +//! The boolean indicates whether or not the memory allocation was successful. +//! FreeFunc must be a functor that takes in (void* ptr) and returns void. +//! ptr is the allocated buffer address. It must work with nullptr input. +//! +template +class GenericBuffer +{ +public: + //! + //! \brief Construct an empty buffer. + //! + GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT) + : mSize(0) + , mCapacity(0) + , mType(type) + , mBuffer(nullptr) + { + } + + //! + //! \brief Construct a buffer with the specified allocation size in bytes. + //! + GenericBuffer(size_t size, nvinfer1::DataType type) + : mSize(size) + , mCapacity(size) + , mType(type) + { + if (!allocFn(&mBuffer, this->nbBytes())) + { + throw std::bad_alloc(); + } + } + + GenericBuffer(GenericBuffer&& buf) + : mSize(buf.mSize) + , mCapacity(buf.mCapacity) + , mType(buf.mType) + , mBuffer(buf.mBuffer) + { + buf.mSize = 0; + buf.mCapacity = 0; + buf.mType = nvinfer1::DataType::kFLOAT; + buf.mBuffer = nullptr; + } + + GenericBuffer& operator=(GenericBuffer&& buf) + { + if (this != &buf) + { + freeFn(mBuffer); + mSize = buf.mSize; + mCapacity = buf.mCapacity; + mType = buf.mType; + mBuffer = buf.mBuffer; + // Reset buf. + buf.mSize = 0; + buf.mCapacity = 0; + buf.mBuffer = nullptr; + } + return *this; + } + + //! + //! \brief Returns pointer to underlying array. + //! + void* data() + { + return mBuffer; + } + + //! + //! \brief Returns pointer to underlying array. + //! + const void* data() const + { + return mBuffer; + } + + //! + //! \brief Returns the size (in number of elements) of the buffer. + //! + size_t size() const + { + return mSize; + } + + //! + //! \brief Returns the size (in bytes) of the buffer. + //! + size_t nbBytes() const + { + return this->size() * samplesCommon::getElementSize(mType); + } + + //! + //! \brief Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity. + //! + void resize(size_t newSize) + { + mSize = newSize; + if (mCapacity < newSize) + { + freeFn(mBuffer); + if (!allocFn(&mBuffer, this->nbBytes())) + { + throw std::bad_alloc{}; + } + mCapacity = newSize; + } + } + + //! + //! \brief Overload of resize that accepts Dims + //! + void resize(const nvinfer1::Dims& dims) + { + return this->resize(samplesCommon::volume(dims)); + } + + ~GenericBuffer() + { + freeFn(mBuffer); + } + +private: + size_t mSize{0}, mCapacity{0}; + nvinfer1::DataType mType; + void* mBuffer; + AllocFunc allocFn; + FreeFunc freeFn; +}; + +class DeviceAllocator +{ +public: + bool operator()(void** ptr, size_t size) const + { + return cudaMalloc(ptr, size) == cudaSuccess; + } +}; + +class DeviceFree +{ +public: + void operator()(void* ptr) const + { + cudaFree(ptr); + } +}; + +class HostAllocator +{ +public: + bool operator()(void** ptr, size_t size) const + { + *ptr = malloc(size); + return *ptr != nullptr; + } +}; + +class HostFree +{ +public: + void operator()(void* ptr) const + { + free(ptr); + } +}; + +using DeviceBuffer = GenericBuffer; +using HostBuffer = GenericBuffer; + +//! +//! \brief The ManagedBuffer class groups together a pair of corresponding device and host buffers. +//! +class ManagedBuffer +{ +public: + DeviceBuffer deviceBuffer; + HostBuffer hostBuffer; +}; + +//! +//! \brief The BufferManager class handles host and device buffer allocation and deallocation. +//! +//! \details This RAII class handles host and device buffer allocation and deallocation, +//! 
memcpy between host and device buffers to aid with inference, +//! and debugging dumps to validate inference. The BufferManager class is meant to be +//! used to simplify buffer management and any interactions between buffers and the engine. +//! +class BufferManager +{ +public: + static const size_t kINVALID_SIZE_VALUE = ~size_t(0); + + //! + //! \brief Create a BufferManager for handling buffer interactions with engine. + //! + BufferManager(std::shared_ptr engine, const int batchSize, + const nvinfer1::IExecutionContext* context = nullptr) + : mEngine(engine) + , mBatchSize(batchSize) + { + // Full Dims implies no batch size. + auto impbs = engine->hasImplicitBatchDimension(); + std::cout << "hasImplicitBatchDimension: " << impbs << ", mBatchSize = " << mBatchSize << std::endl; + assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); + // Create host and device buffers + for (int i = 0; i < mEngine->getNbBindings(); i++) + { + auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); + size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); + nvinfer1::DataType type = mEngine->getBindingDataType(i); + int vecDim = mEngine->getBindingVectorizedDim(i); + if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector + { + int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); + vol *= scalarsPerVec; + } + vol *= samplesCommon::volume(dims); + std::unique_ptr manBuf{new ManagedBuffer()}; + manBuf->deviceBuffer = DeviceBuffer(vol, type); + manBuf->hostBuffer = HostBuffer(vol, type); + mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); + mManagedBuffers.emplace_back(std::move(manBuf)); + } + } + + //! + //! \brief Returns a vector of device buffers that you can use directly as + //! bindings for the execute and enqueue methods of IExecutionContext. + //! + std::vector& getDeviceBindings() + { + return mDeviceBindings; + } + + //! + //! \brief Returns a vector of device buffers. + //! + const std::vector& getDeviceBindings() const + { + return mDeviceBindings; + } + + //! + //! \brief Returns the device buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getDeviceBuffer(const std::string& tensorName) const + { + return getBuffer(false, tensorName); + } + + //! + //! \brief Returns the host buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getHostBuffer(const std::string& tensorName) const + { + return getBuffer(true, tensorName); + } + + //! + //! \brief Returns the host buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getHostBuffer(int bindingIndex) const + { + return getBuffer(true, bindingIndex); + } + + //! + //! \brief Returns the size of the host and device buffers that correspond to tensorName. + //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. + //! + size_t size(const std::string& tensorName) const + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return kINVALID_SIZE_VALUE; + return mManagedBuffers[index]->hostBuffer.nbBytes(); + } + + //! + //! \brief Dump host buffer with specified tensorName to ostream. + //! Prints error message to std::ostream if no such tensor can be found. + //! 
+ void dumpBuffer(std::ostream& os, const std::string& tensorName) + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + { + os << "Invalid tensor name" << std::endl; + return; + } + void* buf = mManagedBuffers[index]->hostBuffer.data(); + size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); + nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); + size_t rowCount = static_cast(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); + int leadDim = mBatchSize; + int* trailDims = bufDims.d; + int nbDims = bufDims.nbDims; + + // Fix explicit Dimension networks + if (!leadDim && nbDims > 0) + { + leadDim = bufDims.d[0]; + ++trailDims; + --nbDims; + } + + os << "[" << leadDim; + for (int i = 0; i < nbDims; i++) + os << ", " << trailDims[i]; + os << "]" << std::endl; + switch (mEngine->getBindingDataType(index)) + { + case nvinfer1::DataType::kINT32: print(os, buf, bufSize, rowCount); break; + case nvinfer1::DataType::kFLOAT: print(os, buf, bufSize, rowCount); break; + case nvinfer1::DataType::kHALF: print(os, buf, bufSize, rowCount); break; + case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break; + case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break; + } + } + + //! + //! \brief Templated print function that dumps buffers of arbitrary type to std::ostream. + //! rowCount parameter controls how many elements are on each line. + //! A rowCount of 1 means that there is only 1 element on each line. + //! + template + void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount) + { + assert(rowCount != 0); + assert(bufSize % sizeof(T) == 0); + T* typedBuf = static_cast(buf); + size_t numItems = bufSize / sizeof(T); + for (int i = 0; i < static_cast(numItems); i++) + { + // Handle rowCount == 1 case + if (rowCount == 1 && i != static_cast(numItems) - 1) + os << typedBuf[i] << std::endl; + else if (rowCount == 1) + os << typedBuf[i]; + // Handle rowCount > 1 case + else if (i % rowCount == 0) + os << typedBuf[i]; + else if (i % rowCount == rowCount - 1) + os << " " << typedBuf[i] << std::endl; + else + os << " " << typedBuf[i]; + } + } + + //! + //! \brief Copy the contents of input host buffers to input device buffers synchronously. + //! + void copyInputToDevice() + { + memcpyBuffers(true, false, false, 0); + } + + //! + //! \brief Copy the contents of output device buffers to output host buffers synchronously. + //! + void copyOutputToHost() + { + memcpyBuffers(false, true, false, 0); + } + + //! + //! \brief Copy the contents of input host buffers to input device buffers asynchronously. + //! + void copyInputToDeviceAsync(const cudaStream_t& stream) + { + memcpyBuffers(true, false, true, stream); + } + + //! + //! \brief Copy the contents of output device buffers to output host buffers asynchronously. + //! + void copyOutputToHostAsync(const cudaStream_t& stream) + { + memcpyBuffers(false, true, true, stream); + } + + ~BufferManager() = default; + +private: + void* getBuffer(const bool isHost, const std::string& tensorName) const + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return nullptr; + return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data()); + } + + void* getBuffer(const bool isHost, int bindingIndex) const + { + if (bindingIndex == -1) + return nullptr; + return (isHost ? 
            mManagedBuffers[bindingIndex]->hostBuffer.data() : mManagedBuffers[bindingIndex]->deviceBuffer.data());
+    }
+
+    void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream)
+    {
+        for (int i = 0; i < mEngine->getNbBindings(); i++)
+        {
+            void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data();
+            const void* srcPtr = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data();
+            const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes();
+            const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;
+            if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i)))
+            {
+                if (async)
+                    CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream));
+                else
+                    CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType));
+            }
+        }
+    }
+
+    std::shared_ptr<nvinfer1::ICudaEngine> mEngine;              //!< The pointer to the engine
+    int mBatchSize = 0;                                          //!< The batch size for legacy networks, 0 otherwise.
+    std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers; //!< The vector of pointers to managed buffers
+    std::vector<void*> mDeviceBindings;                          //!< The vector of device buffers needed for engine execution
+};
+
+} // namespace samplesCommon
+
+#endif // TENSORRT_BUFFERS_H
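+
+// A minimal usage sketch for BufferManager with the bindings-based enqueueV2() path this
+// deprecated header targets; "engine", "context", "stream" and the tensor names
+// "images"/"output" are assumed to exist in the caller:
+//
+//     samplesCommon::BufferManager buffers(engine, /*batchSize=*/0, context.get());
+//     std::memcpy(buffers.getHostBuffer("images"), input, inputBytes);
+//     buffers.copyInputToDeviceAsync(stream);
+//     context->enqueueV2(buffers.getDeviceBindings().data(), stream, nullptr);
+//     buffers.copyOutputToHostAsync(stream);
+//     CHECK(cudaStreamSynchronize(stream));
+//     const float* out = static_cast<const float*>(buffers.getHostBuffer("output"));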
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/common.h b/src/Detector/tensorrt_yolo/common_deprecated/common.h
new file mode 100644
index 00000000..2270a2cd
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/common.h
@@ -0,0 +1,963 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_COMMON_H
+#define TENSORRT_COMMON_H
+
+// For loadLibrary
+#ifdef _MSC_VER
+// Needed so that the max/min definitions in windows.h do not conflict with std::max/min.
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#undef NOMINMAX
+#else
+#include <dlfcn.h>
+#endif
+
+#include "NvInfer.h"
+#include "NvInferPlugin.h"
+#include "logger.h"
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <cuda_runtime_api.h>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <ratio>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "safeCommon.h"
+
+#ifdef _MSC_VER
+#define FN_NAME __FUNCTION__
+#else
+#define FN_NAME __func__
+#endif
+
+#if defined(__aarch64__) || defined(__QNX__)
+#define ENABLE_DLA_API 1
+#endif
+
+#define CHECK_RETURN_W_MSG(status, val, errMsg)                                                                    \
+    do                                                                                                             \
+    {                                                                                                              \
+        if (!(status))                                                                                             \
+        {                                                                                                          \
+            sample::gLogError << errMsg << " Error in " << __FILE__ << ", function " << FN_NAME << "(), line " << __LINE__ \
+                              << std::endl;                                                                        \
+            return val;                                                                                            \
+        }                                                                                                          \
+    } while (0)
+
+#undef ASSERT
+#define ASSERT(condition)                                                          \
+    do                                                                             \
+    {                                                                              \
+        if (!(condition))                                                          \
+        {                                                                          \
+            sample::gLogError << "Assertion failure: " << #condition << std::endl; \
+            abort();                                                               \
+        }                                                                          \
+    } while (0)
+
+
+#define CHECK_RETURN(status, val) CHECK_RETURN_W_MSG(status, val, "")
+
+#define OBJ_GUARD(A) std::unique_ptr<A, void (*)(A* t)>
+
+template <typename T, typename T_>
+OBJ_GUARD(T)
+makeObjGuard(T_* t)
+{
+    CHECK(!(std::is_base_of<T, T_>::value || std::is_same<T, T_>::value));
+    auto deleter = [](T* t) { t->destroy(); };
+    return std::unique_ptr<T, void (*)(T*)>{static_cast<T*>(t), deleter};
+}
+
+constexpr long double operator"" _GiB(long double val)
+{
+    return val * (1 << 30);
+}
+constexpr long double operator"" _MiB(long double val)
+{
+    return val * (1 << 20);
+}
+constexpr long double operator"" _KiB(long double val)
+{
+    return val * (1 << 10);
+}
+
+// This is necessary if we want to be able to write 1_GiB instead of 1.0_GiB.
+// Since the return type is signed, -1_GiB will work as expected.
+constexpr long long int operator"" _GiB(unsigned long long val)
+{
+    return val * (1 << 30);
+}
+constexpr long long int operator"" _MiB(unsigned long long val)
+{
+    return val * (1 << 20);
+}
+constexpr long long int operator"" _KiB(unsigned long long val)
+{
+    return val * (1 << 10);
+}
+
+struct SimpleProfiler : public nvinfer1::IProfiler
+{
+    struct Record
+    {
+        float time{0};
+        int count{0};
+    };
+
+    virtual void reportLayerTime(const char* layerName, float ms) noexcept
+    {
+        mProfile[layerName].count++;
+        mProfile[layerName].time += ms;
+        if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end())
+        {
+            mLayerNames.push_back(layerName);
+        }
+    }
+
+    SimpleProfiler(const char* name, const std::vector<SimpleProfiler>& srcProfilers = std::vector<SimpleProfiler>())
+        : mName(name)
+    {
+        for (const auto& srcProfiler : srcProfilers)
+        {
+            for (const auto& rec : srcProfiler.mProfile)
+            {
+                auto it = mProfile.find(rec.first);
+                if (it == mProfile.end())
+                {
+                    mProfile.insert(rec);
+                }
+                else
+                {
+                    it->second.time += rec.second.time;
+                    it->second.count += rec.second.count;
+                }
+            }
+        }
+    }
+
+    friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value)
+    {
+        out << "========== " << value.mName << " profile ==========" << std::endl;
+        float totalTime = 0;
+        std::string layerNameStr = "TensorRT layer name";
+        int maxLayerNameLength = std::max(static_cast<int>(layerNameStr.size()), 70);
+        for (const auto& elem : value.mProfile)
+        {
+            totalTime += elem.second.time;
+            maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
+        }
+
+        auto old_settings = out.flags();
+        auto old_precision = out.precision();
+        // Output header
+        {
+            out << std::setw(maxLayerNameLength) << layerNameStr << " ";
+            out << std::setw(12) << "Runtime, "
+                << "%"
+                << " ";
out << std::setw(12) << "Invocations" + << " "; + out << std::setw(12) << "Runtime, ms" << std::endl; + } + for (size_t i = 0; i < value.mLayerNames.size(); i++) + { + const std::string layerName = value.mLayerNames[i]; + auto elem = value.mProfile.at(layerName); + out << std::setw(maxLayerNameLength) << layerName << " "; + out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << "%" + << " "; + out << std::setw(12) << elem.count << " "; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << std::endl; + } + out.flags(old_settings); + out.precision(old_precision); + out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl; + + return out; + } + +private: + std::string mName; + std::vector mLayerNames; + std::map mProfile; +}; + +//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. +//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. +inline std::string locateFile( + const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) +{ + const int MAX_DEPTH{10}; + bool found{false}; + std::string filepath; + + for (auto& dir : directories) + { + if (!dir.empty() && dir.back() != '/') + { +#ifdef _MSC_VER + filepath = dir + "\\" + filepathSuffix; +#else + filepath = dir + "/" + filepathSuffix; +#endif + } + else + { + filepath = dir + filepathSuffix; + } + + for (int i = 0; i < MAX_DEPTH && !found; i++) + { + const std::ifstream checkFile(filepath); + found = checkFile.is_open(); + if (found) + { + break; + } + + filepath = "../" + filepath; // Try again in parent dir + } + + if (found) + { + break; + } + + filepath.clear(); + } + + // Could not find the file + if (filepath.empty()) + { + const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), + [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); + std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; + + if (reportError) + { + std::cout << "&&&& FAILED" << std::endl; + exit(EXIT_FAILURE); + } + } + + return filepath; +} + +inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW) +{ + std::ifstream infile(fileName, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open."); + std::string magic, h, w, max; + infile >> magic >> h >> w >> max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(buffer), inH * inW); +} + +namespace samplesCommon +{ + +// Swaps endianness of an integral type. 
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+inline T swapEndianness(const T& value)
+{
+    uint8_t bytes[sizeof(T)];
+    for (int i = 0; i < static_cast<int>(sizeof(T)); ++i)
+    {
+        bytes[sizeof(T) - 1 - i] = *(reinterpret_cast<const uint8_t*>(&value) + i);
+    }
+    return *reinterpret_cast<T*>(bytes);
+}
+
+class HostMemory
+{
+public:
+    HostMemory() = delete;
+    virtual void* data() const noexcept
+    {
+        return mData;
+    }
+    virtual std::size_t size() const noexcept
+    {
+        return mSize;
+    }
+    virtual nvinfer1::DataType type() const noexcept
+    {
+        return mType;
+    }
+    virtual ~HostMemory() {}
+
+protected:
+    HostMemory(std::size_t size, nvinfer1::DataType type)
+        : mData{nullptr}
+        , mSize(size)
+        , mType(type)
+    {
+    }
+    void* mData;
+    std::size_t mSize;
+    nvinfer1::DataType mType;
+};
+
+template <typename ElemType, nvinfer1::DataType dataType>
+class TypedHostMemory : public HostMemory
+{
+public:
+    explicit TypedHostMemory(std::size_t size)
+        : HostMemory(size, dataType)
+    {
+        mData = new ElemType[size];
+    };
+    ~TypedHostMemory() noexcept
+    {
+        delete[](ElemType*) mData;
+    }
+    ElemType* raw() noexcept
+    {
+        return static_cast<ElemType*>(data());
+    }
+};
+
+using FloatMemory = TypedHostMemory<float, nvinfer1::DataType::kFLOAT>;
+using HalfMemory = TypedHostMemory<uint16_t, nvinfer1::DataType::kHALF>;
+using ByteMemory = TypedHostMemory<uint8_t, nvinfer1::DataType::kINT8>;
+
+inline void* safeCudaMalloc(size_t memSize)
+{
+    void* deviceMem;
+    CHECK(cudaMalloc(&deviceMem, memSize));
+    if (deviceMem == nullptr)
+    {
+        std::cerr << "Out of memory" << std::endl;
+        exit(1);
+    }
+    return deviceMem;
+}
+
+inline bool isDebug()
+{
+    return (std::getenv("TENSORRT_DEBUG") ? true : false);
+}
+
+struct InferDeleter
+{
+    template <typename T>
+    void operator()(T* obj) const
+    {
+#if (NV_TENSORRT_MAJOR < 8)
+        obj->destroy();
+#else
+        delete obj;
+#endif
+    }
+};
+
+template <typename T>
+using SampleUniquePtr = std::unique_ptr<T, InferDeleter>;
+
+static auto StreamDeleter = [](cudaStream_t* pStream)
+    {
+        if (pStream)
+        {
+            cudaStreamDestroy(*pStream);
+            delete pStream;
+        }
+    };
+
+inline std::unique_ptr<cudaStream_t, decltype(StreamDeleter)> makeCudaStream()
+{
+    std::unique_ptr<cudaStream_t, decltype(StreamDeleter)> pStream(new cudaStream_t, StreamDeleter);
+    if (cudaStreamCreateWithFlags(pStream.get(), cudaStreamNonBlocking) != cudaSuccess)
+    {
+        pStream.reset(nullptr);
+    }
+
+    return pStream;
+}
+
+//! Return vector of indices that puts magnitudes of sequence in descending order.
+template <class Iter>
+std::vector<size_t> argMagnitudeSort(Iter begin, Iter end)
+{
+    std::vector<size_t> indices(end - begin);
+    std::iota(indices.begin(), indices.end(), 0);
+    std::sort(indices.begin(), indices.end(), [&begin](size_t i, size_t j) { return std::abs(begin[j]) < std::abs(begin[i]); });
+    return indices;
+}
+
+inline bool readReferenceFile(const std::string& fileName, std::vector<std::string>& refVector)
+{
+    std::ifstream infile(fileName);
+    if (!infile.is_open())
+    {
+        std::cout << "ERROR: readReferenceFile: Attempting to read from a file that is not open." << std::endl;
+        return false;
+    }
+    std::string line;
+    while (std::getline(infile, line))
+    {
+        if (line.empty())
+            continue;
+        refVector.push_back(line);
+    }
+    infile.close();
+    return true;
+}
+
+template <typename T>
+std::vector<std::string> classify(
+    const std::vector<std::string>& refVector, const std::vector<T>& output, const size_t topK)
+{
+    const auto inds = samplesCommon::argMagnitudeSort(output.cbegin(), output.cend());
+    std::vector<std::string> result;
+    result.reserve(topK);
+    for (size_t k = 0; k < topK; ++k)
+    {
+        result.push_back(refVector[inds[k]]);
+    }
+    return result;
+}
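+
+// A short sketch of how readReferenceFile() and classify() combine for top-K labelling;
+// "labels.txt" and the probability vector are assumed inputs:
+//
+//     std::vector<std::string> labels;
+//     readReferenceFile("labels.txt", labels);
+//     std::vector<float> prob(numClasses);                  // filled from the network output
+//     auto top5 = samplesCommon::classify(labels, prob, 5); // argMagnitudeSort() under the hood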
+
+// Returns indices of highest K magnitudes in v.
+template <typename T>
+std::vector<size_t> topKMagnitudes(const std::vector<T>& v, const size_t k)
+{
+    std::vector<size_t> indices = samplesCommon::argMagnitudeSort(v.cbegin(), v.cend());
+    indices.resize(k);
+    return indices;
+}
+
+template <typename T>
+bool readASCIIFile(const std::string& fileName, const size_t size, std::vector<T>& out)
+{
+    std::ifstream infile(fileName);
+    if (!infile.is_open())
+    {
+        std::cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << std::endl;
+        return false;
+    }
+    out.clear();
+    out.reserve(size);
+    out.assign(std::istream_iterator<T>(infile), std::istream_iterator<T>());
+    infile.close();
+    return true;
+}
+
+template <typename T>
+bool writeASCIIFile(const std::string& fileName, const std::vector<T>& in)
+{
+    std::ofstream outfile(fileName);
+    if (!outfile.is_open())
+    {
+        std::cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << std::endl;
+        return false;
+    }
+    for (auto fn : in)
+    {
+        outfile << fn << "\n";
+    }
+    outfile.close();
+    return true;
+}
+
+inline void print_version()
+{
+    std::cout << "  TensorRT version: " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH
+              << "." << NV_TENSORRT_BUILD << std::endl;
+}
+
+inline std::string getFileType(const std::string& filepath)
+{
+    return filepath.substr(filepath.find_last_of(".") + 1);
+}
+
+inline std::string toLower(const std::string& inp)
+{
+    std::string out = inp;
+    std::transform(out.begin(), out.end(), out.begin(), ::tolower);
+    return out;
+}
+
+inline float getMaxValue(const float* buffer, int64_t size)
+{
+    assert(buffer != nullptr);
+    assert(size > 0);
+    return *std::max_element(buffer, buffer + size);
+}
+
+// Ensures that every tensor used by a network has a dynamic range set.
+//
+// All tensors in a network must have a dynamic range specified if a calibrator is not used.
+// This function is just a utility to globally fill in missing scales and zero-points for the entire network.
+//
+// If a tensor does not have a dynamic range set, it is assigned inRange or outRange as follows:
+//
+// * If the tensor is the input to a layer or output of a pooling node, its dynamic range is derived from inRange.
+// * Otherwise its dynamic range is derived from outRange.
+//
+// The default parameter values are intended to demonstrate, for final layers in the network,
+// cases where dynamic ranges are asymmetric.
+//
+// The default parameter values are chosen arbitrarily. Range values should be chosen such that
+// we avoid underflow or overflow. Also, range values should be non-zero to avoid a uniform zero scale tensor.
+inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0f, float outRange = 4.0f)
+{
+    // Ensure that all layer inputs have a scale.
+    for (int i = 0; i < network->getNbLayers(); i++)
+    {
+        auto layer = network->getLayer(i);
+        for (int j = 0; j < layer->getNbInputs(); j++)
+        {
+            nvinfer1::ITensor* input{layer->getInput(j)};
+            // Optional inputs are nullptr here and are from RNN layers.
+            if (input != nullptr && !input->dynamicRangeIsSet())
+            {
+                ASSERT(input->setDynamicRange(-inRange, inRange));
+            }
+        }
+    }
+
+    // Ensure that all layer outputs have a scale.
+    // Tensors that are also inputs to layers are ignored here
+    // since the previous loop nest assigned scales to them.
+    for (int i = 0; i < network->getNbLayers(); i++)
+    {
+        auto layer = network->getLayer(i);
+        for (int j = 0; j < layer->getNbOutputs(); j++)
+        {
+            nvinfer1::ITensor* output{layer->getOutput(j)};
+            // Optional outputs are nullptr here and are from RNN layers.
+            if (output != nullptr && !output->dynamicRangeIsSet())
+            {
+                // Pooling must have the same input and output scales.
+                if (layer->getType() == nvinfer1::LayerType::kPOOLING)
+                {
+                    ASSERT(output->setDynamicRange(-inRange, inRange));
+                }
+                else
+                {
+                    ASSERT(output->setDynamicRange(-outRange, outRange));
+                }
+            }
+        }
+    }
+}
+
+inline void setDummyInt8DynamicRanges(const nvinfer1::IBuilderConfig* c, nvinfer1::INetworkDefinition* n)
+{
+    // Set dummy per-tensor dynamic range if Int8 mode is requested.
+    if (c->getFlag(nvinfer1::BuilderFlag::kINT8))
+    {
+        sample::gLogWarning
+            << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy is not guaranteed."
+            << std::endl;
+        setAllDynamicRanges(n);
+    }
+}
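+
+// A minimal sketch of the calibrator-free INT8 path these two helpers enable; "network"
+// and "config" are assumed to exist, and 2.0f/4.0f are the arbitrary defaults from
+// setAllDynamicRanges(), not tuned values, so the resulting accuracy is only useful for
+// performance measurements (hence the warning logged above):
+//
+//     config->setFlag(nvinfer1::BuilderFlag::kINT8);
+//     setAllDynamicRanges(network, 2.0f, 4.0f); // every tensor needs a range without a calibrator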
+
+inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true)
+{
+    if (useDLACore >= 0)
+    {
+        if (builder->getNbDLACores() == 0)
+        {
+            std::cerr << "Trying to use DLA core " << useDLACore << " on a platform that doesn't have any DLA cores"
+                      << std::endl;
+            assert("Error: use DLA core on a platform that doesn't have any DLA cores" && false);
+        }
+        if (allowGPUFallback)
+        {
+            config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
+        }
+        if (!config->getFlag(nvinfer1::BuilderFlag::kINT8))
+        {
+            // User has not requested INT8 Mode.
+            // By default run in FP16 mode. FP32 mode is not permitted.
+            config->setFlag(nvinfer1::BuilderFlag::kFP16);
+        }
+        config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
+        config->setDLACore(useDLACore);
+    }
+}
+
+inline int32_t parseDLA(int32_t argc, char** argv)
+{
+    for (int32_t i = 1; i < argc; i++)
+    {
+        if (strncmp(argv[i], "--useDLACore=", 13) == 0)
+        {
+            return std::stoi(argv[i] + 13);
+        }
+    }
+    return -1;
+}
+
+inline uint32_t getElementSize(nvinfer1::DataType t) noexcept
+{
+    switch (t)
+    {
+    case nvinfer1::DataType::kINT32: return 4;
+    case nvinfer1::DataType::kFLOAT: return 4;
+    case nvinfer1::DataType::kHALF: return 2;
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kINT8: return 1;
+    }
+    return 0;
+}
+
+inline int64_t volume(const nvinfer1::Dims& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
+}
+
+template <int C, int H, int W>
+struct PPM
+{
+    std::string magic, fileName;
+    int h, w, max;
+    uint8_t buffer[C * H * W];
+};
+
+// New vPPM(variable sized PPM) class with variable dimensions.
+struct vPPM +{ + std::string magic, fileName; + int h, w, max; + std::vector buffer; +}; + +struct BBox +{ + float x1, y1, x2, y2; +}; + +template +void readPPMFile(const std::string& filename, samplesCommon::PPM& ppm) +{ + ppm.fileName = filename; + std::ifstream infile(filename, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open."); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void readPPMFile(const std::string& filename, vPPM& ppm, std::vector& input_dir) +{ + ppm.fileName = filename; + std::ifstream infile(locateFile(filename, input_dir), std::ifstream::binary); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + + for (int i = 0; i < ppm.w * ppm.h * 3; ++i) + { + ppm.buffer.push_back(0); + } + + infile.read(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +template +void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const BBox& bbox) +{ + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); + const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); + const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); + const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1); + + for (int x = x1; x <= x2; ++x) + { + // bbox top border + ppm.buffer[(y1 * ppm.w + x) * 3] = 255; + ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(y2 * ppm.w + x) * 3] = 255; + ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = y1; y <= y2; ++y) + { + // bbox left border + ppm.buffer[(y * ppm.w + x1) * 3] = 255; + ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + x2) * 3] = 255; + ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0; + } + + outfile.write(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vector& dets) +{ + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + + for (auto bbox : dets) + { + for (int x = int(bbox.x1); x < int(bbox.x2); ++x) + { + // bbox top border + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = int(bbox.y1); y < int(bbox.y2); ++y) + { + // bbox left border + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 
1] = 0; + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 2] = 0; + } + } + + outfile.write(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +class TimerBase +{ +public: + virtual void start() {} + virtual void stop() {} + float microseconds() const noexcept + { + return mMs * 1000.f; + } + float milliseconds() const noexcept + { + return mMs; + } + float seconds() const noexcept + { + return mMs / 1000.f; + } + void reset() noexcept + { + mMs = 0.f; + } + +protected: + float mMs{0.0f}; +}; + +class GpuTimer : public TimerBase +{ +public: + explicit GpuTimer(cudaStream_t stream) + : mStream(stream) + { + CHECK(cudaEventCreate(&mStart)); + CHECK(cudaEventCreate(&mStop)); + } + ~GpuTimer() + { + CHECK(cudaEventDestroy(mStart)); + CHECK(cudaEventDestroy(mStop)); + } + void start() + { + CHECK(cudaEventRecord(mStart, mStream)); + } + void stop() + { + CHECK(cudaEventRecord(mStop, mStream)); + float ms{0.0f}; + CHECK(cudaEventSynchronize(mStop)); + CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); + mMs += ms; + } + +private: + cudaEvent_t mStart, mStop; + cudaStream_t mStream; +}; // class GpuTimer + +template +class CpuTimer : public TimerBase +{ +public: + using clock_type = Clock; + + void start() + { + mStart = Clock::now(); + } + void stop() + { + mStop = Clock::now(); + mMs += std::chrono::duration{mStop - mStart}.count(); + } + +private: + std::chrono::time_point mStart, mStop; +}; // class CpuTimer + +using PreciseCpuTimer = CpuTimer; + +inline std::vector splitString(std::string str, char delimiter = ',') +{ + std::vector splitVect; + std::stringstream ss(str); + std::string substr; + + while (ss.good()) + { + getline(ss, substr, delimiter); + splitVect.emplace_back(std::move(substr)); + } + return splitVect; +} + +// Return m rounded up to nearest multiple of n +inline int roundUp(int m, int n) +{ + return ((m + n - 1) / n) * n; +} + +inline int getC(const nvinfer1::Dims& d) +{ + return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; +} + +inline int getH(const nvinfer1::Dims& d) +{ + return d.nbDims >= 2 ? d.d[d.nbDims - 2] : 1; +} + +inline int getW(const nvinfer1::Dims& d) +{ + return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; +} + +inline void loadLibrary(const std::string& path) +{ +#ifdef _MSC_VER + void* handle = LoadLibrary(path.c_str()); +#else + int32_t flags{RTLD_LAZY}; +#if ENABLE_ASAN + // https://github.com/google/sanitizers/issues/89 + // asan doesn't handle module unloading correctly and there are no plans on doing + // so. In order to get proper stack traces, don't delete the shared library on + // close so that asan can resolve the symbols correctly. 
+ flags |= RTLD_NODELETE; +#endif // ENABLE_ASAN + + void* handle = dlopen(path.c_str(), flags); +#endif + if (handle == nullptr) + { +#ifdef _MSC_VER + sample::gLogError << "Could not load plugin library: " << path << std::endl; +#else + sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; +#endif + } +} + +inline int32_t getSMVersion() +{ + int32_t deviceIndex = 0; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t major, minor; + CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); + CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); + + return ((major << 8) | minor); +} + +inline bool isSMSafe() +{ + const int32_t smVersion = getSMVersion(); + return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || + smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; +} + +inline bool isDataTypeSupported(nvinfer1::DataType dataType) +{ + auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); + if (!builder) + { + return false; + } + + if ((dataType == nvinfer1::DataType::kINT8 && !builder->platformHasFastInt8()) + || (dataType == nvinfer1::DataType::kHALF && !builder->platformHasFastFp16())) + { + return false; + } + + return true; +} + +} // namespace samplesCommon + +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) +{ + os << "("; + for (int i = 0; i < dims.nbDims; ++i) + { + os << (i ? ", " : "") << dims.d[i]; + } + return os << ")"; +} + +#endif // TENSORRT_COMMON_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/half.h b/src/Detector/tensorrt_yolo/common_deprecated/half.h new file mode 100644 index 00000000..0755c316 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/half.h @@ -0,0 +1,4302 @@ +// half - IEEE 754-based half-precision floating point library. +// +// Copyright (c) 2012-2017 Christian Rau +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Version 1.12.0 + +/// \file +/// Main header file for half precision functionality. + +#ifndef HALF_HALF_HPP +#define HALF_HALF_HPP + +/// Combined gcc version number. +#define HALF_GNUC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// check C++11 language features +#if defined(__clang__) // clang +#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +/*#elif defined(__INTEL_COMPILER) //Intel C++ + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) ???????? + #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) ???????? + #define HALF_ENABLE_CPP11_CONSTEXPR 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) ???????? + #define HALF_ENABLE_CPP11_NOEXCEPT 1 + #endif + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG) ???????? 
+ #define HALF_ENABLE_CPP11_LONG_LONG 1 + #endif*/ +#elif defined(__GNUC__) // gcc +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#endif +#elif defined(_MSC_VER) // Visual C++ +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#define HALF_POP_WARNINGS 1 +#pragma warning(push) +#pragma warning(disable : 4099 4127 4146) // struct vs class, constant in if, negative unsigned +#endif + +// check C++11 library features +#include +#if defined(_LIBCPP_VERSION) // libc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#elif defined(__GLIBCXX__) // libstdc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifdef __clang__ +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#else +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#endif +#elif defined(_CPPLIB_VER) // Dinkumware/Visual C++ +#if _CPPLIB_VER >= 520 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#if _CPPLIB_VER >= 610 +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#endif +#endif +#undef HALF_GNUC_VERSION + +// support constexpr +#if HALF_ENABLE_CPP11_CONSTEXPR +#define HALF_CONSTEXPR constexpr +#define HALF_CONSTEXPR_CONST constexpr +#else +#define HALF_CONSTEXPR +#define 
HALF_CONSTEXPR_CONST const +#endif + +// support noexcept +#if HALF_ENABLE_CPP11_NOEXCEPT +#define HALF_NOEXCEPT noexcept +#define HALF_NOTHROW noexcept +#else +#define HALF_NOEXCEPT +#define HALF_NOTHROW throw() +#endif + +#include +#include +#include +#include +#include +#include +#if HALF_ENABLE_CPP11_TYPE_TRAITS +#include +#endif +#if HALF_ENABLE_CPP11_CSTDINT +#include +#endif +#if HALF_ENABLE_CPP11_HASH +#include +#endif + +/// Default rounding mode. +/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as +/// well as for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including +/// half.hpp) to one of the standard rounding modes using their respective constants or the equivalent values of +/// `std::float_round_style`: +/// +/// `std::float_round_style` | value | rounding +/// ---------------------------------|-------|------------------------- +/// `std::round_indeterminate` | -1 | fastest (default) +/// `std::round_toward_zero` | 0 | toward zero +/// `std::round_to_nearest` | 1 | to nearest +/// `std::round_toward_infinity` | 2 | toward positive infinity +/// `std::round_toward_neg_infinity` | 3 | toward negative infinity +/// +/// By default this is set to `-1` (`std::round_indeterminate`), which uses truncation (round toward zero, but with +/// overflows set to infinity) and is the fastest rounding mode possible. It can even be set to +/// `std::numeric_limits::round_style` to synchronize the rounding mode with that of the underlying +/// single-precision implementation. +#ifndef HALF_ROUND_STYLE +#define HALF_ROUND_STYLE 1 // = std::round_to_nearest +#endif + +/// Tie-breaking behaviour for round to nearest. +/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value. By default this +/// is defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way +/// cases (and thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more +/// IEEE-conformant behaviour is needed. +#ifndef HALF_ROUND_TIES_TO_EVEN +#define HALF_ROUND_TIES_TO_EVEN 0 // ties away from zero +#endif + +/// Value signaling overflow. +/// In correspondence with `HUGE_VAL[F|L]` from `` this symbol expands to a positive value signaling the overflow +/// of an operation, in particular it just evaluates to positive infinity. +#define HUGE_VALH std::numeric_limits::infinity() + +/// Fast half-precision fma function. +/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate +/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all +/// arithmetic operations, this is in fact always the case. +#define FP_FAST_FMAH 1 + +#ifndef FP_ILOGB0 +#define FP_ILOGB0 INT_MIN +#endif +#ifndef FP_ILOGBNAN +#define FP_ILOGBNAN INT_MAX +#endif +#ifndef FP_SUBNORMAL +#define FP_SUBNORMAL 0 +#endif +#ifndef FP_ZERO +#define FP_ZERO 1 +#endif +#ifndef FP_NAN +#define FP_NAN 2 +#endif +#ifndef FP_INFINITE +#define FP_INFINITE 3 +#endif +#ifndef FP_NORMAL +#define FP_NORMAL 4 +#endif + +/// Main namespace for half precision functionality. +/// This namespace contains all the functionality provided by the library. +namespace half_float +{ +class half; + +#if HALF_ENABLE_CPP11_USER_LITERALS +/// Library-defined half-precision literals. 
+
+/// Main namespace for half precision functionality.
+/// This namespace contains all the functionality provided by the library.
+namespace half_float
+{
+class half;
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+/// Library-defined half-precision literals.
+/// Import this namespace to enable half-precision floating point literals:
+/// ~~~~{.cpp}
+/// using namespace half_float::literal;
+/// half_float::half f = 4.2_h;
+/// ~~~~
+namespace literal
+{
+half operator"" _h(long double);
+}
+#endif
+
+/// \internal
+/// \brief Implementation details.
+namespace detail
+{
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+/// Conditional type.
+template <bool B, typename T, typename F>
+struct conditional : std::conditional<B, T, F>
+{
+};
+
+/// Helper for tag dispatching.
+template <bool B>
+struct bool_type : std::integral_constant<bool, B>
+{
+};
+using std::false_type;
+using std::true_type;
+
+/// Type traits for floating point types.
+template <typename T>
+struct is_float : std::is_floating_point<T>
+{
+};
+#else
+/// Conditional type.
+template <bool, typename T, typename>
+struct conditional
+{
+    typedef T type;
+};
+template <typename T, typename F>
+struct conditional<false, T, F>
+{
+    typedef F type;
+};
+
+/// Helper for tag dispatching.
+template <bool>
+struct bool_type
+{
+};
+typedef bool_type<true> true_type;
+typedef bool_type<false> false_type;
+
+/// Type traits for floating point types.
+template <typename> struct is_float : false_type {};
+template <typename T> struct is_float<const T> : is_float<T> {};
+template <typename T> struct is_float<volatile T> : is_float<T> {};
+template <typename T> struct is_float<const volatile T> : is_float<T> {};
+template <> struct is_float<float> : true_type {};
+template <> struct is_float<double> : true_type {};
+template <> struct is_float<long double> : true_type {};
+#endif
+
+/// Type traits for floating point bits.
+template <typename T>
+struct bits
+{
+    typedef unsigned char type;
+};
+template <typename T> struct bits<const T> : bits<T> {};
+template <typename T> struct bits<volatile T> : bits<T> {};
+template <typename T> struct bits<const volatile T> : bits<T> {};
+
+#if HALF_ENABLE_CPP11_CSTDINT
+/// Unsigned integer of (at least) 16 bits width.
+typedef std::uint_least16_t uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <>
+struct bits<float>
+{
+    typedef std::uint_least32_t type;
+};
+
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double>
+{
+    typedef std::uint_least64_t type;
+};
+#else
+/// Unsigned integer of (at least) 16 bits width.
+typedef unsigned short uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <>
+struct bits<float> : conditional<std::numeric_limits<unsigned int>::digits >= 32, unsigned int, unsigned long>
+{
+};
+
+#if HALF_ENABLE_CPP11_LONG_LONG
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double> : conditional<std::numeric_limits<unsigned long>::digits >= 64, unsigned long, unsigned long long>
+{
+};
+#else
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double>
+{
+    typedef unsigned long type;
+};
+#endif
+#endif
+
+/// Tag type for binary construction.
+struct binary_t
+{
+};
+
+/// Tag for binary construction.
+HALF_CONSTEXPR_CONST binary_t binary = binary_t();
+
+/// Temporary half-precision expression.
+/// This class represents a half-precision expression which just stores a single-precision value internally.
+struct expr
+{
+    /// Conversion constructor.
+    /// \param f single-precision value to convert
+    explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {}
+
+    /// Conversion to single-precision.
+    /// \return single precision value representing expression value
+    HALF_CONSTEXPR operator float() const HALF_NOEXCEPT
+    {
+        return value_;
+    }
+
+private:
+    /// Internal expression value stored in single-precision.
+    float value_;
+};
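+
+// NOTE (illustrative, hypothetical names): `bool_type` above enables the tag-dispatch
+// pattern used by the conversion routines below -- a compile-time property of the source
+// type picks either the IEEE bit-level fast path or a portable fallback:
+//
+//     template <typename T> uint16 convert_impl(T v, true_type);  // IEEE bit tricks
+//     template <typename T> uint16 convert_impl(T v, false_type); // generic fallback
+//     template <typename T> uint16 convert(T v)
+//     {
+//         return convert_impl(v, bool_type<std::numeric_limits<T>::is_iec559>());
+//     }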
+
+/// SFINAE helper for generic half-precision functions.
+/// This class template has to be specialized for each valid combination of argument types to provide a corresponding
+/// `type` member equivalent to \a T.
+/// \tparam T type to return
+template <typename T, typename, typename = void, typename = void>
+struct enable
+{
+};
+template <typename T> struct enable<T, half, void, void> { typedef T type; };
+template <typename T> struct enable<T, expr, void, void> { typedef T type; };
+template <typename T> struct enable<T, half, half, void> { typedef T type; };
+template <typename T> struct enable<T, half, expr, void> { typedef T type; };
+template <typename T> struct enable<T, expr, half, void> { typedef T type; };
+template <typename T> struct enable<T, expr, expr, void> { typedef T type; };
+template <typename T> struct enable<T, half, half, half> { typedef T type; };
+template <typename T> struct enable<T, half, half, expr> { typedef T type; };
+template <typename T> struct enable<T, half, expr, half> { typedef T type; };
+template <typename T> struct enable<T, half, expr, expr> { typedef T type; };
+template <typename T> struct enable<T, expr, half, half> { typedef T type; };
+template <typename T> struct enable<T, expr, half, expr> { typedef T type; };
+template <typename T> struct enable<T, expr, expr, half> { typedef T type; };
+template <typename T> struct enable<T, expr, expr, expr> { typedef T type; };
+
+/// Return type for specialized generic 2-argument half-precision functions.
+/// This class template has to be specialized for each valid combination of argument types to provide a corresponding
+/// `type` member denoting the appropriate return type.
+/// \tparam T first argument type
+/// \tparam U second argument type
+template <typename T, typename U>
+struct result : enable<expr, T, U>
+{
+};
+template <>
+struct result<half, half>
+{
+    typedef half type;
+};
+
+/// \name Classification helpers
+/// \{
+
+/// Check for infinity.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if infinity
+/// \retval false else
+template <typename T>
+bool builtin_isinf(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+    return std::isinf(arg);
+#elif defined(_MSC_VER)
+    return !::_finite(static_cast<double>(arg)) && !::_isnan(static_cast<double>(arg));
+#else
+    return arg == std::numeric_limits<T>::infinity() || arg == -std::numeric_limits<T>::infinity();
+#endif
+}
+
+/// Check for NaN.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if not a number
+/// \retval false else
+template <typename T>
+bool builtin_isnan(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+    return std::isnan(arg);
+#elif defined(_MSC_VER)
+    return ::_isnan(static_cast<double>(arg)) != 0;
+#else
+    return arg != arg;
+#endif
+}
+
+/// Check sign.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if signbit set
+/// \retval false else
+template <typename T>
+bool builtin_signbit(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+    return std::signbit(arg);
+#else
+    return arg < T() || (arg == T() && T(1) / arg < T());
+#endif
+}
+
+/// \}
+/// \name Conversion
+/// \{
+
+/// Convert IEEE single-precision to half-precision.
+/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value single-precision value
+/// \return binary representation of half-precision value
+template <std::float_round_style R>
+uint16 float2half_impl(float value, true_type)
+{
+    typedef bits<float>::type uint32;
+    uint32 bits; // = *reinterpret_cast<uint32*>(&value); //violating strict aliasing!
+ std::memcpy(&bits, &value, sizeof(float)); + /* uint16 hbits = (bits>>16) & 0x8000; + bits &= 0x7FFFFFFF; + int exp = bits >> 23; + if(exp == 255) + return hbits | 0x7C00 | (0x3FF&-static_cast((bits&0x7FFFFF)!=0)); + if(exp > 142) + { + if(R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits>>15); + if(R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits>>15); + return hbits | 0x7BFF + (R!=std::round_toward_zero); + } + int g, s; + if(exp > 112) + { + g = (bits>>12) & 1; + s = (bits&0xFFF) != 0; + hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF); + } + else if(exp > 101) + { + int i = 125 - exp; + bits = (bits&0x7FFFFF) | 0x800000; + g = (bits>>i) & 1; + s = (bits&((1L<> (i+1); + } + else + { + g = 0; + s = bits != 0; + } + if(R == std::round_to_nearest) + #if HALF_ROUND_TIES_TO_EVEN + hbits += g & (s|hbits); + #else + hbits += g; + #endif + else if(R == std::round_toward_infinity) + hbits += ~(hbits>>15) & (s|g); + else if(R == std::round_toward_neg_infinity) + hbits += (hbits>>15) & (g|s); + */ + static const uint16 base_table[512] = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, + 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, + 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, + 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, + 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 0xC000, 0xC400, 0xC800, + 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00}; + static const unsigned char shift_table[512] = {24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13};
+    uint16 hbits = base_table[bits >> 23] + static_cast<uint16>((bits & 0x7FFFFF) >> shift_table[bits >> 23]);
+    if (R == std::round_to_nearest)
+        hbits += (((bits & 0x7FFFFF) >> (shift_table[bits >> 23] - 1)) | (((bits >> 23) & 0xFF) == 102))
+                & ((hbits & 0x7C00) != 0x7C00)
+#if HALF_ROUND_TIES_TO_EVEN
+                & (((((static_cast<uint32>(1) << (shift_table[bits >> 23] - 1)) - 1) & bits) != 0) | hbits)
+#endif
+            ;
+    else if (R == std::round_toward_zero)
+        hbits -= ((hbits & 0x7FFF) == 0x7C00) & ~shift_table[bits >> 23];
+    else if (R == std::round_toward_infinity)
+        hbits += ((((bits & 0x7FFFFF & ((static_cast<uint32>(1) << (shift_table[bits >> 23])) - 1)) != 0)
+                      | (((bits >> 23) <= 102) & ((bits >> 23) != 0)))
+                     & (hbits < 0x7C00))
+            - ((hbits == 0xFC00) & ((bits >> 23) != 511));
+    else if (R == std::round_toward_neg_infinity)
+        hbits += ((((bits & 0x7FFFFF & ((static_cast<uint32>(1) << (shift_table[bits >> 23])) - 1)) != 0)
+                      | (((bits >> 23) <= 358) & ((bits >> 23) != 256)))
+                     & (hbits < 0xFC00) & (hbits >> 15))
+            - ((hbits == 0x7C00) & ((bits >> 23) != 255));
+    return hbits;
+}
+
+/// Convert IEEE double-precision to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value double-precision value
+/// \return binary representation of half-precision value
+template <std::float_round_style R>
+uint16 float2half_impl(double value, true_type)
+{
+    typedef bits<float>::type uint32;
+    typedef bits<double>::type uint64;
+    uint64 bits; // = *reinterpret_cast<uint64*>(&value); //violating strict aliasing!
+    std::memcpy(&bits, &value, sizeof(double));
+    uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF;
+    uint16 hbits = (hi >> 16) & 0x8000;
+    hi &= 0x7FFFFFFF;
+    int exp = hi >> 20;
+    if (exp == 2047)
+        return hbits | 0x7C00 | (0x3FF & -static_cast<unsigned>((bits & 0xFFFFFFFFFFFFF) != 0));
+    if (exp > 1038)
+    {
+        if (R == std::round_toward_infinity)
+            return hbits | 0x7C00 - (hbits >> 15);
+        if (R == std::round_toward_neg_infinity)
+            return hbits | 0x7BFF + (hbits >> 15);
+        return hbits | 0x7BFF + (R != std::round_toward_zero);
+    }
+    int g, s = lo != 0;
+    if (exp > 1008)
+    {
+        g = (hi >> 9) & 1;
+        s |= (hi & 0x1FF) != 0;
+        hbits |= ((exp - 1008) << 10) | ((hi >> 10) & 0x3FF);
+    }
+    else if (exp > 997)
+    {
+        int i = 1018 - exp;
+        hi = (hi & 0xFFFFF) | 0x100000;
+        g = (hi >> i) & 1;
+        s |= (hi & ((1L << i) - 1)) != 0;
+        hbits |= hi >> (i + 1);
+    }
+    else
+    {
+        g = 0;
+        s |= hi != 0;
+    }
+    if (R == std::round_to_nearest)
+#if HALF_ROUND_TIES_TO_EVEN
+        hbits += g & (s | hbits);
+#else
+        hbits += g;
+#endif
+    else if (R == std::round_toward_infinity)
+        hbits += ~(hbits >> 15) & (s | g);
+    else if (R == std::round_toward_neg_infinity)
+        hbits += (hbits >> 15) & (g | s);
+    return hbits;
+}
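+
+// NOTE (illustrative, not part of the upstream header): for checking the table-driven
+// fast path above, a compact table-free reference conversion (round-to-nearest-even
+// only; `float_to_half_ref` is a hypothetical name used just for this sketch):
+//
+//     std::uint16_t float_to_half_ref(float f)
+//     {
+//         std::uint32_t b;
+//         std::memcpy(&b, &f, sizeof(b));          // bit pattern without aliasing UB
+//         std::uint16_t sign = (b >> 16) & 0x8000;
+//         b &= 0x7FFFFFFF;
+//         if (b >= 0x7F800000)                     // Inf or NaN (payload dropped)
+//             return sign | 0x7C00 | (b > 0x7F800000 ? 0x200 : 0);
+//         std::uint32_t m = b & 0x7FFFFF;
+//         int e = int(b >> 23) - 127 + 15;         // re-bias 8-bit exponent to 5 bits
+//         if (e >= 31)                             // too large: round to infinity
+//             return sign | 0x7C00;
+//         if (e <= 0)                              // subnormal half or zero
+//         {
+//             if (e < -10)
+//                 return sign;                     // underflows to signed zero
+//             m |= 0x800000;                       // make the leading 1 explicit
+//             int shift = 14 - e;                  // 14..24
+//             std::uint32_t h = m >> shift, rem = m & ((1u << shift) - 1);
+//             std::uint32_t mid = 1u << (shift - 1);
+//             if (rem > mid || (rem == mid && (h & 1)))
+//                 ++h;                             // carry may normalize; still correct
+//             return sign | static_cast<std::uint16_t>(h);
+//         }
+//         std::uint16_t h = sign | static_cast<std::uint16_t>((e << 10) | (m >> 13));
+//         std::uint32_t rem = m & 0x1FFF;          // the 13 dropped mantissa bits
+//         if (rem > 0x1000 || (rem == 0x1000 && (h & 1)))
+//             ++h;                                 // carry into exponent / Inf is correct
+//         return h;
+//     }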
+
+/// Convert non-IEEE floating point to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T source type (builtin floating point type)
+/// \param value floating point value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 float2half_impl(T value, ...)
+{
+    uint16 hbits = static_cast<uint16>(builtin_signbit(value)) << 15;
+    if (value == T())
+        return hbits;
+    if (builtin_isnan(value))
+        return hbits | 0x7FFF;
+    if (builtin_isinf(value))
+        return hbits | 0x7C00;
+    int exp;
+    std::frexp(value, &exp);
+    if (exp > 16)
+    {
+        if (R == std::round_toward_infinity)
+            return hbits | (0x7C00 - (hbits >> 15));
+        else if (R == std::round_toward_neg_infinity)
+            return hbits | (0x7BFF + (hbits >> 15));
+        return hbits | (0x7BFF + (R != std::round_toward_zero));
+    }
+    if (exp < -13)
+        value = std::ldexp(value, 24);
+    else
+    {
+        value = std::ldexp(value, 11 - exp);
+        hbits |= ((exp + 13) << 10);
+    }
+    T ival, frac = std::modf(value, &ival);
+    hbits += static_cast<uint16>(std::abs(static_cast<int>(ival)));
+    if (R == std::round_to_nearest)
+    {
+        frac = std::abs(frac);
+#if HALF_ROUND_TIES_TO_EVEN
+        hbits += (frac > T(0.5)) | ((frac == T(0.5)) & hbits);
+#else
+        hbits += frac >= T(0.5);
+#endif
+    }
+    else if (R == std::round_toward_infinity)
+        hbits += frac > T();
+    else if (R == std::round_toward_neg_infinity)
+        hbits += frac < T();
+    return hbits;
+}
+
+/// Convert floating point to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T source type (builtin floating point type)
+/// \param value floating point value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 float2half(T value)
+{
+    return float2half_impl<R>(
+        value, bool_type<std::numeric_limits<T>::is_iec559 && sizeof(typename bits<T>::type) == sizeof(T)>());
+}
+
+/// Convert integer to half-precision floating point.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam S `true` if value negative, `false` else
+/// \tparam T type to convert (builtin integer type)
+/// \param value non-negative integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, bool S, typename T>
+uint16 int2half_impl(T value)
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+    static_assert(std::is_integral<T>::value, "int to half conversion only supports builtin integer types");
+#endif
+    if (S)
+        value = -value;
+    uint16 bits = S << 15;
+    if (value > 0xFFFF)
+    {
+        if (R == std::round_toward_infinity)
+            bits |= 0x7C00 - S;
+        else if (R == std::round_toward_neg_infinity)
+            bits |= 0x7BFF + S;
+        else
+            bits |= 0x7BFF + (R != std::round_toward_zero);
+    }
+    else if (value)
+    {
+        uint32_t m = value, exp = 24;
+        for (; m < 0x400; m <<= 1, --exp)
+            ;
+        for (; m > 0x7FF; m >>= 1, ++exp)
+            ;
+        bits |= (exp << 10) + m;
+        if (exp > 24)
+        {
+            if (R == std::round_to_nearest)
+                bits += (value >> (exp - 25)) & 1
+#if HALF_ROUND_TIES_TO_EVEN
+                    & (((((1 << (exp - 25)) - 1) & value) != 0) | bits)
+#endif
+                    ;
+            else if (R == std::round_toward_infinity)
+                bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & !S;
+            else if (R == std::round_toward_neg_infinity)
+                bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & S;
+        }
+    }
+    return bits;
+}
+
+/// Convert integer to half-precision floating point.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T type to convert (builtin integer type)
+/// \param value integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 int2half(T value)
+{
+    return (value < 0) ? int2half_impl<R, true>(value) : int2half_impl<R, false>(value);
+}
+
+/// Convert half-precision to IEEE single-precision.
+/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+/// \param value binary representation of half-precision value +/// \return single-precision value +inline float half2float_impl(uint16 value, float, true_type) +{ + typedef bits::type uint32; + /* uint32 bits = static_cast(value&0x8000) << 16; + int abs = value & 0x7FFF; + if(abs) + { + bits |= 0x38000000 << static_cast(abs>=0x7C00); + for(; abs<0x400; abs<<=1,bits-=0x800000) ; + bits += static_cast(abs) << 13; + } + */ + static const uint32 mantissa_table[2048] = {0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, + 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, + 0x35700000, 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, + 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 0x36000000, + 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, + 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 0x36400000, 0x36440000, 0x36480000, + 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, + 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, + 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, + 0x369C0000, 0x369E0000, 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, + 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, + 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 0x36E00000, 0x36E20000, + 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, + 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, + 0x370D0000, 0x370E0000, 0x370F0000, 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, + 0x371F0000, 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 0x37300000, + 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 0x37400000, 0x37410000, 0x37420000, + 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, + 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, + 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, + 0x375E0000, 0x375F0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, + 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, + 0x37790000, 0x377A0000, 0x377B0000, 
0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, + 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, + 0x378E8000, 0x378F0000, 0x378F8000, 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, + 0x37978000, 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, + 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 0x37A00000, + 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, + 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 0x37A80000, 0x37A88000, 0x37A90000, + 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, + 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, + 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, + 0x37B70000, 0x37B78000, 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, + 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, + 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 0x37C80000, 0x37C88000, + 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, + 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, + 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, + 0x37D68000, 0x37D70000, 0x37D78000, 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, + 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, + 0x37DF8000, 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, + 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 0x37E80000, + 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, + 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 0x37F00000, 0x37F08000, 0x37F10000, + 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, + 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, + 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, + 0x37FF0000, 0x37FF8000, 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, + 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, + 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 
0x38078000, 0x3807C000, 0x38080000, 0x38084000, + 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, + 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, + 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, + 0x380F4000, 0x380F8000, 0x380FC000, 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, + 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, + 0x3813C000, 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, + 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 0x38180000, + 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, + 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 0x381C0000, 0x381C4000, 0x381C8000, + 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, + 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, + 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, + 0x38238000, 0x3823C000, 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, + 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, + 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 0x382C0000, 0x382C4000, + 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, + 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 0x38300000, 0x38304000, 0x38308000, 0x3830C000, + 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, + 0x38334000, 0x38338000, 0x3833C000, 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, + 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, + 0x3837C000, 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, + 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 0x383C0000, + 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, + 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 0x38400000, 0x38404000, 0x38408000, + 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, + 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, + 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, + 0x38478000, 0x3847C000, 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, + 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, + 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 
0x38500000, 0x38504000, + 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, + 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 0x38540000, 0x38544000, 0x38548000, 0x3854C000, + 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, + 0x38574000, 0x38578000, 0x3857C000, 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, + 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, + 0x385BC000, 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, + 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 0x38600000, + 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, + 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 0x38640000, 0x38644000, 0x38648000, + 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, + 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, + 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, + 0x386B8000, 0x386BC000, 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, + 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, + 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 0x38740000, 0x38744000, + 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, + 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 0x38780000, 0x38784000, 0x38788000, 0x3878C000, + 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, + 0x387B4000, 0x387B8000, 0x387BC000, 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, + 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, + 0x387FC000, 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 0x38020000, + 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 0x38040000, 0x38042000, 0x38044000, + 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, + 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, + 0x3807C000, 0x3807E000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, + 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, + 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 0x380C0000, 0x380C2000, + 
0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, + 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, + 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, + 0x380FA000, 0x380FC000, 0x380FE000, 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, + 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, + 0x3811E000, 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 0x38140000, + 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 0x38160000, 0x38162000, 0x38164000, + 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, + 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, + 0x3819C000, 0x3819E000, 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, + 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, + 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 0x381E0000, 0x381E2000, + 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, + 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, + 0x3821A000, 0x3821C000, 0x3821E000, 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, + 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, + 0x3823E000, 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 0x38260000, + 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 0x38280000, 0x38282000, 0x38284000, + 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, + 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, + 0x382BC000, 0x382BE000, 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, + 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, + 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 
0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, + 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, + 0x3833A000, 0x3833C000, 0x3833E000, 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, + 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, + 0x3835E000, 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 0x38380000, + 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 0x383A0000, 0x383A2000, 0x383A4000, + 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, + 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, + 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, + 0x383DC000, 0x383DE000, 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, + 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, + 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, + 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, + 0x3845A000, 0x3845C000, 0x3845E000, 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, + 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, + 0x3847E000, 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 0x384A0000, + 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, + 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 0x384C0000, 0x384C2000, 0x384C4000, + 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, + 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, + 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, + 0x384FC000, 0x384FE000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, + 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, + 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 
0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, + 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, + 0x3857A000, 0x3857C000, 0x3857E000, 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, + 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, + 0x3859E000, 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, + 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 0x385C0000, + 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, + 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 0x385E0000, 0x385E2000, 0x385E4000, + 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, + 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, + 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, + 0x3861C000, 0x3861E000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, + 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, + 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, + 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, + 0x3869A000, 0x3869C000, 0x3869E000, 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, + 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, + 0x386BE000, 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, + 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 0x386E0000, + 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, + 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 0x38700000, 0x38702000, 0x38704000, + 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, + 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, + 0x3873C000, 0x3873E000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, + 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, + 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 
0x38790000, 0x38792000, 0x38794000,
+        0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000,
+        0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000,
+        0x387BA000, 0x387BC000, 0x387BE000, 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000,
+        0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000,
+        0x387DE000, 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000,
+        0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000};
+    static const uint32 exponent_table[64] = {0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000,
+        0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000,
+        0x07800000, 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000,
+        0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 0x80000000,
+        0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000,
+        0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 0x88000000, 0x88800000, 0x89000000,
+        0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000,
+        0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000};
+    static const unsigned short offset_table[64] = {0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+        1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+        1024, 1024, 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+        1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024};
+    uint32 bits = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + exponent_table[value >> 10];
+    // return *reinterpret_cast<float*>(&bits); //violating strict aliasing!
+    float out;
+    std::memcpy(&out, &bits, sizeof(float));
+    return out;
+}
+
+/// Convert half-precision to IEEE double-precision.
+/// \param value binary representation of half-precision value
+/// \return double-precision value
+inline double half2float_impl(uint16 value, double, true_type)
+{
+    typedef bits<float>::type uint32;
+    typedef bits<double>::type uint64;
+    uint32 hi = static_cast<uint32>(value & 0x8000) << 16;
+    int abs = value & 0x7FFF;
+    if (abs)
+    {
+        hi |= 0x3F000000 << static_cast<unsigned>(abs >= 0x7C00);
+        for (; abs < 0x400; abs <<= 1, hi -= 0x100000)
+            ;
+        hi += static_cast<uint32>(abs) << 10;
+    }
+    uint64 bits = static_cast<uint64>(hi) << 32;
+    // return *reinterpret_cast<double*>(&bits); //violating strict aliasing!
+    double out;
+    std::memcpy(&out, &bits, sizeof(double));
+    return out;
+}
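+
+// NOTE (worked example of the single-precision table decode above): for value = 0x3C00
+// (half 1.0), offset_table[0x3C00 >> 10] == 1024, mantissa_table[1024 + 0x000] ==
+// 0x38000000 and exponent_table[0x3C00 >> 10] == 0x07800000, so bits == 0x3F800000,
+// i.e. 1.0f.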
+
+/// Convert half-precision to non-IEEE floating point.
+/// \tparam T type to convert to (builtin floating point type)
+/// \param value binary representation of half-precision value
+/// \return floating point value
+template <typename T>
+T half2float_impl(uint16 value, T, ...)
+{
+    T out;
+    int abs = value & 0x7FFF;
+    if (abs > 0x7C00)
+        out = std::numeric_limits<T>::has_quiet_NaN ? std::numeric_limits<T>::quiet_NaN() : T();
+    else if (abs == 0x7C00)
+        out = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() : std::numeric_limits<T>::max();
+    else if (abs > 0x3FF)
+        out = std::ldexp(static_cast<T>((abs & 0x3FF) | 0x400), (abs >> 10) - 25);
+    else
+        out = std::ldexp(static_cast<T>(abs), -24);
+    return (value & 0x8000) ? -out : out;
+}
+
+/// Convert half-precision to floating point.
+/// \tparam T type to convert to (builtin floating point type)
+/// \param value binary representation of half-precision value
+/// \return floating point value
+template <typename T>
+T half2float(uint16 value)
+{
+    return half2float_impl(
+        value, T(), bool_type<std::numeric_limits<T>::is_iec559 && sizeof(typename bits<T>::type) == sizeof(T)>());
+}
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign
+/// bits) \param value binary representation of half-precision value \return integral value
+template <std::float_round_style R, bool E, typename T>
+T half2int_impl(uint16 value)
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+    static_assert(std::is_integral<T>::value, "half to int conversion only supports builtin integer types");
+#endif
+    uint32_t e = value & 0x7FFF;
+    if (e >= 0x7C00)
+        return (value & 0x8000) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max();
+    if (e < 0x3800)
+    {
+        if (R == std::round_toward_infinity)
+            return T(~(value >> 15) & (e != 0));
+        else if (R == std::round_toward_neg_infinity)
+            return -T(value > 0x8000);
+        return T();
+    }
+    uint32_t m = (value & 0x3FF) | 0x400;
+    e >>= 10;
+    if (e < 25)
+    {
+        if (R == std::round_to_nearest)
+            m += (1 << (24 - e)) - (~(m >> (25 - e)) & E);
+        else if (R == std::round_toward_infinity)
+            m += ((value >> 15) - 1) & ((1 << (25 - e)) - 1U);
+        else if (R == std::round_toward_neg_infinity)
+            m += -(value >> 15) & ((1 << (25 - e)) - 1U);
+        m >>= 25 - e;
+    }
+    else
+        m <<= e - 25;
+    return (value & 0x8000) ? -static_cast<T>(m) : static_cast<T>(m);
+}
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign
+/// bits) \param value binary representation of half-precision value \return integral value
+template <std::float_round_style R, typename T>
+T half2int(uint16 value)
+{
+    return half2int_impl<R, HALF_ROUND_TIES_TO_EVEN, T>(value);
+}
+
+/// Convert half-precision floating point to integer using round-to-nearest-away-from-zero.
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign
+/// bits) \param value binary representation of half-precision value \return integral value
+template <typename T>
+T half2int_up(uint16 value)
+{
+    return half2int_impl<std::round_to_nearest, 0, T>(value);
+}
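+
+// NOTE (illustrative): assuming the ties-away default (HALF_ROUND_TIES_TO_EVEN == 0),
+//
+//     half2int<std::round_to_nearest, int>(0x3E00);  // half 1.5  ->  2
+//     half2int<std::round_toward_zero, int>(0x3E00); // half 1.5  ->  1
+//     half2int_up<int>(0xC100);                      // half -2.5 -> -3 (ties away)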
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R, bool E>
+uint16 round_half_impl(uint16 value)
+{
+    uint32_t e = value & 0x7FFF;
+    uint16 result = value;
+    if (e < 0x3C00)
+    {
+        result &= 0x8000;
+        if (R == std::round_to_nearest)
+            result |= 0x3C00U & -(e >= (0x3800 + E));
+        else if (R == std::round_toward_infinity)
+            result |= 0x3C00U & -(~(value >> 15) & (e != 0));
+        else if (R == std::round_toward_neg_infinity)
+            result |= 0x3C00U & -(value > 0x8000);
+    }
+    else if (e < 0x6400)
+    {
+        e = 25 - (e >> 10);
+        uint32_t mask = (1 << e) - 1;
+        if (R == std::round_to_nearest)
+            result += (1 << (e - 1)) - (~(result >> e) & E);
+        else if (R == std::round_toward_infinity)
+            result += mask & ((value >> 15) - 1);
+        else if (R == std::round_toward_neg_infinity)
+            result += mask & -(value >> 15);
+        result &= ~mask;
+    }
+    return result;
+}
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R>
+uint16 round_half(uint16 value)
+{
+    return round_half_impl<R, HALF_ROUND_TIES_TO_EVEN>(value);
+}
+
+/// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero.
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+inline uint16 round_half_up(uint16 value)
+{
+    return round_half_impl<std::round_to_nearest, 0>(value);
+}
+/// \}
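+
+// NOTE (illustrative):
+//
+//     round_half<std::round_to_nearest>(0x3E00);  // half 1.5  -> 0x4000 (2.0)
+//     round_half<std::round_toward_zero>(0x3E00); // half 1.5  -> 0x3C00 (1.0)
+//     round_half_up(0xBE00);                      // half -1.5 -> 0xC000 (-2.0)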
+
+struct functions;
+template <typename>
+struct unary_specialized;
+template <typename, typename>
+struct binary_specialized;
+template <typename, typename, std::float_round_style>
+struct half_caster;
+} // namespace detail
+
+/// Half-precision floating point type.
+/// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and
+/// conversions. It is implicitly convertible to single-precision floating point, which causes arithmetic expressions
+/// and functions with mixed-type operands to be of the most precise operand type. Additionally all arithmetic
+/// operations (and many mathematical functions) are carried out in single-precision internally. All conversions from
+/// single- to half-precision are done using the library's default rounding mode, but temporary results inside chained
+/// arithmetic expressions are kept in single-precision as long as possible (while of course still maintaining a strong
+/// half-precision type).
+///
+/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and
+/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which
+/// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the
+/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not necessarily have to be
+/// of exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will
+/// most probably not involve any additional "magic" or padding beyond the simple binary representation of the
+/// underlying 16-bit IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual
+/// size of 16 bits if your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this
+/// should be the case on nearly any reasonable platform.
+///
+/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable
+/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation.
+class half
+{
+    friend struct detail::functions;
+    friend struct detail::unary_specialized<half>;
+    friend struct detail::binary_specialized<half, half>;
+    template <typename, typename, std::float_round_style>
+    friend struct detail::half_caster;
+    friend class std::numeric_limits<half>;
+#if HALF_ENABLE_CPP11_HASH
+    friend struct std::hash<half>;
+#endif
+#if HALF_ENABLE_CPP11_USER_LITERALS
+    friend half literal::operator"" _h(long double);
+#endif
+
+public:
+    /// Default constructor.
+    /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics
+    /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
+    HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
+
+    /// Copy constructor.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to copy from
+    half(detail::expr rhs)
+        : data_(detail::float2half<round_style>(static_cast<float>(rhs)))
+    {
+    }
+
+    /// Conversion constructor.
+    /// \param rhs float to convert
+    explicit half(float rhs)
+        : data_(detail::float2half<round_style>(rhs))
+    {
+    }
+
+    /// Conversion to single-precision.
+    /// \return single precision value representing expression value
+    operator float() const
+    {
+        return detail::half2float<float>(data_);
+    }
+
+    /// Assignment operator.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to copy from
+    /// \return reference to this half
+    half& operator=(detail::expr rhs)
+    {
+        return *this = static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to add
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator+=(T rhs)
+    {
+        return *this += static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to subtract
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator-=(T rhs)
+    {
+        return *this -= static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to multiply with
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator*=(T rhs)
+    {
+        return *this *= static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to divide by
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator/=(T rhs)
+    {
+        return *this /= static_cast<float>(rhs);
+    }
+
+    /// Assignment operator.
+    /// \param rhs single-precision value to copy from
+    /// \return reference to this half
+    half& operator=(float rhs)
+    {
+        data_ = detail::float2half<round_style>(rhs);
+        return *this;
+    }
+
+    /// Arithmetic assignment.
+    /// \param rhs single-precision value to add
+    /// \return reference to this half
+    half& operator+=(float rhs)
+    {
+        data_ = detail::float2half<round_style>(detail::half2float<float>(data_) + rhs);
+        return *this;
+    }
+ /// \param rhs single-precision value to subtract + /// \return reference to this half + half& operator-=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) - rhs); + return *this; + } + + /// Arithmetic assignment. + /// \param rhs single-precision value to multiply with + /// \return reference to this half + half& operator*=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) * rhs); + return *this; + } + + /// Arithmetic assignment. + /// \param rhs single-precision value to divide by + /// \return reference to this half + half& operator/=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) / rhs); + return *this; + } + + /// Prefix increment. + /// \return incremented half value + half& operator++() + { + return *this += 1.0f; + } + + /// Prefix decrement. + /// \return decremented half value + half& operator--() + { + return *this -= 1.0f; + } + + /// Postfix increment. + /// \return non-incremented half value + half operator++(int) + { + half out(*this); + ++*this; + return out; + } + + /// Postfix decrement. + /// \return non-decremented half value + half operator--(int) + { + half out(*this); + --*this; + return out; + } + +private: + /// Rounding mode to use + static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE); + + /// Constructor. + /// \param bits binary representation to set half to + HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) HALF_NOEXCEPT : data_(bits) {} + + /// Internal binary representation + detail::uint16 data_; +}; + +#if HALF_ENABLE_CPP11_USER_LITERALS +namespace literal +{ +/// Half literal. +/// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due +/// to rather involved conversions. +/// \param value literal value +/// \return half with given value (if representable) +inline half operator"" _h(long double value) +{ + return half(detail::binary, detail::float2half(value)); +} +} // namespace literal +#endif + +namespace detail +{ +/// Wrapper implementing unspecialized half-precision functions. +struct functions +{ + /// Addition implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision sum stored in single-precision + static expr plus(float x, float y) + { + return expr(x + y); + } + + /// Subtraction implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision difference stored in single-precision + static expr minus(float x, float y) + { + return expr(x - y); + } + + /// Multiplication implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision product stored in single-precision + static expr multiplies(float x, float y) + { + return expr(x * y); + } + + /// Division implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision quotient stored in single-precision + static expr divides(float x, float y) + { + return expr(x / y); + } + + /// Output implementation. + /// \param out stream to write to + /// \param arg value to write + /// \return reference to stream + template + static std::basic_ostream& write(std::basic_ostream& out, float arg) + { + return out << arg; + } + + /// Input implementation. 
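+
+// Streaming goes through float in both directions, as the write()/read()
+// wrappers here show; a short sketch (assumes the operator<< and operator>>
+// overloads defined later in this header):
+//
+//     #include <sstream>
+//     half_float::half h;
+//     std::istringstream("3.14159") >> h;   // parsed as float, then rounded
+//     // h now holds the nearest representable half, 3.140625
+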
+ /// \param in stream to read from + /// \param arg half to read into + /// \return reference to stream + template + static std::basic_istream& read(std::basic_istream& in, half& arg) + { + float f; + if (in >> f) + arg = f; + return in; + } + + /// Modulo implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr fmod(float x, float y) + { + return expr(std::fmod(x, y)); + } + + /// Remainder implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr remainder(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::remainder(x, y)); +#else + if (builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + float ax = std::fabs(x), ay = std::fabs(y); + if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if (ay >= 65536.0f) + return expr(x); + if (ax == ay) + return expr(builtin_signbit(x) ? -0.0f : 0.0f); + ax = std::fmod(ax, ay + ay); + float y2 = 0.5f * ay; + if (ax > y2) + { + ax -= ay; + if (ax >= y2) + ax -= ay; + } + return expr(builtin_signbit(x) ? -ax : ax); +#endif + } + + /// Remainder implementation. + /// \param x first operand + /// \param y second operand + /// \param quo address to store quotient bits at + /// \return Half-precision division remainder stored in single-precision + static expr remquo(float x, float y, int* quo) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::remquo(x, y, quo)); +#else + if (builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + bool sign = builtin_signbit(x), qsign = static_cast(sign ^ builtin_signbit(y)); + float ax = std::fabs(x), ay = std::fabs(y); + if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if (ay >= 65536.0f) + return expr(x); + if (ax == ay) + return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f); + ax = std::fmod(ax, 8.0f * ay); + int cquo = 0; + if (ax >= 4.0f * ay) + { + ax -= 4.0f * ay; + cquo += 4; + } + if (ax >= 2.0f * ay) + { + ax -= 2.0f * ay; + cquo += 2; + } + float y2 = 0.5f * ay; + if (ax > y2) + { + ax -= ay; + ++cquo; + if (ax >= y2) + { + ax -= ay; + ++cquo; + } + } + return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax); +#endif + } + + /// Positive difference implementation. + /// \param x first operand + /// \param y second operand + /// \return Positive difference stored in single-precision + static expr fdim(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fdim(x, y)); +#else + return expr((x <= y) ? 0.0f : (x - y)); +#endif + } + + /// Fused multiply-add implementation. + /// \param x first operand + /// \param y second operand + /// \param z third operand + /// \return \a x * \a y + \a z stored in single-precision + static expr fma(float x, float y, float z) + { +#if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF) + return expr(std::fma(x, y, z)); +#else + return expr(x * y + z); +#endif + } + + /// Get NaN. + /// \return Half-precision quiet NaN + static half nanh() + { + return half(binary, 0x7FFF); + } + + /// Exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr exp(float arg) + { + return expr(std::exp(arg)); + } + + /// Exponential implementation. 
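+
+// Note on the fma() fallback above: the product of two half values is always
+// exactly representable in single precision (11-bit significands), so the
+// x * y + z fallback still performs only one rounding in float:
+//
+//     half_float::half x(1.5f), y(2.0f), z(0.25f);
+//     half_float::half r = half_float::fma(x, y, z);   // 3.25, exact
+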
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr expm1(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::expm1(arg)); +#else + return expr(static_cast(std::exp(static_cast(arg)) - 1.0)); +#endif + } + + /// Binary exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr exp2(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::exp2(arg)); +#else + return expr(static_cast(std::exp(arg * 0.69314718055994530941723212145818))); +#endif + } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log(float arg) + { + return expr(std::log(arg)); + } + + /// Common logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log10(float arg) + { + return expr(std::log10(arg)); + } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log1p(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::log1p(arg)); +#else + return expr(static_cast(std::log(1.0 + arg))); +#endif + } + + /// Binary logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log2(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::log2(arg)); +#else + return expr(static_cast(std::log(static_cast(arg)) * 1.4426950408889634073599246810019)); +#endif + } + + /// Square root implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sqrt(float arg) + { + return expr(std::sqrt(arg)); + } + + /// Cubic root implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cbrt(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::cbrt(arg)); +#else + if (builtin_isnan(arg) || builtin_isinf(arg)) + return expr(arg); + return expr(builtin_signbit(arg) ? -static_cast(std::pow(-static_cast(arg), 1.0 / 3.0)) + : static_cast(std::pow(static_cast(arg), 1.0 / 3.0))); +#endif + } + + /// Hypotenuse implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-preicision + static expr hypot(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::hypot(x, y)); +#else + return expr((builtin_isinf(x) || builtin_isinf(y)) + ? std::numeric_limits::infinity() + : static_cast(std::sqrt(static_cast(x) * x + static_cast(y) * y))); +#endif + } + + /// Power implementation. + /// \param base value to exponentiate + /// \param exp power to expontiate to + /// \return function value stored in single-preicision + static expr pow(float base, float exp) + { + return expr(std::pow(base, exp)); + } + + /// Sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sin(float arg) + { + return expr(std::sin(arg)); + } + + /// Cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cos(float arg) + { + return expr(std::cos(arg)); + } + + /// Tan implementation. 
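+
+// The exp2()/log2() fallbacks above rely on the identities 2^x = e^(x ln 2)
+// and log2(x) = ln(x) / ln 2; the hard-coded constants are ln 2 and 1/ln 2
+// to double precision. Quick check:
+//
+//     half_float::exp2(half_float::half(3.0f));   // 8
+//     half_float::log2(half_float::half(8.0f));   // 3
+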
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr tan(float arg) + { + return expr(std::tan(arg)); + } + + /// Arc sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr asin(float arg) + { + return expr(std::asin(arg)); + } + + /// Arc cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr acos(float arg) + { + return expr(std::acos(arg)); + } + + /// Arc tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr atan(float arg) + { + return expr(std::atan(arg)); + } + + /// Arc tangent implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-preicision + static expr atan2(float x, float y) + { + return expr(std::atan2(x, y)); + } + + /// Hyperbolic sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sinh(float arg) + { + return expr(std::sinh(arg)); + } + + /// Hyperbolic cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cosh(float arg) + { + return expr(std::cosh(arg)); + } + + /// Hyperbolic tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr tanh(float arg) + { + return expr(std::tanh(arg)); + } + + /// Hyperbolic area sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr asinh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::asinh(arg)); +#else + return expr((arg == -std::numeric_limits::infinity()) + ? arg + : static_cast(std::log(arg + std::sqrt(arg * arg + 1.0)))); +#endif + } + + /// Hyperbolic area cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr acosh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::acosh(arg)); +#else + return expr((arg < -1.0f) ? std::numeric_limits::quiet_NaN() + : static_cast(std::log(arg + std::sqrt(arg * arg - 1.0)))); +#endif + } + + /// Hyperbolic area tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr atanh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::atanh(arg)); +#else + return expr(static_cast(0.5 * std::log((1.0 + arg) / (1.0 - arg)))); +#endif + } + + /// Error function implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr erf(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::erf(arg)); +#else + return expr(static_cast(erf(static_cast(arg)))); +#endif + } + + /// Complementary implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr erfc(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::erfc(arg)); +#else + return expr(static_cast(1.0 - erf(static_cast(arg)))); +#endif + } + + /// Gamma logarithm implementation. 
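+
+// The inverse-hyperbolic fallbacks above use the closed forms
+// asinh(x) = ln(x + sqrt(x^2 + 1)), acosh(x) = ln(x + sqrt(x^2 - 1)) and
+// atanh(x) = 0.5 * ln((1 + x) / (1 - x)), evaluated through double-precision
+// std::log so the final half result loses no precision.
+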
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr lgamma(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::lgamma(arg)); +#else + if (builtin_isinf(arg)) + return expr(std::numeric_limits::infinity()); + if (arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if (f == 0.0f) + return expr(std::numeric_limits::infinity()); + return expr(static_cast(1.1447298858494001741434273513531 + - std::log(std::abs(std::sin(3.1415926535897932384626433832795 * f))) - lgamma(1.0 - arg))); + } + return expr(static_cast(lgamma(static_cast(arg)))); +#endif + } + + /// Gamma implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr tgamma(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::tgamma(arg)); +#else + if (arg == 0.0f) + return builtin_signbit(arg) ? expr(-std::numeric_limits::infinity()) + : expr(std::numeric_limits::infinity()); + if (arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if (f == 0.0f) + return expr(std::numeric_limits::quiet_NaN()); + double value = 3.1415926535897932384626433832795 + / (std::sin(3.1415926535897932384626433832795 * f) * std::exp(lgamma(1.0 - arg))); + return expr(static_cast((std::fmod(i, 2.0f) == 0.0f) ? -value : value)); + } + if (builtin_isinf(arg)) + return expr(arg); + return expr(static_cast(std::exp(lgamma(static_cast(arg))))); +#endif + } + + /// Floor implementation. + /// \param arg value to round + /// \return rounded value + static half floor(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Ceiling implementation. + /// \param arg value to round + /// \return rounded value + static half ceil(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Truncation implementation. + /// \param arg value to round + /// \return rounded value + static half trunc(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half round(half arg) + { + return half(binary, round_half_up(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lround(half arg) + { + return detail::half2int_up(arg.data_); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half rint(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lrint(half arg) + { + return detail::half2int(arg.data_); + } + +#if HALF_ENABLE_CPP11_LONG_LONG + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llround(half arg) + { + return detail::half2int_up(arg.data_); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llrint(half arg) + { + return detail::half2int(arg.data_); + } +#endif + + /// Decompression implementation. 
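+
+// The lgamma()/tgamma() fallbacks above handle negative arguments with the
+// reflection formula Gamma(x) * Gamma(1 - x) = pi / sin(pi * x); the constant
+// 1.1447298858494... is ln(pi). A double-precision sanity check:
+//
+//     double x = -2.5;
+//     double lhs = std::lgamma(x);
+//     double rhs = std::log(3.141592653589793)
+//                  - std::log(std::fabs(std::sin(3.141592653589793 * x)))
+//                  - std::lgamma(1.0 - x);
+//     // lhs and rhs agree up to rounding error
+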
+ /// \param arg number to decompress + /// \param exp address to store exponent at + /// \return normalized significant + static half frexp(half arg, int* exp) + { + int m = arg.data_ & 0x7FFF, e = -14; + if (m >= 0x7C00 || !m) + return *exp = 0, arg; + for (; m < 0x400; m <<= 1, --e) + ; + return *exp = e + (m >> 10), half(binary, (arg.data_ & 0x8000) | 0x3800 | (m & 0x3FF)); + } + + /// Decompression implementation. + /// \param arg number to decompress + /// \param iptr address to store integer part at + /// \return fractional part + static half modf(half arg, half* iptr) + { + uint32_t e = arg.data_ & 0x7FFF; + if (e >= 0x6400) + return *iptr = arg, half(binary, arg.data_ & (0x8000U | -(e > 0x7C00))); + if (e < 0x3C00) + return iptr->data_ = arg.data_ & 0x8000, arg; + e >>= 10; + uint32_t mask = (1 << (25 - e)) - 1, m = arg.data_ & mask; + iptr->data_ = arg.data_ & ~mask; + if (!m) + return half(binary, arg.data_ & 0x8000); + for (; m < 0x400; m <<= 1, --e) + ; + return half(binary, static_cast((arg.data_ & 0x8000) | (e << 10) | (m & 0x3FF))); + } + + /// Scaling implementation. + /// \param arg number to scale + /// \param exp power of two to scale by + /// \return scaled number + static half scalbln(half arg, long exp) + { + uint32_t m = arg.data_ & 0x7FFF; + if (m >= 0x7C00 || !m) + return arg; + for (; m < 0x400; m <<= 1, --exp) + ; + exp += m >> 10; + uint16 value = arg.data_ & 0x8000; + if (exp > 30) + { + if (half::round_style == std::round_toward_zero) + value |= 0x7BFF; + else if (half::round_style == std::round_toward_infinity) + value |= 0x7C00 - (value >> 15); + else if (half::round_style == std::round_toward_neg_infinity) + value |= 0x7BFF + (value >> 15); + else + value |= 0x7C00; + } + else if (exp > 0) + value |= (exp << 10) | (m & 0x3FF); + else if (exp > -11) + { + m = (m & 0x3FF) | 0x400; + if (half::round_style == std::round_to_nearest) + { + m += 1 << -exp; +#if HALF_ROUND_TIES_TO_EVEN + m -= (m >> (1 - exp)) & 1; +#endif + } + else if (half::round_style == std::round_toward_infinity) + m += ((value >> 15) - 1) & ((1 << (1 - exp)) - 1U); + else if (half::round_style == std::round_toward_neg_infinity) + m += -(value >> 15) & ((1 << (1 - exp)) - 1U); + value |= m >> (1 - exp); + } + else if (half::round_style == std::round_toward_infinity) + value -= (value >> 15) - 1; + else if (half::round_style == std::round_toward_neg_infinity) + value += value >> 15; + return half(binary, value); + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static int ilogb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if (!abs) + return FP_ILOGB0; + if (abs < 0x7C00) + { + int exp = (abs >> 10) - 15; + if (abs < 0x400) + for (; abs < 0x200; abs <<= 1, --exp) + ; + return exp; + } + if (abs > 0x7C00) + return FP_ILOGBNAN; + return INT_MAX; + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static half logb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if (!abs) + return half(binary, 0xFC00); + if (abs < 0x7C00) + { + int exp = (abs >> 10) - 15; + if (abs < 0x400) + for (; abs < 0x200; abs <<= 1, --exp) + ; + uint16 bits = (exp < 0) << 15; + if (exp) + { + uint32_t m = std::abs(exp) << 6, e = 18; + for (; m < 0x400; m <<= 1, --e) + ; + bits |= (e << 10) + m; + } + return half(binary, bits); + } + if (abs > 0x7C00) + return arg; + return half(binary, 0x7C00); + } + + /// Enumeration implementation. 
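+
+// The manipulation functions above operate directly on the IEEE 754 binary16
+// layout: 1 sign bit, 5 exponent bits (bias 15), 10 significand bits.
+// Decoding sketch:
+//
+//     uint16_t bits = 0x3C00;           // 1.0 in binary16
+//     int sign = bits >> 15;            // 0
+//     int bexp = (bits >> 10) & 0x1F;   // 15, i.e. unbiased exponent 0
+//     int mant = bits & 0x3FF;          // 0, i.e. significand 1.0
+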
+ /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nextafter(half from, half to) + { + uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; + if (fabs > 0x7C00) + return from; + if (tabs > 0x7C00 || from.data_ == to.data_ || !(fabs | tabs)) + return to; + if (!fabs) + return half(binary, (to.data_ & 0x8000) + 1); + bool lt = ((fabs == from.data_) ? static_cast(fabs) : -static_cast(fabs)) + < ((tabs == to.data_) ? static_cast(tabs) : -static_cast(tabs)); + return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lt)) << 1) - 1); + } + + /// Enumeration implementation. + /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nexttoward(half from, long double to) + { + if (isnan(from)) + return from; + long double lfrom = static_cast(from); + if (builtin_isnan(to) || lfrom == to) + return half(static_cast(to)); + if (!(from.data_ & 0x7FFF)) + return half(binary, (static_cast(builtin_signbit(to)) << 15) + 1); + return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lfrom < to)) << 1) - 1); + } + + /// Sign implementation + /// \param x first operand + /// \param y second operand + /// \return composed value + static half copysign(half x, half y) + { + return half(binary, x.data_ ^ ((x.data_ ^ y.data_) & 0x8000)); + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + static int fpclassify(half arg) + { + uint32_t abs = arg.data_ & 0x7FFF; + return abs + ? ((abs > 0x3FF) ? ((abs >= 0x7C00) ? ((abs > 0x7C00) ? FP_NAN : FP_INFINITE) : FP_NORMAL) : FP_SUBNORMAL) + : FP_ZERO; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if finite number + /// \retval false else + static bool isfinite(half arg) + { + return (arg.data_ & 0x7C00) != 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + static bool isinf(half arg) + { + return (arg.data_ & 0x7FFF) == 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if not a number + /// \retval false else + static bool isnan(half arg) + { + return (arg.data_ & 0x7FFF) > 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if normal number + /// \retval false else + static bool isnormal(half arg) + { + return ((arg.data_ & 0x7C00) != 0) & ((arg.data_ & 0x7C00) != 0x7C00); + } + + /// Sign bit implementation. + /// \param arg value to check + /// \retval true if signed + /// \retval false if unsigned + static bool signbit(half arg) + { + return (arg.data_ & 0x8000) != 0; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands equal + /// \retval false else + static bool isequal(half x, half y) + { + return (x.data_ == y.data_ || !((x.data_ | y.data_) & 0x7FFF)) && !isnan(x); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands not equal + /// \retval false else + static bool isnotequal(half x, half y) + { + return (x.data_ != y.data_ && ((x.data_ | y.data_) & 0x7FFF)) || isnan(x); + } + + /// Comparison implementation. 
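+
+// The ordered comparisons below map the sign-magnitude half encoding onto a
+// signed integer ordering: non-negative values compare by their raw bits,
+// negative values by the negated magnitude (NaNs are rejected first).
+// Standalone sketch of the key function:
+//
+//     int key(uint16_t h)
+//     {
+//         int m = h & 0x7FFF;
+//         return (h & 0x8000) ? -m : m;
+//     }
+//     // for non-NaN x, y:  x < y  iff  key(x) < key(y)
+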
+ /// \param x first operand + /// \param y second operand + /// \retval true if \a x > \a y + /// \retval false else + static bool isgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x >= \a y + /// \retval false else + static bool isgreaterequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) >= ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x < \a y + /// \retval false else + static bool isless(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x <= \a y + /// \retval false else + static bool islessequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) <= ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if either \a x > \a y nor \a x < \a y + /// \retval false else + static bool islessgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00 || yabs > 0x7C00) + return false; + int a = (xabs == x.data_) ? xabs : -xabs, b = (yabs == y.data_) ? yabs : -yabs; + return a < b || a > b; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operand unordered + /// \retval false else + static bool isunordered(half x, half y) + { + return isnan(x) || isnan(y); + } + +private: + static double erf(double arg) + { + if (builtin_isinf(arg)) + return (arg < 0.0) ? -1.0 : 1.0; + double x2 = arg * arg, ax2 = 0.147 * x2, + value = std::sqrt(1.0 - std::exp(-x2 * (1.2732395447351626861510701069801 + ax2) / (1.0 + ax2))); + return builtin_signbit(arg) ? -value : value; + } + + static double lgamma(double arg) + { + double v = 1.0; + for (; arg < 8.0; ++arg) + v *= arg; + double w = 1.0 / (arg * arg); + return (((((((-0.02955065359477124183006535947712 * w + 0.00641025641025641025641025641026) * w + + -0.00191752691752691752691752691753) + * w + + 8.4175084175084175084175084175084e-4) + * w + + -5.952380952380952380952380952381e-4) + * w + + 7.9365079365079365079365079365079e-4) + * w + + -0.00277777777777777777777777777778) + * w + + 0.08333333333333333333333333333333) + / arg + + 0.91893853320467274178032973640562 - std::log(v) - arg + (arg - 0.5) * std::log(arg); + } +}; + +/// Wrapper for unary half-precision functions needing specialization for individual argument types. +/// \tparam T argument type +template +struct unary_specialized +{ + /// Negation implementation. + /// \param arg value to negate + /// \return negated value + static HALF_CONSTEXPR half negate(half arg) + { + return half(binary, arg.data_ ^ 0x8000); + } + + /// Absolute value implementation. 
+ /// \param arg function argument + /// \return absolute value + static half fabs(half arg) + { + return half(binary, arg.data_ & 0x7FFF); + } +}; +template <> +struct unary_specialized +{ + static HALF_CONSTEXPR expr negate(float arg) + { + return expr(-arg); + } + static expr fabs(float arg) + { + return expr(std::fabs(arg)); + } +}; + +/// Wrapper for binary half-precision functions needing specialization for individual argument types. +/// \tparam T first argument type +/// \tparam U first argument type +template +struct binary_specialized +{ + /// Minimum implementation. + /// \param x first operand + /// \param y second operand + /// \return minimum value + static expr fmin(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fmin(x, y)); +#else + if (builtin_isnan(x)) + return expr(y); + if (builtin_isnan(y)) + return expr(x); + return expr(std::min(x, y)); +#endif + } + + /// Maximum implementation. + /// \param x first operand + /// \param y second operand + /// \return maximum value + static expr fmax(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fmax(x, y)); +#else + if (builtin_isnan(x)) + return expr(y); + if (builtin_isnan(y)) + return expr(x); + return expr(std::max(x, y)); +#endif + } +}; +template <> +struct binary_specialized +{ + static half fmin(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00) + return y; + if (yabs > 0x7C00) + return x; + return (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)) ? y : x; + } + static half fmax(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00) + return y; + if (yabs > 0x7C00) + return x; + return (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)) ? y : x; + } +}; + +/// Helper class for half casts. +/// This class template has to be specialized for all valid cast argument to define an appropriate static `cast` member +/// function and a corresponding `type` member denoting its return type. 
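+
+// fmin()/fmax() above follow the IEEE convention that a single NaN operand
+// is ignored and the other operand is returned:
+//
+//     half_float::half a(1.0f);
+//     half_float::half n = half_float::nanh("");
+//     half_float::fmin(a, n);   // 1.0, the non-NaN operand
+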
+/// \tparam T destination type +/// \tparam U source type +/// \tparam R rounding mode to use +template +struct half_caster +{ +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported"); +#endif + + static half cast(U arg) + { + return cast_impl(arg, is_float()); + }; + +private: + static half cast_impl(U arg, true_type) + { + return half(binary, float2half(arg)); + } + static half cast_impl(U arg, false_type) + { + return half(binary, int2half(arg)); + } +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); +#endif + + static T cast(half arg) + { + return cast_impl(arg, is_float()); + } + +private: + static T cast_impl(half arg, true_type) + { + return half2float(arg.data_); + } + static T cast_impl(half arg, false_type) + { + return half2int(arg.data_); + } +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); +#endif + + static T cast(expr arg) + { + return cast_impl(arg, is_float()); + } + +private: + static T cast_impl(float arg, true_type) + { + return static_cast(arg); + } + static T cast_impl(half arg, false_type) + { + return half2int(arg.data_); + } +}; +template +struct half_caster +{ + static half cast(half arg) + { + return arg; + } +}; +template +struct half_caster : half_caster +{ +}; + +/// \name Comparison operators +/// \{ + +/// Comparison for equality. +/// \param x first operand +/// \param y second operand +/// \retval true if operands equal +/// \retval false else +template +typename enable::type operator==(T x, U y) +{ + return functions::isequal(x, y); +} + +/// Comparison for inequality. +/// \param x first operand +/// \param y second operand +/// \retval true if operands not equal +/// \retval false else +template +typename enable::type operator!=(T x, U y) +{ + return functions::isnotequal(x, y); +} + +/// Comparison for less than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less than \a y +/// \retval false else +template +typename enable::type operator<(T x, U y) +{ + return functions::isless(x, y); +} + +/// Comparison for greater than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater than \a y +/// \retval false else +template +typename enable::type operator>(T x, U y) +{ + return functions::isgreater(x, y); +} + +/// Comparison for less equal. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less equal \a y +/// \retval false else +template +typename enable::type operator<=(T x, U y) +{ + return functions::islessequal(x, y); +} + +/// Comparison for greater equal. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater equal \a y +/// \retval false else +template +typename enable::type operator>=(T x, U y) +{ + return functions::isgreaterequal(x, y); +} + +/// \} +/// \name Arithmetic operators +/// \{ + +/// Add halfs. +/// \param x left operand +/// \param y right operand +/// \return sum of half expressions +template +typename enable::type operator+(T x, U y) +{ + return functions::plus(x, y); +} + +/// Subtract halfs. 
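+
+// Because the arithmetic operators above return a single-precision expr
+// proxy, a chained expression is rounded to half only once, on the final
+// assignment:
+//
+//     half_float::half a(0.1f), b(0.2f), c(0.3f);
+//     half_float::half r = a + b - c;   // one float-to-half rounding, not three
+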
+/// \param x left operand +/// \param y right operand +/// \return difference of half expressions +template +typename enable::type operator-(T x, U y) +{ + return functions::minus(x, y); +} + +/// Multiply halfs. +/// \param x left operand +/// \param y right operand +/// \return product of half expressions +template +typename enable::type operator*(T x, U y) +{ + return functions::multiplies(x, y); +} + +/// Divide halfs. +/// \param x left operand +/// \param y right operand +/// \return quotient of half expressions +template +typename enable::type operator/(T x, U y) +{ + return functions::divides(x, y); +} + +/// Identity. +/// \param arg operand +/// \return uncahnged operand +template +HALF_CONSTEXPR typename enable::type operator+(T arg) +{ + return arg; +} + +/// Negation. +/// \param arg operand +/// \return negated operand +template +HALF_CONSTEXPR typename enable::type operator-(T arg) +{ + return unary_specialized::negate(arg); +} + +/// \} +/// \name Input and output +/// \{ + +/// Output operator. +/// \param out output stream to write into +/// \param arg half expression to write +/// \return reference to output stream +template +typename enable&, T>::type operator<<(std::basic_ostream& out, T arg) +{ + return functions::write(out, arg); +} + +/// Input operator. +/// \param in input stream to read from +/// \param arg half to read into +/// \return reference to input stream +template +std::basic_istream& operator>>(std::basic_istream& in, half& arg) +{ + return functions::read(in, arg); +} + +/// \} +/// \name Basic mathematical operations +/// \{ + +/// Absolute value. +/// \param arg operand +/// \return absolute value of \a arg +// template typename enable::type abs(T arg) { return unary_specialized::fabs(arg); } +inline half abs(half arg) +{ + return unary_specialized::fabs(arg); +} +inline expr abs(expr arg) +{ + return unary_specialized::fabs(arg); +} + +/// Absolute value. +/// \param arg operand +/// \return absolute value of \a arg +// template typename enable::type fabs(T arg) { return unary_specialized::fabs(arg); } +inline half fabs(half arg) +{ + return unary_specialized::fabs(arg); +} +inline expr fabs(expr arg) +{ + return unary_specialized::fabs(arg); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \return remainder of floating point division. +// template typename enable::type fmod(T x, U y) { return functions::fmod(x, y); } +inline expr fmod(half x, half y) +{ + return functions::fmod(x, y); +} +inline expr fmod(half x, expr y) +{ + return functions::fmod(x, y); +} +inline expr fmod(expr x, half y) +{ + return functions::fmod(x, y); +} +inline expr fmod(expr x, expr y) +{ + return functions::fmod(x, y); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \return remainder of floating point division. +// template typename enable::type remainder(T x, U y) { return +// functions::remainder(x, y); } +inline expr remainder(half x, half y) +{ + return functions::remainder(x, y); +} +inline expr remainder(half x, expr y) +{ + return functions::remainder(x, y); +} +inline expr remainder(expr x, half y) +{ + return functions::remainder(x, y); +} +inline expr remainder(expr x, expr y) +{ + return functions::remainder(x, y); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \param quo address to store some bits of quotient at +/// \return remainder of floating point division. 
+// template typename enable::type remquo(T x, U y, int *quo) { return +// functions::remquo(x, y, quo); } +inline expr remquo(half x, half y, int* quo) +{ + return functions::remquo(x, y, quo); +} +inline expr remquo(half x, expr y, int* quo) +{ + return functions::remquo(x, y, quo); +} +inline expr remquo(expr x, half y, int* quo) +{ + return functions::remquo(x, y, quo); +} +inline expr remquo(expr x, expr y, int* quo) +{ + return functions::remquo(x, y, quo); +} + +/// Fused multiply add. +/// \param x first operand +/// \param y second operand +/// \param z third operand +/// \return ( \a x * \a y ) + \a z rounded as one operation. +// template typename enable::type fma(T x, U y, V z) { return +// functions::fma(x, y, z); } +inline expr fma(half x, half y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, half y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, expr y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, expr y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, half y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, half y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, expr y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, expr y, expr z) +{ + return functions::fma(x, y, z); +} + +/// Maximum of half expressions. +/// \param x first operand +/// \param y second operand +/// \return maximum of operands +// template typename result::type fmax(T x, U y) { return +// binary_specialized::fmax(x, y); } +inline half fmax(half x, half y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(half x, expr y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(expr x, half y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(expr x, expr y) +{ + return binary_specialized::fmax(x, y); +} + +/// Minimum of half expressions. +/// \param x first operand +/// \param y second operand +/// \return minimum of operands +// template typename result::type fmin(T x, U y) { return +// binary_specialized::fmin(x, y); } +inline half fmin(half x, half y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(half x, expr y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(expr x, half y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(expr x, expr y) +{ + return binary_specialized::fmin(x, y); +} + +/// Positive difference. +/// \param x first operand +/// \param y second operand +/// \return \a x - \a y or 0 if difference negative +// template typename enable::type fdim(T x, U y) { return functions::fdim(x, y); } +inline expr fdim(half x, half y) +{ + return functions::fdim(x, y); +} +inline expr fdim(half x, expr y) +{ + return functions::fdim(x, y); +} +inline expr fdim(expr x, half y) +{ + return functions::fdim(x, y); +} +inline expr fdim(expr x, expr y) +{ + return functions::fdim(x, y); +} + +/// Get NaN value. +/// \return quiet NaN +inline half nanh(const char*) +{ + return functions::nanh(); +} + +/// \} +/// \name Exponential functions +/// \{ + +/// Exponential function. +/// \param arg function argument +/// \return e raised to \a arg +// template typename enable::type exp(T arg) { return functions::exp(arg); } +inline expr exp(half arg) +{ + return functions::exp(arg); +} +inline expr exp(expr arg) +{ + return functions::exp(arg); +} + +/// Exponential minus one. 
+/// \param arg function argument +/// \return e raised to \a arg subtracted by 1 +// template typename enable::type expm1(T arg) { return functions::expm1(arg); } +inline expr expm1(half arg) +{ + return functions::expm1(arg); +} +inline expr expm1(expr arg) +{ + return functions::expm1(arg); +} + +/// Binary exponential. +/// \param arg function argument +/// \return 2 raised to \a arg +// template typename enable::type exp2(T arg) { return functions::exp2(arg); } +inline expr exp2(half arg) +{ + return functions::exp2(arg); +} +inline expr exp2(expr arg) +{ + return functions::exp2(arg); +} + +/// Natural logorithm. +/// \param arg function argument +/// \return logarithm of \a arg to base e +// template typename enable::type log(T arg) { return functions::log(arg); } +inline expr log(half arg) +{ + return functions::log(arg); +} +inline expr log(expr arg) +{ + return functions::log(arg); +} + +/// Common logorithm. +/// \param arg function argument +/// \return logarithm of \a arg to base 10 +// template typename enable::type log10(T arg) { return functions::log10(arg); } +inline expr log10(half arg) +{ + return functions::log10(arg); +} +inline expr log10(expr arg) +{ + return functions::log10(arg); +} + +/// Natural logorithm. +/// \param arg function argument +/// \return logarithm of \a arg plus 1 to base e +// template typename enable::type log1p(T arg) { return functions::log1p(arg); } +inline expr log1p(half arg) +{ + return functions::log1p(arg); +} +inline expr log1p(expr arg) +{ + return functions::log1p(arg); +} + +/// Binary logorithm. +/// \param arg function argument +/// \return logarithm of \a arg to base 2 +// template typename enable::type log2(T arg) { return functions::log2(arg); } +inline expr log2(half arg) +{ + return functions::log2(arg); +} +inline expr log2(expr arg) +{ + return functions::log2(arg); +} + +/// \} +/// \name Power functions +/// \{ + +/// Square root. +/// \param arg function argument +/// \return square root of \a arg +// template typename enable::type sqrt(T arg) { return functions::sqrt(arg); } +inline expr sqrt(half arg) +{ + return functions::sqrt(arg); +} +inline expr sqrt(expr arg) +{ + return functions::sqrt(arg); +} + +/// Cubic root. +/// \param arg function argument +/// \return cubic root of \a arg +// template typename enable::type cbrt(T arg) { return functions::cbrt(arg); } +inline expr cbrt(half arg) +{ + return functions::cbrt(arg); +} +inline expr cbrt(expr arg) +{ + return functions::cbrt(arg); +} + +/// Hypotenuse function. +/// \param x first argument +/// \param y second argument +/// \return square root of sum of squares without internal over- or underflows +// template typename enable::type hypot(T x, U y) { return functions::hypot(x, y); +//} +inline expr hypot(half x, half y) +{ + return functions::hypot(x, y); +} +inline expr hypot(half x, expr y) +{ + return functions::hypot(x, y); +} +inline expr hypot(expr x, half y) +{ + return functions::hypot(x, y); +} +inline expr hypot(expr x, expr y) +{ + return functions::hypot(x, y); +} + +/// Power function. 
+/// \param base first argument +/// \param exp second argument +/// \return \a base raised to \a exp +// template typename enable::type pow(T base, U exp) { return functions::pow(base, +// exp); } +inline expr pow(half base, half exp) +{ + return functions::pow(base, exp); +} +inline expr pow(half base, expr exp) +{ + return functions::pow(base, exp); +} +inline expr pow(expr base, half exp) +{ + return functions::pow(base, exp); +} +inline expr pow(expr base, expr exp) +{ + return functions::pow(base, exp); +} + +/// \} +/// \name Trigonometric functions +/// \{ + +/// Sine function. +/// \param arg function argument +/// \return sine value of \a arg +// template typename enable::type sin(T arg) { return functions::sin(arg); } +inline expr sin(half arg) +{ + return functions::sin(arg); +} +inline expr sin(expr arg) +{ + return functions::sin(arg); +} + +/// Cosine function. +/// \param arg function argument +/// \return cosine value of \a arg +// template typename enable::type cos(T arg) { return functions::cos(arg); } +inline expr cos(half arg) +{ + return functions::cos(arg); +} +inline expr cos(expr arg) +{ + return functions::cos(arg); +} + +/// Tangent function. +/// \param arg function argument +/// \return tangent value of \a arg +// template typename enable::type tan(T arg) { return functions::tan(arg); } +inline expr tan(half arg) +{ + return functions::tan(arg); +} +inline expr tan(expr arg) +{ + return functions::tan(arg); +} + +/// Arc sine. +/// \param arg function argument +/// \return arc sine value of \a arg +// template typename enable::type asin(T arg) { return functions::asin(arg); } +inline expr asin(half arg) +{ + return functions::asin(arg); +} +inline expr asin(expr arg) +{ + return functions::asin(arg); +} + +/// Arc cosine function. +/// \param arg function argument +/// \return arc cosine value of \a arg +// template typename enable::type acos(T arg) { return functions::acos(arg); } +inline expr acos(half arg) +{ + return functions::acos(arg); +} +inline expr acos(expr arg) +{ + return functions::acos(arg); +} + +/// Arc tangent function. +/// \param arg function argument +/// \return arc tangent value of \a arg +// template typename enable::type atan(T arg) { return functions::atan(arg); } +inline expr atan(half arg) +{ + return functions::atan(arg); +} +inline expr atan(expr arg) +{ + return functions::atan(arg); +} + +/// Arc tangent function. +/// \param x first argument +/// \param y second argument +/// \return arc tangent value +// template typename enable::type atan2(T x, U y) { return functions::atan2(x, y); +//} +inline expr atan2(half x, half y) +{ + return functions::atan2(x, y); +} +inline expr atan2(half x, expr y) +{ + return functions::atan2(x, y); +} +inline expr atan2(expr x, half y) +{ + return functions::atan2(x, y); +} +inline expr atan2(expr x, expr y) +{ + return functions::atan2(x, y); +} + +/// \} +/// \name Hyperbolic functions +/// \{ + +/// Hyperbolic sine. +/// \param arg function argument +/// \return hyperbolic sine value of \a arg +// template typename enable::type sinh(T arg) { return functions::sinh(arg); } +inline expr sinh(half arg) +{ + return functions::sinh(arg); +} +inline expr sinh(expr arg) +{ + return functions::sinh(arg); +} + +/// Hyperbolic cosine. 
+/// \param arg function argument +/// \return hyperbolic cosine value of \a arg +// template typename enable::type cosh(T arg) { return functions::cosh(arg); } +inline expr cosh(half arg) +{ + return functions::cosh(arg); +} +inline expr cosh(expr arg) +{ + return functions::cosh(arg); +} + +/// Hyperbolic tangent. +/// \param arg function argument +/// \return hyperbolic tangent value of \a arg +// template typename enable::type tanh(T arg) { return functions::tanh(arg); } +inline expr tanh(half arg) +{ + return functions::tanh(arg); +} +inline expr tanh(expr arg) +{ + return functions::tanh(arg); +} + +/// Hyperbolic area sine. +/// \param arg function argument +/// \return area sine value of \a arg +// template typename enable::type asinh(T arg) { return functions::asinh(arg); } +inline expr asinh(half arg) +{ + return functions::asinh(arg); +} +inline expr asinh(expr arg) +{ + return functions::asinh(arg); +} + +/// Hyperbolic area cosine. +/// \param arg function argument +/// \return area cosine value of \a arg +// template typename enable::type acosh(T arg) { return functions::acosh(arg); } +inline expr acosh(half arg) +{ + return functions::acosh(arg); +} +inline expr acosh(expr arg) +{ + return functions::acosh(arg); +} + +/// Hyperbolic area tangent. +/// \param arg function argument +/// \return area tangent value of \a arg +// template typename enable::type atanh(T arg) { return functions::atanh(arg); } +inline expr atanh(half arg) +{ + return functions::atanh(arg); +} +inline expr atanh(expr arg) +{ + return functions::atanh(arg); +} + +/// \} +/// \name Error and gamma functions +/// \{ + +/// Error function. +/// \param arg function argument +/// \return error function value of \a arg +// template typename enable::type erf(T arg) { return functions::erf(arg); } +inline expr erf(half arg) +{ + return functions::erf(arg); +} +inline expr erf(expr arg) +{ + return functions::erf(arg); +} + +/// Complementary error function. +/// \param arg function argument +/// \return 1 minus error function value of \a arg +// template typename enable::type erfc(T arg) { return functions::erfc(arg); } +inline expr erfc(half arg) +{ + return functions::erfc(arg); +} +inline expr erfc(expr arg) +{ + return functions::erfc(arg); +} + +/// Natural logarithm of gamma function. +/// \param arg function argument +/// \return natural logarith of gamma function for \a arg +// template typename enable::type lgamma(T arg) { return functions::lgamma(arg); } +inline expr lgamma(half arg) +{ + return functions::lgamma(arg); +} +inline expr lgamma(expr arg) +{ + return functions::lgamma(arg); +} + +/// Gamma function. +/// \param arg function argument +/// \return gamma function value of \a arg +// template typename enable::type tgamma(T arg) { return functions::tgamma(arg); } +inline expr tgamma(half arg) +{ + return functions::tgamma(arg); +} +inline expr tgamma(expr arg) +{ + return functions::tgamma(arg); +} + +/// \} +/// \name Rounding +/// \{ + +/// Nearest integer not less than half value. +/// \param arg half to round +/// \return nearest integer not less than \a arg +// template typename enable::type ceil(T arg) { return functions::ceil(arg); } +inline half ceil(half arg) +{ + return functions::ceil(arg); +} +inline half ceil(expr arg) +{ + return functions::ceil(arg); +} + +/// Nearest integer not greater than half value. 
+/// \param arg half to round +/// \return nearest integer not greater than \a arg +// template typename enable::type floor(T arg) { return functions::floor(arg); } +inline half floor(half arg) +{ + return functions::floor(arg); +} +inline half floor(expr arg) +{ + return functions::floor(arg); +} + +/// Nearest integer not greater in magnitude than half value. +/// \param arg half to round +/// \return nearest integer not greater in magnitude than \a arg +// template typename enable::type trunc(T arg) { return functions::trunc(arg); } +inline half trunc(half arg) +{ + return functions::trunc(arg); +} +inline half trunc(expr arg) +{ + return functions::trunc(arg); +} + +/// Nearest integer. +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type round(T arg) { return functions::round(arg); } +inline half round(half arg) +{ + return functions::round(arg); +} +inline half round(expr arg) +{ + return functions::round(arg); +} + +/// Nearest integer. +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type lround(T arg) { return functions::lround(arg); } +inline long lround(half arg) +{ + return functions::lround(arg); +} +inline long lround(expr arg) +{ + return functions::lround(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type nearbyint(T arg) { return functions::nearbyint(arg); } +inline half nearbyint(half arg) +{ + return functions::rint(arg); +} +inline half nearbyint(expr arg) +{ + return functions::rint(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type rint(T arg) { return functions::rint(arg); } +inline half rint(half arg) +{ + return functions::rint(arg); +} +inline half rint(expr arg) +{ + return functions::rint(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type lrint(T arg) { return functions::lrint(arg); } +inline long lrint(half arg) +{ + return functions::lrint(arg); +} +inline long lrint(expr arg) +{ + return functions::lrint(arg); +} +#if HALF_ENABLE_CPP11_LONG_LONG +/// Nearest integer. +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type llround(T arg) { return functions::llround(arg); } +inline long long llround(half arg) +{ + return functions::llround(arg); +} +inline long long llround(expr arg) +{ + return functions::llround(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type llrint(T arg) { return functions::llrint(arg); } +inline long long llrint(half arg) +{ + return functions::llrint(arg); +} +inline long long llrint(expr arg) +{ + return functions::llrint(arg); +} +#endif + +/// \} +/// \name Floating point manipulation +/// \{ + +/// Decompress floating point number. 
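+
+// Of the rounding wrappers above, round()/lround() always tie away from
+// zero, while rint()/nearbyint()/lrint() honor the rounding mode selected
+// via HALF_ROUND_STYLE:
+//
+//     half_float::half x(2.5f);
+//     half_float::round(x);   // 3.0 (ties away from zero)
+//     half_float::trunc(x);   // 2.0
+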
+/// \param arg number to decompress +/// \param exp address to store exponent at +/// \return significant in range [0.5, 1) +// template typename enable::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); } +inline half frexp(half arg, int* exp) +{ + return functions::frexp(arg, exp); +} +inline half frexp(expr arg, int* exp) +{ + return functions::frexp(arg, exp); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); +//} +inline half ldexp(half arg, int exp) +{ + return functions::scalbln(arg, exp); +} +inline half ldexp(expr arg, int exp) +{ + return functions::scalbln(arg, exp); +} + +/// Extract integer and fractional parts. +/// \param arg number to decompress +/// \param iptr address to store integer part at +/// \return fractional part +// template typename enable::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); +//} +inline half modf(half arg, half* iptr) +{ + return functions::modf(arg, iptr); +} +inline half modf(expr arg, half* iptr) +{ + return functions::modf(arg, iptr); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); +//} +inline half scalbn(half arg, int exp) +{ + return functions::scalbln(arg, exp); +} +inline half scalbn(expr arg, int exp) +{ + return functions::scalbln(arg, exp); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbln(T arg, long exp) { return functions::scalbln(arg, +// exp); +//} +inline half scalbln(half arg, long exp) +{ + return functions::scalbln(arg, exp); +} +inline half scalbln(expr arg, long exp) +{ + return functions::scalbln(arg, exp); +} + +/// Extract exponent. +/// \param arg number to query +/// \return floating point exponent +/// \retval FP_ILOGB0 for zero +/// \retval FP_ILOGBNAN for NaN +/// \retval MAX_INT for infinity +// template typename enable::type ilogb(T arg) { return functions::ilogb(arg); } +inline int ilogb(half arg) +{ + return functions::ilogb(arg); +} +inline int ilogb(expr arg) +{ + return functions::ilogb(arg); +} + +/// Extract exponent. +/// \param arg number to query +/// \return floating point exponent +// template typename enable::type logb(T arg) { return functions::logb(arg); } +inline half logb(half arg) +{ + return functions::logb(arg); +} +inline half logb(expr arg) +{ + return functions::logb(arg); +} + +/// Next representable value. 
+/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +// template typename enable::type nextafter(T from, U to) { return +// functions::nextafter(from, to); } +inline half nextafter(half from, half to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(half from, expr to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(expr from, half to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(expr from, expr to) +{ + return functions::nextafter(from, to); +} + +/// Next representable value. +/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +// template typename enable::type nexttoward(T from, long double to) { return +// functions::nexttoward(from, to); } +inline half nexttoward(half from, long double to) +{ + return functions::nexttoward(from, to); +} +inline half nexttoward(expr from, long double to) +{ + return functions::nexttoward(from, to); +} + +/// Take sign. +/// \param x value to change sign for +/// \param y value to take sign from +/// \return value equal to \a x in magnitude and to \a y in sign +// template typename enable::type copysign(T x, U y) { return +// functions::copysign(x, y); } +inline half copysign(half x, half y) +{ + return functions::copysign(x, y); +} +inline half copysign(half x, expr y) +{ + return functions::copysign(x, y); +} +inline half copysign(expr x, half y) +{ + return functions::copysign(x, y); +} +inline half copysign(expr x, expr y) +{ + return functions::copysign(x, y); +} + +/// \} +/// \name Floating point classification +/// \{ + +/// Classify floating point value. +/// \param arg number to classify +/// \retval FP_ZERO for positive and negative zero +/// \retval FP_SUBNORMAL for subnormal numbers +/// \retval FP_INFINITY for positive and negative infinity +/// \retval FP_NAN for NaNs +/// \retval FP_NORMAL for all other (normal) values +// template typename enable::type fpclassify(T arg) { return functions::fpclassify(arg); } +inline int fpclassify(half arg) +{ + return functions::fpclassify(arg); +} +inline int fpclassify(expr arg) +{ + return functions::fpclassify(arg); +} + +/// Check if finite number. +/// \param arg number to check +/// \retval true if neither infinity nor NaN +/// \retval false else +// template typename enable::type isfinite(T arg) { return functions::isfinite(arg); } +inline bool isfinite(half arg) +{ + return functions::isfinite(arg); +} +inline bool isfinite(expr arg) +{ + return functions::isfinite(arg); +} + +/// Check for infinity. +/// \param arg number to check +/// \retval true for positive or negative infinity +/// \retval false else +// template typename enable::type isinf(T arg) { return functions::isinf(arg); } +inline bool isinf(half arg) +{ + return functions::isinf(arg); +} +inline bool isinf(expr arg) +{ + return functions::isinf(arg); +} + +/// Check for NaN. +/// \param arg number to check +/// \retval true for NaNs +/// \retval false else +// template typename enable::type isnan(T arg) { return functions::isnan(arg); } +inline bool isnan(half arg) +{ + return functions::isnan(arg); +} +inline bool isnan(expr arg) +{ + return functions::isnan(arg); +} + +/// Check if normal number. 
+/// \param arg number to check
+/// \retval true if normal number
+/// \retval false if either subnormal, zero, infinity or NaN
+// template<typename T> typename enable<bool, T>::type isnormal(T arg) { return functions::isnormal(arg); }
+inline bool isnormal(half arg)
+{
+    return functions::isnormal(arg);
+}
+inline bool isnormal(expr arg)
+{
+    return functions::isnormal(arg);
+}
+
+/// Check sign.
+/// \param arg number to check
+/// \retval true for negative number
+/// \retval false for positive number
+// template<typename T> typename enable<bool, T>::type signbit(T arg) { return functions::signbit(arg); }
+inline bool signbit(half arg)
+{
+    return functions::signbit(arg);
+}
+inline bool signbit(expr arg)
+{
+    return functions::signbit(arg);
+}
+
+/// \}
+/// \name Comparison
+/// \{
+
+/// Comparison for greater than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater than \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isgreater(T x, U y) { return
+// functions::isgreater(x, y); }
+inline bool isgreater(half x, half y)
+{
+    return functions::isgreater(x, y);
+}
+inline bool isgreater(half x, expr y)
+{
+    return functions::isgreater(x, y);
+}
+inline bool isgreater(expr x, half y)
+{
+    return functions::isgreater(x, y);
+}
+inline bool isgreater(expr x, expr y)
+{
+    return functions::isgreater(x, y);
+}
+
+/// Comparison for greater equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater equal \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isgreaterequal(T x, U y) { return
+// functions::isgreaterequal(x, y); }
+inline bool isgreaterequal(half x, half y)
+{
+    return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(half x, expr y)
+{
+    return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(expr x, half y)
+{
+    return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(expr x, expr y)
+{
+    return functions::isgreaterequal(x, y);
+}
+
+/// Comparison for less than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less than \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isless(T x, U y) { return functions::isless(x,
+// y);
+//}
+inline bool isless(half x, half y)
+{
+    return functions::isless(x, y);
+}
+inline bool isless(half x, expr y)
+{
+    return functions::isless(x, y);
+}
+inline bool isless(expr x, half y)
+{
+    return functions::isless(x, y);
+}
+inline bool isless(expr x, expr y)
+{
+    return functions::isless(x, y);
+}
+
+/// Comparison for less equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less equal \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type islessequal(T x, U y) { return
+// functions::islessequal(x, y); }
+inline bool islessequal(half x, half y)
+{
+    return functions::islessequal(x, y);
+}
+inline bool islessequal(half x, expr y)
+{
+    return functions::islessequal(x, y);
+}
+inline bool islessequal(expr x, half y)
+{
+    return functions::islessequal(x, y);
+}
+inline bool islessequal(expr x, expr y)
+{
+    return functions::islessequal(x, y);
+}
+
+/// Comparison for less or greater.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if either less or greater
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type islessgreater(T x, U y) { return
+// functions::islessgreater(x, y); }
+inline bool islessgreater(half x, half y)
+{
+    return functions::islessgreater(x, y);
+}
+inline bool islessgreater(half x, expr y)
+{
+    return functions::islessgreater(x, y);
+}
+inline bool islessgreater(expr x, half y)
+{
+    return functions::islessgreater(x, y);
+}
+inline bool islessgreater(expr x, expr y)
+{
+    return functions::islessgreater(x, y);
+}
+
+/// Check if unordered.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if unordered (one or two NaN operands)
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isunordered(T x, U y) { return
+// functions::isunordered(x, y); }
+inline bool isunordered(half x, half y)
+{
+    return functions::isunordered(x, y);
+}
+inline bool isunordered(half x, expr y)
+{
+    return functions::isunordered(x, y);
+}
+inline bool isunordered(expr x, half y)
+{
+    return functions::isunordered(x, y);
+}
+inline bool isunordered(expr x, expr y)
+{
+    return functions::isunordered(x, y);
+}
+
+/// \}
+/// \name Casting
+/// \{
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
+/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+/// It uses the default rounding mode.
+///
+/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
+/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
+/// error and casting between [half](\ref half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, typename U>
+T half_cast(U arg)
+{
+    return half_caster<T, U>::cast(arg);
+}
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
+/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+///
+/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
+/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
+/// error and casting between [half](\ref half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam R rounding mode to use.
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, std::float_round_style R, typename U>
+T half_cast(U arg)
+{
+    return half_caster<T, U, R>::cast(arg);
+}
+/// \}
+} // namespace detail
+
+using detail::operator==;
+using detail::operator!=;
+using detail::operator<;
+using detail::operator>;
+using detail::operator<=;
+using detail::operator>=;
+using detail::operator+;
+using detail::operator-;
+using detail::operator*;
+using detail::operator/;
+using detail::operator<<;
+using detail::operator>>;
+
+using detail::abs;
+using detail::acos;
+using detail::acosh;
+using detail::asin;
+using detail::asinh;
+using detail::atan;
+using detail::atan2;
+using detail::atanh;
+using detail::cbrt;
+using detail::ceil;
+using detail::cos;
+using detail::cosh;
+using detail::erf;
+using detail::erfc;
+using detail::exp;
+using detail::exp2;
+using detail::expm1;
+using detail::fabs;
+using detail::fdim;
+using detail::floor;
+using detail::fma;
+using detail::fmax;
+using detail::fmin;
+using detail::fmod;
+using detail::hypot;
+using detail::lgamma;
+using detail::log;
+using detail::log10;
+using detail::log1p;
+using detail::log2;
+using detail::lrint;
+using detail::lround;
+using detail::nanh;
+using detail::nearbyint;
+using detail::pow;
+using detail::remainder;
+using detail::remquo;
+using detail::rint;
+using detail::round;
+using detail::sin;
+using detail::sinh;
+using detail::sqrt;
+using detail::tan;
+using detail::tanh;
+using detail::tgamma;
+using detail::trunc;
+#if HALF_ENABLE_CPP11_LONG_LONG
+using detail::llrint;
+using detail::llround;
+#endif
+using detail::copysign;
+using detail::fpclassify;
+using detail::frexp;
+using detail::ilogb;
+using detail::isfinite;
+using detail::isgreater;
+using detail::isgreaterequal;
+using detail::isinf;
+using detail::isless;
+using detail::islessequal;
+using detail::islessgreater;
+using detail::isnan;
+using detail::isnormal;
+using detail::isunordered;
+using detail::ldexp;
+using detail::logb;
+using detail::modf;
+using detail::nextafter;
+using detail::nexttoward;
+using detail::scalbln;
+using detail::scalbn;
+using detail::signbit;
+
+using detail::half_cast;
+} // namespace half_float
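+
+// Usage sketch (editorial addition, not part of the original header): half_cast
+// converts directly, optionally with an explicit rounding mode:
+//
+//   half_float::half h = half_float::half_cast<half_float::half>(3.14159);
+//   int i = half_float::half_cast<int, std::round_toward_zero>(h);   // i == 3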
+
+/// Extensions to the C++ standard library.
+namespace std
+{
+/// Numeric limits for half-precision floats.
+/// Because of the underlying single-precision implementation of many operations, it inherits some properties from
+/// `std::numeric_limits<float>`.
+template <>
+class numeric_limits<half_float::half> : public numeric_limits<float>
+{
+public:
+    /// Supports signed values.
+    static HALF_CONSTEXPR_CONST bool is_signed = true;
+
+    /// Is not exact.
+    static HALF_CONSTEXPR_CONST bool is_exact = false;
+
+    /// Doesn't provide modulo arithmetic.
+    static HALF_CONSTEXPR_CONST bool is_modulo = false;
+
+    /// IEEE conformant.
+    static HALF_CONSTEXPR_CONST bool is_iec559 = true;
+
+    /// Supports infinity.
+    static HALF_CONSTEXPR_CONST bool has_infinity = true;
+
+    /// Supports quiet NaNs.
+    static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
+
+    /// Supports subnormal values.
+    static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
+
+    /// Rounding mode.
+    /// Due to the mix of internal single-precision computations (using the rounding mode of the underlying
+    /// single-precision implementation) with the rounding mode of the single-to-half conversions, the actual rounding
+    /// mode might be `std::round_indeterminate` if the default half-precision rounding mode doesn't match the
+    /// single-precision rounding mode.
+    static HALF_CONSTEXPR_CONST float_round_style round_style
+        = (std::numeric_limits<float>::round_style == half_float::half::round_style) ? half_float::half::round_style
+                                                                                     : round_indeterminate;
+
+    /// Significant digits.
+    static HALF_CONSTEXPR_CONST int digits = 11;
+
+    /// Significant decimal digits.
+    static HALF_CONSTEXPR_CONST int digits10 = 3;
+
+    /// Required decimal digits to represent all possible values.
+    static HALF_CONSTEXPR_CONST int max_digits10 = 5;
+
+    /// Number base.
+    static HALF_CONSTEXPR_CONST int radix = 2;
+
+    /// One more than smallest exponent.
+    static HALF_CONSTEXPR_CONST int min_exponent = -13;
+
+    /// Smallest normalized representable power of 10.
+    static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
+
+    /// One more than largest exponent.
+    static HALF_CONSTEXPR_CONST int max_exponent = 16;
+
+    /// Largest finitely representable power of 10.
+    static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
+
+    /// Smallest positive normal value.
+    static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x0400);
+    }
+
+    /// Smallest finite value.
+    static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0xFBFF);
+    }
+
+    /// Largest finite value.
+    static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7BFF);
+    }
+
+    /// Difference between one and next representable value.
+    static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x1400);
+    }
+
+    /// Maximum rounding error.
+    static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, (round_style == std::round_to_nearest) ? 0x3800 : 0x3C00);
+    }
+
+    /// Positive infinity.
+    static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7C00);
+    }
+
+    /// Quiet NaN.
+    static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7FFF);
+    }
+
+    /// Signalling NaN.
+    static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7DFF);
+    }
+
+    /// Smallest positive subnormal value.
+    static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x0001);
+    }
+};
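+
+// Usage sketch (editorial addition, not part of the original header): the
+// specialization above makes the standard queries work for half, e.g.
+//
+//   std::numeric_limits<half_float::half>::epsilon()   // 2^-10
+//   std::numeric_limits<half_float::half>::max()       // 65504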
+
+#if HALF_ENABLE_CPP11_HASH
+/// Hash function for half-precision floats.
+/// This is only defined if C++11 `std::hash` is supported and enabled.
+template <>
+struct hash<half_float::half> //: unary_function<half_float::half, size_t>
+{
+    /// Type of function argument.
+    typedef half_float::half argument_type;
+
+    /// Function return type.
+    typedef size_t result_type;
+
+    /// Compute hash function.
+    /// \param arg half to hash
+    /// \return hash value
+    result_type operator()(argument_type arg) const
+    {
+        return hash<half_float::detail::uint16>()(static_cast<unsigned int>(arg.data_) & -(arg.data_ != 0x8000));
+    }
+};
+#endif
+} // namespace std
+
+#undef HALF_CONSTEXPR
+#undef HALF_CONSTEXPR_CONST
+#undef HALF_NOEXCEPT
+#undef HALF_NOTHROW
+#ifdef HALF_POP_WARNINGS
+#pragma warning(pop)
+#undef HALF_POP_WARNINGS
+#endif
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp b/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp
new file mode 100644
index 00000000..03c64398
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "logger.h"
+#include "ErrorRecorder.h"
+#include "logging.h"
+
+SampleErrorRecorder gRecorder;
+namespace sample
+{
+Logger gLogger{Logger::Severity::kINFO};
+LogStreamConsumer gLogVerbose{LOG_VERBOSE(gLogger)};
+LogStreamConsumer gLogInfo{LOG_INFO(gLogger)};
+LogStreamConsumer gLogWarning{LOG_WARN(gLogger)};
+LogStreamConsumer gLogError{LOG_ERROR(gLogger)};
+LogStreamConsumer gLogFatal{LOG_FATAL(gLogger)};
+
+void setReportableSeverity(Logger::Severity severity)
+{
+    gLogger.setReportableSeverity(severity);
+    gLogVerbose.setReportableSeverity(severity);
+    gLogInfo.setReportableSeverity(severity);
+    gLogWarning.setReportableSeverity(severity);
+    gLogError.setReportableSeverity(severity);
+    gLogFatal.setReportableSeverity(severity);
+}
+} // namespace sample
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logger.h b/src/Detector/tensorrt_yolo/common_deprecated/logger.h
new file mode 100644
index 00000000..3069e8e9
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/logger.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LOGGER_H
+#define LOGGER_H
+
+#include "logging.h"
+
+class SampleErrorRecorder;
+extern SampleErrorRecorder gRecorder;
+namespace sample
+{
+extern Logger gLogger;
+extern LogStreamConsumer gLogVerbose;
+extern LogStreamConsumer gLogInfo;
+extern LogStreamConsumer gLogWarning;
+extern LogStreamConsumer gLogError;
+extern LogStreamConsumer gLogFatal;
+
+void setReportableSeverity(Logger::Severity severity);
+} // namespace sample
+
+#endif // LOGGER_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logging.h b/src/Detector/tensorrt_yolo/common_deprecated/logging.h
new file mode 100644
index 00000000..78732c10
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/logging.h
@@ -0,0 +1,578 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_LOGGING_H
+#define TENSORRT_LOGGING_H
+
+#include "NvInferRuntimeCommon.h"
+#include "sampleOptions.h"
+#include <cassert>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <mutex>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+namespace sample
+{
+
+using Severity = nvinfer1::ILogger::Severity;
+
+class LogStreamConsumerBuffer : public std::stringbuf
+{
+public:
+    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
+        : mOutput(stream)
+        , mPrefix(prefix)
+        , mShouldLog(shouldLog)
+    {
+    }
+
+    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
+        : mOutput(other.mOutput)
+        , mPrefix(other.mPrefix)
+        , mShouldLog(other.mShouldLog)
+    {
+    }
+    LogStreamConsumerBuffer(const LogStreamConsumerBuffer& other) = delete;
+    LogStreamConsumerBuffer() = delete;
+    LogStreamConsumerBuffer& operator=(const LogStreamConsumerBuffer&) = delete;
+    LogStreamConsumerBuffer& operator=(LogStreamConsumerBuffer&&) = delete;
+
+    ~LogStreamConsumerBuffer() override
+    {
+        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
+        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
+        // if the pointer to the beginning is not equal to the pointer to the current position,
+        // call putOutput() to log the output to the stream
+        if (pbase() != pptr())
+        {
+            putOutput();
+        }
+    }
+
+    //!
+    //! synchronizes the stream buffer and returns 0 on success
+    //! synchronizing the stream buffer consists of inserting the buffer contents into the stream,
+    //! resetting the buffer and flushing the stream
+    //!
+    int32_t sync() override
+    {
+        putOutput();
+        return 0;
+    }
+
+    void putOutput()
+    {
+        if (mShouldLog)
+        {
+            // prepend timestamp
+            std::time_t timestamp = std::time(nullptr);
+            tm* tm_local = std::localtime(&timestamp);
+            mOutput << "[";
+            mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
+            mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
+            // std::stringbuf::str() gets the string contents of the buffer
+            // insert the buffer contents prepended by the appropriate prefix into the stream
+            mOutput << mPrefix << str();
+        }
+        // set the buffer to empty
+        str("");
+        // flush the stream
+        mOutput.flush();
+    }
+
+    void setShouldLog(bool shouldLog)
+    {
+        mShouldLog = shouldLog;
+    }
+
+private:
+    std::ostream& mOutput;
+    std::string mPrefix;
+    bool mShouldLog{};
+}; // class LogStreamConsumerBuffer
+
+//!
+//! \class LogStreamConsumerBase
+//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
+//!
+class LogStreamConsumerBase
+{
+public:
+    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
+        : mBuffer(stream, prefix, shouldLog)
+    {
+    }
+
+protected:
+    std::mutex mLogMutex;
+    LogStreamConsumerBuffer mBuffer;
+}; // class LogStreamConsumerBase
+
+//!
+//! \class LogStreamConsumer
+//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
+//! Order of base classes is LogStreamConsumerBase and then std::ostream.
+//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
+//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
+//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
+//! Please do not change the order of the parent classes.
+//!
+class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
+{
+public:
+    //!
+    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
+    //! Reportable severity determines if the messages are severe enough to be logged.
+    //!
+    LogStreamConsumer(nvinfer1::ILogger::Severity reportableSeverity, nvinfer1::ILogger::Severity severity)
+        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
+        , std::ostream(&mBuffer) // links the stream buffer with the stream
+        , mShouldLog(severity <= reportableSeverity)
+        , mSeverity(severity)
+    {
+    }
+
+    LogStreamConsumer(LogStreamConsumer&& other) noexcept
+        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
+        , std::ostream(&mBuffer) // links the stream buffer with the stream
+        , mShouldLog(other.mShouldLog)
+        , mSeverity(other.mSeverity)
+    {
+    }
+    LogStreamConsumer(const LogStreamConsumer& other) = delete;
+    LogStreamConsumer() = delete;
+    ~LogStreamConsumer() = default;
+    LogStreamConsumer& operator=(const LogStreamConsumer&) = delete;
+    LogStreamConsumer& operator=(LogStreamConsumer&&) = delete;
+
+    void setReportableSeverity(Severity reportableSeverity)
+    {
+        mShouldLog = mSeverity <= reportableSeverity;
+        mBuffer.setShouldLog(mShouldLog);
+    }
+
+    std::mutex& getMutex()
+    {
+        return mLogMutex;
+    }
+
+    bool getShouldLog() const
+    {
+        return mShouldLog;
+    }
+
+private:
+    static std::ostream& severityOstream(Severity severity)
+    {
+        return severity >= Severity::kINFO ? std::cout : std::cerr;
+    }
+
+    static std::string severityPrefix(Severity severity)
+    {
+        switch (severity)
+        {
+        case Severity::kINTERNAL_ERROR: return "[F] ";
+        case Severity::kERROR: return "[E] ";
+        case Severity::kWARNING: return "[W] ";
+        case Severity::kINFO: return "[I] ";
+        case Severity::kVERBOSE: return "[V] ";
+        default: assert(0); return "";
+        }
+    }
+
+    bool mShouldLog;
+    Severity mSeverity;
+}; // class LogStreamConsumer
+
+template <typename T>
+LogStreamConsumer& operator<<(LogStreamConsumer& logger, const T& obj)
+{
+    if (logger.getShouldLog())
+    {
+        std::lock_guard<std::mutex> guard(logger.getMutex());
+        auto& os = static_cast<std::ostream&>(logger);
+        os << obj;
+    }
+    return logger;
+}
+
+//!
+//! Special handling std::endl
+//!
+inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, std::ostream& (*f)(std::ostream&) )
+{
+    if (logger.getShouldLog())
+    {
+        std::lock_guard<std::mutex> guard(logger.getMutex());
+        auto& os = static_cast<std::ostream&>(logger);
+        os << f;
+    }
+    return logger;
+}
+
+inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, const nvinfer1::Dims& dims)
+{
+    if (logger.getShouldLog())
+    {
+        std::lock_guard<std::mutex> guard(logger.getMutex());
+        auto& os = static_cast<std::ostream&>(logger);
+        for (int32_t i = 0; i < dims.nbDims; ++i)
+        {
+            os << (i ? "x" : "") << dims.d[i];
+        }
+    }
+    return logger;
+}
+
+//!
+//! \class Logger
+//!
+//! \brief Class which manages logging of TensorRT tools and samples
+//!
+//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
+//! and supports logging two types of messages:
+//!
+//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
+//! - Test pass/fail messages
+//!
+//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
+//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
+//!
+//! In the future, this class could be extended to support dumping test results to a file in some standard format
+//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
+//!
+//!
TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger +//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT +//! library and messages coming from the sample. +//! +//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the +//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger +//! object. +//! +class Logger : public nvinfer1::ILogger +{ +public: + explicit Logger(Severity severity = Severity::kWARNING) + : mReportableSeverity(severity) + { + } + + //! + //! \enum TestResult + //! \brief Represents the state of a given test + //! + enum class TestResult + { + kRUNNING, //!< The test is running + kPASSED, //!< The test passed + kFAILED, //!< The test failed + kWAIVED //!< The test was waived + }; + + //! + //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \return The nvinfer1::ILogger associated with this Logger + //! + //! TODO Once all samples are updated to use this method to register the logger with TensorRT, + //! we can eliminate the inheritance of Logger from ILogger + //! + nvinfer1::ILogger& getTRTLogger() noexcept + { + return *this; + } + + //! + //! \brief Implementation of the nvinfer1::ILogger::log() virtual method + //! + //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the + //! inheritance from nvinfer1::ILogger + //! + void log(Severity severity, const char* msg) noexcept override + { + LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; + } + + //! + //! \brief Method for controlling the verbosity of logging output + //! + //! \param severity The logger will only emit messages that have severity of this level or higher. + //! + void setReportableSeverity(Severity severity) noexcept + { + mReportableSeverity = severity; + } + + //! + //! \brief Opaque handle that holds logging information for a particular test + //! + //! This object is an opaque handle to information used by the Logger to print test results. + //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used + //! with Logger::reportTest{Start,End}(). + //! + class TestAtom + { + public: + TestAtom(TestAtom&&) = default; + + private: + friend class Logger; + + TestAtom(bool started, const std::string& name, const std::string& cmdline) + : mStarted(started) + , mName(name) + , mCmdline(cmdline) + { + } + + bool mStarted; + std::string mName; + std::string mCmdline; + }; + + //! + //! \brief Define a test for logging + //! + //! \param[in] name The name of the test. This should be a string starting with + //! "TensorRT" and containing dot-separated strings containing + //! the characters [A-Za-z0-9_]. + //! For example, "TensorRT.sample_googlenet" + //! \param[in] cmdline The command line used to reproduce the test + // + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, const std::string& cmdline) + { + return TestAtom(false, name, cmdline); + } + + //! + //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments + //! as input + //! + //! \param[in] name The name of the test + //! \param[in] argc The number of command-line arguments + //! 
\param[in] argv The array of command-line arguments (given as C strings) + //! + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv) + { + // Append TensorRT version as info + const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]"; + auto cmdline = genCmdlineString(argc, argv); + return defineTest(vname, cmdline); + } + + //! + //! \brief Report that a test has started. + //! + //! \pre reportTestStart() has not been called yet for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has started + //! + static void reportTestStart(TestAtom& testAtom) + { + reportTestResult(testAtom, TestResult::kRUNNING); + assert(!testAtom.mStarted); + testAtom.mStarted = true; + } + + //! + //! \brief Report that a test has ended. + //! + //! \pre reportTestStart() has been called for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has ended + //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, + //! TestResult::kFAILED, TestResult::kWAIVED + //! + static void reportTestEnd(TestAtom const& testAtom, TestResult result) + { + assert(result != TestResult::kRUNNING); + assert(testAtom.mStarted); + reportTestResult(testAtom, result); + } + + static int32_t reportPass(TestAtom const& testAtom) + { + reportTestEnd(testAtom, TestResult::kPASSED); + return EXIT_SUCCESS; + } + + static int32_t reportFail(TestAtom const& testAtom) + { + reportTestEnd(testAtom, TestResult::kFAILED); + return EXIT_FAILURE; + } + + static int32_t reportWaive(TestAtom const& testAtom) + { + reportTestEnd(testAtom, TestResult::kWAIVED); + return EXIT_SUCCESS; + } + + static int32_t reportTest(TestAtom const& testAtom, bool pass) + { + return pass ? reportPass(testAtom) : reportFail(testAtom); + } + + Severity getReportableSeverity() const + { + return mReportableSeverity; + } + +private: + //! + //! \brief returns an appropriate string for prefixing a log message with the given severity + //! + static const char* severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate string for prefixing a test result message with the given result + //! + static const char* testResultString(TestResult result) + { + switch (result) + { + case TestResult::kRUNNING: return "RUNNING"; + case TestResult::kPASSED: return "PASSED"; + case TestResult::kFAILED: return "FAILED"; + case TestResult::kWAIVED: return "WAIVED"; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity + //! + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + //! + //! \brief method that implements logging test results + //! + static void reportTestResult(TestAtom const& testAtom, TestResult result) + { + severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " + << testAtom.mCmdline << std::endl; + } + + //! + //! \brief generate a command line string from the given (argc, argv) values + //! 
+ static std::string genCmdlineString(int32_t argc, char const* const* argv) + { + std::stringstream ss; + for (int32_t i = 0; i < argc; i++) + { + if (i > 0) + { + ss << " "; + } + ss << argv[i]; + } + return ss.str(); + } + + Severity mReportableSeverity; +}; // class Logger + +namespace +{ +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE +//! +//! Example usage: +//! +//! LOG_VERBOSE(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO +//! +//! Example usage: +//! +//! LOG_INFO(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_INFO(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING +//! +//! Example usage: +//! +//! LOG_WARN(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_WARN(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR +//! +//! Example usage: +//! +//! LOG_ERROR(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_ERROR(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR +//! ("fatal" severity) +//! +//! Example usage: +//! +//! LOG_FATAL(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_FATAL(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); +} +} // anonymous namespace +} // namespace sample +#endif // TENSORRT_LOGGING_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h b/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h new file mode 100644 index 00000000..c92a1420 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef PARSER_ONNX_CONFIG_H
+#define PARSER_ONNX_CONFIG_H
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+#include "NvOnnxParser.h"
+
+#define ONNX_DEBUG 1
+
+/**
+ * \class ParserOnnxConfig
+ * \brief Configuration Manager Class Concrete Implementation
+ *
+ * \note:
+ *
+ */
+
+using namespace std;
+
+class ParserOnnxConfig : public nvonnxparser::IOnnxConfig
+{
+
+protected:
+    string mModelFilename{};
+    string mTextFilename{};
+    string mFullTextFilename{};
+    nvinfer1::DataType mModelDtype;
+    nvonnxparser::IOnnxConfig::Verbosity mVerbosity;
+    bool mPrintLayercInfo;
+
+public:
+    ParserOnnxConfig()
+        : mModelDtype(nvinfer1::DataType::kFLOAT)
+        , mVerbosity(static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))
+        , mPrintLayercInfo(false)
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << " ParserOnnxConfig::ctor(): " << this << "\t" << std::endl;
+        }
+#endif
+    }
+
+protected:
+    ~ParserOnnxConfig()
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << "ParserOnnxConfig::dtor(): " << this << std::endl;
+        }
+#endif
+    }
+
+public:
+    virtual void setModelDtype(const nvinfer1::DataType modelDtype) noexcept
+    {
+        mModelDtype = modelDtype;
+    }
+
+    virtual nvinfer1::DataType getModelDtype() const noexcept
+    {
+        return mModelDtype;
+    }
+
+    virtual const char* getModelFileName() const noexcept
+    {
+        return mModelFilename.c_str();
+    }
+    virtual void setModelFileName(const char* onnxFilename) noexcept
+    {
+        mModelFilename = string(onnxFilename);
+    }
+    virtual nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept
+    {
+        return mVerbosity;
+    }
+    virtual void addVerbosity() noexcept
+    {
+        ++mVerbosity;
+    }
+    virtual void reduceVerbosity() noexcept
+    {
+        --mVerbosity;
+    }
+    virtual void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept
+    {
+        mVerbosity = verbosity;
+    }
+
+    virtual const char* getTextFileName() const noexcept
+    {
+        return mTextFilename.c_str();
+    }
+    virtual void setTextFileName(const char* textFilename) noexcept
+    {
+        mTextFilename = string(textFilename);
+    }
+    virtual const char* getFullTextFileName() const noexcept
+    {
+        return mFullTextFilename.c_str();
+    }
+    virtual void setFullTextFileName(const char* fullTextFilename) noexcept
+    {
+        mFullTextFilename = string(fullTextFilename);
+    }
+    virtual bool getPrintLayerInfo() const noexcept
+    {
+        return mPrintLayercInfo;
+    }
+    virtual void setPrintLayerInfo(bool src) noexcept
+    {
+        mPrintLayercInfo = src;
+    } //!< set the boolean variable corresponding to the Layer Info, see getPrintLayerInfo()
+
+    virtual bool isDebug() const noexcept
+    {
+#if ONNX_DEBUG
+        return (std::getenv("ONNX_DEBUG") ? true : false);
+#else
+        return false;
+#endif
+    }
+
+    virtual void destroy() noexcept
+    {
+        delete this;
+    }
+
+}; // class ParserOnnxConfig
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h b/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h
new file mode 100644
index 00000000..3d84b095
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_SAFE_COMMON_H
+#define TENSORRT_SAFE_COMMON_H
+
+#include "NvInferRuntimeCommon.h"
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+
+#define CHECK(status)                                                                                                  \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        auto ret = (status);                                                                                           \
+        if (ret != 0)                                                                                                  \
+        {                                                                                                              \
+            std::cerr << "Cuda failure: " << ret << std::endl;                                                         \
+            abort();                                                                                                   \
+        }                                                                                                              \
+    } while (0)
+
+namespace samplesCommon
+{
+template <typename T>
+inline std::shared_ptr<T> infer_object(T* obj)
+{
+    if (!obj)
+    {
+        throw std::runtime_error("Failed to create object");
+    }
+    return std::shared_ptr<T>(obj);
+}
+
+inline uint32_t elementSize(nvinfer1::DataType t)
+{
+    switch (t)
+    {
+    case nvinfer1::DataType::kINT32:
+    case nvinfer1::DataType::kFLOAT: return 4;
+    case nvinfer1::DataType::kHALF: return 2;
+    case nvinfer1::DataType::kINT8: return 1;
+    case nvinfer1::DataType::kBOOL: return 1;
+    }
+    return 0;
+}
+
+template <typename A, typename B>
+inline A divUp(A x, B n)
+{
+    return (x + n - 1) / n;
+}
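+
+// Usage sketch (editorial addition, not part of the original NVIDIA sample):
+//
+//   samplesCommon::divUp(10, 3);                            // == 4, ceil division
+//   samplesCommon::elementSize(nvinfer1::DataType::kHALF);  // == 2 bytes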
+
+} // namespace samplesCommon
+
+#endif // TENSORRT_SAFE_COMMON_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h
new file mode 100644
index 00000000..53a78331
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SampleConfig_H
+#define SampleConfig_H
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+class SampleConfig : public nvonnxparser::IOnnxConfig
+{
+public:
+    enum class InputDataFormat : int
+    {
+        kASCII = 0,
+        kPPM = 1
+    };
+
+private:
+    std::string mModelFilename;
+    std::string mEngineFilename;
+    std::string mTextFilename;
+    std::string mFullTextFilename;
+    std::string mImageFilename;
+    std::string mReferenceFilename;
+    std::string mOutputFilename;
+    std::string mCalibrationFilename;
+    std::string mTimingCacheFilename;
+    int64_t mLabel{-1};
+    int64_t mMaxBatchSize{32};
+    int64_t mCalibBatchSize{0};
+    int64_t mMaxNCalibBatch{0};
+    int64_t mFirstCalibBatch{0};
+    int64_t mUseDLACore{-1};
+    nvinfer1::DataType mModelDtype{nvinfer1::DataType::kFLOAT};
+    bool mTF32{true};
+    Verbosity mVerbosity{static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)};
+    bool mPrintLayercInfo{false};
+    bool mDebugBuilder{false};
+    InputDataFormat mInputDataFormat{InputDataFormat::kASCII};
+    uint64_t mTopK{0};
+    float mFailurePercentage{-1.0f};
+    float mTolerance{0.0f};
+    float mAbsTolerance{1e-5f};
+
+public:
+    SampleConfig()
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << " SampleConfig::ctor(): " << this << "\t" << std::endl;
+        }
+#endif
+    }
+
+protected:
+    ~SampleConfig()
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << "SampleConfig::dtor(): " << this << std::endl;
+        }
+#endif
+    }
+
+public:
+    void setModelDtype(const nvinfer1::DataType mdt) noexcept
+    {
+        mModelDtype = mdt;
+    }
+
+    nvinfer1::DataType getModelDtype() const noexcept
+    {
+        return mModelDtype;
+    }
+
+    bool getTF32() const noexcept
+    {
+        return mTF32;
+    }
+
+    void setTF32(bool enabled) noexcept
+    {
+        mTF32 = enabled;
+    }
+
+    const char* getModelFileName() const noexcept
+    {
+        return mModelFilename.c_str();
+    }
+
+    void setModelFileName(const char* onnxFilename) noexcept
+    {
+        mModelFilename = std::string(onnxFilename);
+    }
+    Verbosity getVerbosityLevel() const noexcept
+    {
+        return mVerbosity;
+    }
+    void addVerbosity() noexcept
+    {
+        ++mVerbosity;
+    }
+    void reduceVerbosity() noexcept
+    {
+        --mVerbosity;
+    }
+    virtual void setVerbosityLevel(Verbosity v) noexcept
+    {
+        mVerbosity = v;
+    }
+    const char* getEngineFileName() const noexcept
+    {
+        return mEngineFilename.c_str();
+    }
+    void setEngineFileName(const char* engineFilename) noexcept
+    {
+        mEngineFilename = std::string(engineFilename);
+    }
+    const char* getTextFileName() const noexcept
+    {
+        return mTextFilename.c_str();
+    }
+    void setTextFileName(const char* textFilename) noexcept
+    {
+        mTextFilename = std::string(textFilename);
+    }
+    const char* getFullTextFileName() const noexcept
+    {
+        return mFullTextFilename.c_str();
+    }
+    void setFullTextFileName(const char* fullTextFilename) noexcept
+    {
+        mFullTextFilename = std::string(fullTextFilename);
+    }
+    void setLabel(int64_t label) noexcept
+    {
+        mLabel = label;
+    } //!< set the Label
+
+    int64_t getLabel() const noexcept
+    {
+        return mLabel;
+    } //!< get the Label
+
+    bool getPrintLayerInfo() const noexcept
+    {
+        return mPrintLayercInfo;
+    }
+
+    void setPrintLayerInfo(bool b) noexcept
+    {
+        mPrintLayercInfo = b;
+    } //!< set the boolean variable corresponding to the Layer Info, see getPrintLayerInfo()
+
+    void setMaxBatchSize(int64_t maxBatchSize) noexcept
+    {
+        mMaxBatchSize = maxBatchSize;
+    } //!< set the Max Batch Size
+    int64_t getMaxBatchSize() const noexcept
+    {
+        return mMaxBatchSize;
+    } //!< get the Max Batch Size
+
+    void setCalibBatchSize(int64_t CalibBatchSize) noexcept
+    {
+        mCalibBatchSize = CalibBatchSize;
+    } //!< set the calibration batch size
+    int64_t getCalibBatchSize() const noexcept
+    {
+        return mCalibBatchSize;
+    } //!< get calibration batch size
+
+    void setMaxNCalibBatch(int64_t MaxNCalibBatch) noexcept
+    {
+        mMaxNCalibBatch = MaxNCalibBatch;
+    } //!< set Max Number of Calibration Batches
+    int64_t getMaxNCalibBatch() const noexcept
+    {
+        return mMaxNCalibBatch;
+    } //!< get the Max Number of Calibration Batches
+
+    void setFirstCalibBatch(int64_t FirstCalibBatch) noexcept
+    {
+        mFirstCalibBatch = FirstCalibBatch;
+    } //!< set the first calibration batch
+    int64_t getFirstCalibBatch() const noexcept
+    {
+        return mFirstCalibBatch;
+    } //!< get the first calibration batch
+
+    void setUseDLACore(int64_t UseDLACore) noexcept
+    {
+        mUseDLACore = UseDLACore;
+    } //!< set the DLA core to use
+    int64_t getUseDLACore() const noexcept
+    {
+        return mUseDLACore;
+    } //!< get the DLA core to use
+
+    void setDebugBuilder() noexcept
+    {
+        mDebugBuilder = true;
+    } //!< enable the Debug info, while building the engine.
+    bool getDebugBuilder() const noexcept
+    {
+        return mDebugBuilder;
+    } //!< get the boolean variable, corresponding to the debug builder
+
+    const char* getImageFileName() const noexcept //!< get the Image file name (PPM or ASCII)
+    {
+        return mImageFilename.c_str();
+    }
+    void setImageFileName(const char* imageFilename) noexcept //!< set the Image file name
+    {
+        mImageFilename = std::string(imageFilename);
+    }
+    const char* getReferenceFileName() const noexcept
+    {
+        return mReferenceFilename.c_str();
+    }
+    void setReferenceFileName(const char* referenceFilename) noexcept //!< set the reference file name
+    {
+        mReferenceFilename = std::string(referenceFilename);
+    }
+
+    void setInputDataFormat(InputDataFormat idt) noexcept
+    {
+        mInputDataFormat = idt;
+    } //!< specifies expected data format of the image file (PPM or ASCII)
+    InputDataFormat getInputDataFormat() const noexcept
+    {
+        return mInputDataFormat;
+    } //!< returns the expected data format of the image file.
+
+    const char* getOutputFileName() const noexcept //!< get the file name used to save the results
+    {
+        return mOutputFilename.c_str();
+    }
+    void setOutputFileName(const char* outputFilename) noexcept //!< set the output file name
+    {
+        mOutputFilename = std::string(outputFilename);
+    }
+
+    const char* getCalibrationFileName() const noexcept
+    {
+        return mCalibrationFilename.c_str();
+    } //!< get the file containing the list of image files for int8 calibration
+    void setCalibrationFileName(const char* calibrationFilename) noexcept //!< set the int8 calibration list file name
+    {
+        mCalibrationFilename = std::string(calibrationFilename);
+    }
+
+    uint64_t getTopK() const noexcept
+    {
+        return mTopK;
+    }
+    void setTopK(uint64_t topK) noexcept
+    {
+        mTopK = topK;
+    } //!< If this option is specified, return the K top probabilities.
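+
+    // Usage sketch (editorial addition, not part of the original NVIDIA sample):
+    // the calibration-related knobs above are plain setters, e.g.
+    //
+    //   SampleConfig* cfg = new SampleConfig;   // destructor is protected
+    //   cfg->setModelFileName("model.onnx");    // hypothetical file name
+    //   cfg->setCalibBatchSize(8);
+    //   cfg->setMaxNCalibBatch(100);
+    //   cfg->setTopK(5);
+    //   ...
+    //   cfg->destroy();                         // IOnnxConfig-style destruction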
+
+    float getFailurePercentage() const noexcept
+    {
+        return mFailurePercentage;
+    }
+
+    void setFailurePercentage(float f) noexcept
+    {
+        mFailurePercentage = f;
+    }
+
+    float getAbsoluteTolerance() const noexcept
+    {
+        return mAbsTolerance;
+    }
+
+    void setAbsoluteTolerance(float a) noexcept
+    {
+        mAbsTolerance = a;
+    }
+
+    float getTolerance() const noexcept
+    {
+        return mTolerance;
+    }
+
+    void setTolerance(float t) noexcept
+    {
+        mTolerance = t;
+    }
+
+    const char* getTimingCacheFilename() const noexcept
+    {
+        return mTimingCacheFilename.c_str();
+    }
+
+    void setTimingCacheFileName(const char* timingCacheFilename) noexcept
+    {
+        mTimingCacheFilename = std::string(timingCacheFilename);
+    }
+
+    bool isDebug() const noexcept
+    {
+#if ONNX_DEBUG
+        return (std::getenv("ONNX_DEBUG") ? true : false);
+#else
+        return false;
+#endif
+    }
+
+    void destroy() noexcept
+    {
+        delete this;
+    }
+
+}; // class SampleConfig
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h
new file mode 100644
index 00000000..2053ac7c
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_DEVICE_H
+#define TRT_SAMPLE_DEVICE_H
+
+#include <cassert>
+#include <chrono>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <thread>
+
+namespace sample
+{
+
+inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr)
+{
+    if (ret != cudaSuccess)
+    {
+        err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl;
+        abort();
+    }
+}
+
+class TrtCudaEvent;
+
+namespace
+{
+
+void cudaSleep(void* sleep)
+{
+    std::this_thread::sleep_for(std::chrono::duration<float, std::milli>(*static_cast<float*>(sleep)));
+}
+
+} // namespace
+
+//!
+//! \class TrtCudaStream
+//! \brief Managed CUDA stream
+//!
+class TrtCudaStream
+{
+public:
+    TrtCudaStream()
+    {
+        cudaCheck(cudaStreamCreate(&mStream));
+    }
+
+    TrtCudaStream(const TrtCudaStream&) = delete;
+
+    TrtCudaStream& operator=(const TrtCudaStream&) = delete;
+
+    TrtCudaStream(TrtCudaStream&&) = delete;
+
+    TrtCudaStream& operator=(TrtCudaStream&&) = delete;
+
+    ~TrtCudaStream()
+    {
+        cudaCheck(cudaStreamDestroy(mStream));
+    }
+
+    cudaStream_t get() const
+    {
+        return mStream;
+    }
+
+    void synchronize()
+    {
+        cudaCheck(cudaStreamSynchronize(mStream));
+    }
+
+    void wait(TrtCudaEvent& event);
+
+    void sleep(float* ms)
+    {
+        cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms));
+    }
+
+private:
+    cudaStream_t mStream{};
+};
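+
+// Usage sketch (editorial addition, not part of the original NVIDIA sample):
+// two TrtCudaEvent objects (defined below) can time async work on a stream:
+//
+//   TrtCudaStream stream;
+//   TrtCudaEvent start, stop;
+//   start.record(stream);
+//   ...enqueue asynchronous work on stream.get()...
+//   stop.record(stream);
+//   stop.synchronize();
+//   float ms = stop - start;   // operator- returns elapsed milliseconds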
+
+//!
+//! \class TrtCudaEvent
+//! \brief Managed CUDA event
+//!
+class TrtCudaEvent
+{
+public:
+    explicit TrtCudaEvent(bool blocking = true)
+    {
+        const uint32_t flags = blocking ? cudaEventBlockingSync : cudaEventDefault;
+        cudaCheck(cudaEventCreateWithFlags(&mEvent, flags));
+    }
+
+    TrtCudaEvent(const TrtCudaEvent&) = delete;
+
+    TrtCudaEvent& operator=(const TrtCudaEvent&) = delete;
+
+    TrtCudaEvent(TrtCudaEvent&&) = delete;
+
+    TrtCudaEvent& operator=(TrtCudaEvent&&) = delete;
+
+    ~TrtCudaEvent()
+    {
+        cudaCheck(cudaEventDestroy(mEvent));
+    }
+
+    cudaEvent_t get() const
+    {
+        return mEvent;
+    }
+
+    void record(const TrtCudaStream& stream)
+    {
+        cudaCheck(cudaEventRecord(mEvent, stream.get()));
+    }
+
+    void synchronize()
+    {
+        cudaCheck(cudaEventSynchronize(mEvent));
+    }
+
+    // Returns elapsed time in milliseconds
+    float operator-(const TrtCudaEvent& e) const
+    {
+        float time{0};
+        cudaCheck(cudaEventElapsedTime(&time, e.get(), get()));
+        return time;
+    }
+
+private:
+    cudaEvent_t mEvent{};
+};
+
+inline void TrtCudaStream::wait(TrtCudaEvent& event)
+{
+    cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0));
+}
+
+//!
+//! \class TrtCudaGraph
+//! \brief Managed CUDA graph
+//!
+class TrtCudaGraph
+{
+public:
+    explicit TrtCudaGraph() = default;
+
+    TrtCudaGraph(const TrtCudaGraph&) = delete;
+
+    TrtCudaGraph& operator=(const TrtCudaGraph&) = delete;
+
+    TrtCudaGraph(TrtCudaGraph&&) = delete;
+
+    TrtCudaGraph& operator=(TrtCudaGraph&&) = delete;
+
+    ~TrtCudaGraph()
+    {
+        if (mGraphExec)
+        {
+            cudaGraphExecDestroy(mGraphExec);
+        }
+    }
+
+    void beginCapture(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal));
+    }
+
+    bool launch(TrtCudaStream& stream)
+    {
+        return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess;
+    }
+
+    void endCapture(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph));
+        cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0));
+        cudaCheck(cudaGraphDestroy(mGraph));
+    }
+
+    void endCaptureOnError(TrtCudaStream& stream)
+    {
+        // There are two possibilities why stream capture would fail:
+        // (1) stream is in cudaErrorStreamCaptureInvalidated state.
+        // (2) TRT reports a failure.
+        // In case (1), the returning mGraph should be nullptr.
+        // In case (2), the returning mGraph is not nullptr, but it should not be used.
+        const auto ret = cudaStreamEndCapture(stream.get(), &mGraph);
+        if (ret == cudaErrorStreamCaptureInvalidated)
+        {
+            assert(mGraph == nullptr);
+        }
+        else
+        {
+            assert(ret == cudaSuccess);
+            assert(mGraph != nullptr);
+            cudaCheck(cudaGraphDestroy(mGraph));
+            mGraph = nullptr;
+        }
+        // Clean up any CUDA error.
+        cudaGetLastError();
+        sample::gLogWarning << "The CUDA graph capture on the stream has failed." << std::endl;
+    }
+
+private:
+    cudaGraph_t mGraph{};
+    cudaGraphExec_t mGraphExec{};
+};
+
+//!
+//! \class TrtCudaBuffer
+//! \brief Managed buffer for host and device
+//!
+template <typename A, typename D>
+class TrtCudaBuffer
+{
+public:
+    TrtCudaBuffer() = default;
+
+    TrtCudaBuffer(const TrtCudaBuffer&) = delete;
+
+    TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete;
+
+    TrtCudaBuffer(TrtCudaBuffer&& rhs)
+    {
+        reset(rhs.mPtr);
+        rhs.mPtr = nullptr;
+    }
+
+    TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs)
+    {
+        if (this != &rhs)
+        {
+            reset(rhs.mPtr);
+            rhs.mPtr = nullptr;
+        }
+        return *this;
+    }
+
+    ~TrtCudaBuffer()
+    {
+        reset();
+    }
+
+    TrtCudaBuffer(size_t size)
+    {
+        A()(&mPtr, size);
+    }
+
+    void allocate(size_t size)
+    {
+        reset();
+        A()(&mPtr, size);
+    }
+
+    void reset(void* ptr = nullptr)
+    {
+        if (mPtr)
+        {
+            D()(mPtr);
+        }
+        mPtr = ptr;
+    }
+
+    void* get() const
+    {
+        return mPtr;
+    }
+
+private:
+    void* mPtr{nullptr};
+};
+
+struct DeviceAllocator
+{
+    void operator()(void** ptr, size_t size)
+    {
+        cudaCheck(cudaMalloc(ptr, size));
+    }
+};
+
+struct DeviceDeallocator
+{
+    void operator()(void* ptr)
+    {
+        cudaCheck(cudaFree(ptr));
+    }
+};
+
+struct ManagedAllocator
+{
+    void operator()(void** ptr, size_t size)
+    {
+        cudaCheck(cudaMallocManaged(ptr, size));
+    }
+};
+
+struct HostAllocator
+{
+    void operator()(void** ptr, size_t size)
+    {
+        cudaCheck(cudaMallocHost(ptr, size));
+    }
+};
+
+struct HostDeallocator
+{
+    void operator()(void* ptr)
+    {
+        cudaCheck(cudaFreeHost(ptr));
+    }
+};
+
+using TrtDeviceBuffer = TrtCudaBuffer<DeviceAllocator, DeviceDeallocator>;
+using TrtManagedBuffer = TrtCudaBuffer<ManagedAllocator, DeviceDeallocator>;
+
+using TrtHostBuffer = TrtCudaBuffer<HostAllocator, HostDeallocator>;
+
+//!
+//! \class IMirroredBuffer
+//! \brief Coupled host and device buffers
+//!
+class IMirroredBuffer
+{
+public:
+    //!
+    //! Allocate memory for the mirrored buffer given the size
+    //! of the allocation.
+    //!
+    virtual void allocate(size_t size) = 0;
+
+    //!
+    //! Get the pointer to the device side buffer.
+    //!
+    //! \return pointer to device memory or nullptr if uninitialized.
+    //!
+    virtual void* getDeviceBuffer() const = 0;
+
+    //!
+    //! Get the pointer to the host side buffer.
+    //!
+    //! \return pointer to host memory or nullptr if uninitialized.
+    //!
+    virtual void* getHostBuffer() const = 0;
+
+    //!
+    //! Copy the memory from host to device.
+    //!
+    virtual void hostToDevice(TrtCudaStream& stream) = 0;
+
+    //!
+    //! Copy the memory from device to host.
+    //!
+    virtual void deviceToHost(TrtCudaStream& stream) = 0;
+
+    //!
+    //! Interface to get the size of the memory
+    //!
+    //! \return the size of memory allocated.
+    //!
+    virtual size_t getSize() const = 0;
+
+    //!
+    //! Virtual destructor declaration
+    //!
+    virtual ~IMirroredBuffer() = default;
+
+}; // class IMirroredBuffer
+
+//!
+//! Class to have a separate memory buffer for discrete device and host allocations.
+//!
+class DiscreteMirroredBuffer : public IMirroredBuffer
+{
+public:
+    void allocate(size_t size)
+    {
+        mSize = size;
+        mHostBuffer.allocate(size);
+        mDeviceBuffer.allocate(size);
+    }
+
+    void* getDeviceBuffer() const
+    {
+        return mDeviceBuffer.get();
+    }
+
+    void* getHostBuffer() const
+    {
+        return mHostBuffer.get();
+    }
+
+    void hostToDevice(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get()));
+    }
+
+    void deviceToHost(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get()));
+    }
+
+    size_t getSize() const
+    {
+        return mSize;
+    }
+
+private:
+    size_t mSize{0};
+    TrtHostBuffer mHostBuffer;
+    TrtDeviceBuffer mDeviceBuffer;
+}; // class DiscreteMirroredBuffer
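+
+// Usage sketch (editorial addition, not part of the original NVIDIA sample):
+//
+//   DiscreteMirroredBuffer buffer;
+//   buffer.allocate(byteSize);
+//   ...fill buffer.getHostBuffer() on the CPU...
+//   buffer.hostToDevice(stream);   // async copy; synchronize before reuse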
+
+//!
+//! Class to have a unified memory buffer for embedded devices.
+//!
+class UnifiedMirroredBuffer : public IMirroredBuffer
+{
+public:
+    void allocate(size_t size)
+    {
+        mSize = size;
+        mBuffer.allocate(size);
+    }
+
+    void* getDeviceBuffer() const
+    {
+        return mBuffer.get();
+    }
+
+    void* getHostBuffer() const
+    {
+        return mBuffer.get();
+    }
+
+    void hostToDevice(TrtCudaStream& /*stream*/)
+    {
+        // Does nothing since we are using unified memory.
+    }
+
+    void deviceToHost(TrtCudaStream& /*stream*/)
+    {
+        // Does nothing since we are using unified memory.
+    }
+
+    size_t getSize() const
+    {
+        return mSize;
+    }
+
+private:
+    size_t mSize{0};
+    TrtManagedBuffer mBuffer;
+}; // class UnifiedMirroredBuffer
+
+inline void setCudaDevice(int device, std::ostream& os)
+{
+    cudaCheck(cudaSetDevice(device));
+
+    cudaDeviceProp properties;
+    cudaCheck(cudaGetDeviceProperties(&properties, device));
+
+// clang-format off
+    os << "=== Device Information ===" << std::endl;
+    os << "Selected Device: "      << properties.name                                               << std::endl;
+    os << "Compute Capability: "   << properties.major << "." << properties.minor                   << std::endl;
+    os << "SMs: "                  << properties.multiProcessorCount                                << std::endl;
+    os << "Compute Clock Rate: "   << properties.clockRate / 1000000.0F << " GHz"                   << std::endl;
+    os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB"                   << std::endl;
+    os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB"       << std::endl;
+    os << "Memory Bus Width: "     << properties.memoryBusWidth << " bits"
+       << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")"                  << std::endl;
+    os << "Memory Clock Rate: "    << properties.memoryClockRate / 1000000.0F << " GHz"             << std::endl;
+    // clang-format on
+}
+
+} // namespace sample
+
+#endif // TRT_SAMPLE_DEVICE_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp
similarity index 100%
rename from src/Detector/tensorrt_yolo/common/sampleEngines.cpp
rename to src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h
new file mode 100644
index 00000000..620b51a1
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_ENGINES_H
+#define TRT_SAMPLE_ENGINES_H
+
+#include <iostream>
+#include <vector>
+
+#include "NvInfer.h"
+
+#if (NV_TENSORRT_MAJOR > 7)
+
+#include "NvInferConsistency.h"
+#include "NvInferSafeRuntime.h"
+
+#endif
+
+#include "NvOnnxParser.h"
+#include "sampleOptions.h"
+#include "sampleUtils.h"
+
+namespace sample
+{
+
+struct Parser
+{
+    TrtUniquePtr<nvonnxparser::IParser> onnxParser;
+
+    operator bool() const
+    {
+        return onnxParser.operator bool();
+    }
+};
+
+struct BuildEnvironment
+{
+    TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+struct BuildEnvironment
+{
+    TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+    //! Parser that creates the network. Must be declared *after* network, so that when
+    //! ~BuildEnvironment() executes, the parser is destroyed before the network is destroyed.
+    Parser parser;
+    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
+    std::vector<uint8_t> engineBlob;
+};
+
+//!
+//! \brief Generate a network definition for a given model
+//!
+//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid
+//! parser (the returned parser converts to false if tested)
+//!
+//! Constant input dimensions in the model must not be changed in the corresponding
+//! network definition, because its correctness may rely on the constants.
+//!
+//! \see Parser::operator bool()
+//!
+Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err);
+
+//!
+//! \brief Set up network and config
+//!
+//! \return boolean Return true if network and config were successfully set
+//!
+bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
+    nvinfer1::INetworkDefinition& network, nvinfer1::IBuilderConfig& config, std::ostream& err,
+    std::vector<std::unique_ptr<nvinfer1::IHostMemory>>& sparseWeights);
+
+//!
+//! \brief Log refittable layers and weights of a refittable engine
+//!
+void dumpRefittable(nvinfer1::ICudaEngine& engine);
+
+//!
+//! \brief Load a serialized engine
+//!
+//! \return Pointer to the engine loaded or nullptr if the operation failed
+//!
+nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err);
+
+//!
+//! \brief Save an engine into a file
+//!
+//! \return boolean Return true if the engine was successfully saved
+//!
+bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err);
+
+//!
+//! \brief Create an engine from model or serialized file, and optionally save engine
+//!
+//! \return boolean Return true if the engine build environment was successfully created
+//!
+bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys,
+    BuildEnvironment& env, std::ostream& err);
+
+//!
+//! \brief Create an engine from model or serialized file, and optionally save engine
+//!
+//! \return Pointer to the engine created or nullptr if the creation failed
+//!
+inline TrtUniquePtr<nvinfer1::ICudaEngine> getEngine(
+    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err)
+{
+    BuildEnvironment env;
+    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    if (getEngineBuildEnv(model, build, sys, env, err))
+    {
+        engine.swap(env.engine);
+    }
+    return engine;
+}
+
+//!
+//! \brief Create a serialized network
+//!
+//! \return Pointer to a host memory for a serialized network
+//!
+nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
+    nvinfer1::INetworkDefinition& network, std::ostream& err);
+
+//!
+//! \brief Transfer model to a serialized network
+//!
+//! \return Pointer to a host memory for a serialized network
+//!
+nvinfer1::IHostMemory* modelToSerialized(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
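
Taken together, the declarations above support a short build path; a minimal sketch (illustrative only, not part of the patch; `buildFromOptions` is a hypothetical helper and option parsing is omitted):

```cpp
#include <iostream>

sample::TrtUniquePtr<nvinfer1::ICudaEngine> buildFromOptions(
    const sample::ModelOptions& model, const sample::BuildOptions& build, const sample::SystemOptions& sys)
{
    // getEngine() builds a BuildEnvironment internally and hands back only the engine.
    auto engine = sample::getEngine(model, build, sys, std::cerr);
    if (!engine)
    {
        std::cerr << "Engine creation failed" << std::endl;
    }
    return engine;
}
```
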
+//!
+//! \brief Serialize network and save it into a file
+//!
+//! \return boolean Return true if the network was successfully serialized and saved
+//!
+bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
+
+bool timeRefit(const nvinfer1::INetworkDefinition& network, nvinfer1::ICudaEngine& engine, bool multiThreading);
+
+//!
+//! \brief Set tensor scales from a calibration table
+//!
+void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector<IOFormat>& inputFormats,
+    const std::vector<IOFormat>& outputFormats, const std::string& calibrationFile);
+
+//!
+//! \brief Check if safe runtime is loaded.
+//!
+bool hasSafeRuntime();
+
+//!
+//! \brief Create a safe runtime object if the dynamic library is loaded.
+//!
+nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept;
+
+//!
+//! \brief Check if consistency checker is loaded.
+//!
+bool hasConsistencyChecker();
+
+//!
+//! \brief Create a consistency checker object if the dynamic library is loaded.
+//!
+nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker(
+    nvinfer1::ILogger& logger, nvinfer1::IHostMemory const* engine) noexcept;
+
+//!
+//! \brief Run consistency check on serialized engine.
+//!
+bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize);
+} // namespace sample
+
+#endif // TRT_SAMPLE_ENGINES_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp
similarity index 100%
rename from src/Detector/tensorrt_yolo/common/sampleInference.cpp
rename to src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h
new file mode 100644
index 00000000..1c21f592
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_INFERENCE_H
+#define TRT_SAMPLE_INFERENCE_H
+
+#include "sampleReporting.h"
+#include "sampleUtils.h"
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "NvInfer.h"
+
+#if (NV_TENSORRT_MAJOR > 7)
+
+#include "NvInferSafeRuntime.h"
+
+namespace sample
+{
+
+struct InferenceEnvironment
+{
+    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    std::unique_ptr<Profiler> profiler;
+    std::vector<TrtUniquePtr<nvinfer1::IExecutionContext>> context;
+    std::vector<std::unique_ptr<Bindings>> bindings;
+    bool error{false};
+
+    std::vector<uint8_t> engineBlob;
+
+    bool safe{false};
+    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
+    std::vector<std::unique_ptr<nvinfer1::safe::IExecutionContext>> safeContext;
+
+    template <class ContextType>
+    inline ContextType* getContext(int32_t streamIdx);
+};
+
+template <>
+inline nvinfer1::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
+{
+    return context[streamIdx].get();
+}
+
+template <>
+inline nvinfer1::safe::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
+{
+    return safeContext[streamIdx].get();
+}
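
The two `getContext<>()` specializations let per-stream code be written once and instantiated for either the standard or the safety runtime; a minimal sketch (illustrative only, not part of the patch; `contextsValid` is a hypothetical helper):

```cpp
template <class ContextType>
bool contextsValid(sample::InferenceEnvironment& iEnv, int32_t nbStreams)
{
    // ContextType is nvinfer1::IExecutionContext or nvinfer1::safe::IExecutionContext.
    for (int32_t s = 0; s < nbStreams; ++s)
    {
        if (iEnv.getContext<ContextType>(s) == nullptr)
        {
            return false; // context creation failed for this stream
        }
    }
    return true;
}
```

+//!
+//! \brief Set up contexts and bindings for inference
+//!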
+bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference); + +//! +//! \brief Deserialize the engine and time how long it takes. +//! +bool timeDeserialize(InferenceEnvironment& iEnv); + +//! +//! \brief Run inference and collect timing, return false if any error hit during inference +//! +bool runInference( + const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace); + +//! +//! \brief Get layer information of the engine. +//! +std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format); + +} // namespace sample + +#endif + +#endif // TRT_SAMPLE_INFERENCE_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp new file mode 100644 index 00000000..0afd163f --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp @@ -0,0 +1,1778 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" + +#include "logger.h" +#include "sampleOptions.h" + +namespace sample +{ + +namespace +{ + +std::vector splitToStringVec(const std::string& option, char separator) +{ + std::vector options; + + for (size_t start = 0; start < option.length();) + { + size_t separatorIndex = option.find(separator, start); + if (separatorIndex == std::string::npos) + { + separatorIndex = option.length(); + } + options.emplace_back(option.substr(start, separatorIndex - start)); + start = separatorIndex + 1; + } + + return options; +} + +template +T stringToValue(const std::string& option) +{ + return T{option}; +} + +template <> +int32_t stringToValue(const std::string& option) +{ + return std::stoi(option); +} + +template <> +float stringToValue(const std::string& option) +{ + return std::stof(option); +} + +template <> +double stringToValue(const std::string& option) +{ + return std::stod(option); +} + +template <> +bool stringToValue(const std::string& option) +{ + return true; +} + +template <> +std::vector stringToValue>(const std::string& option) +{ + std::vector shape; + std::vector dimsStrings = splitToStringVec(option, 'x'); + for (const auto& d : dimsStrings) + { + shape.push_back(stringToValue(d)); + } + return shape; +} + +template <> +nvinfer1::DataType stringToValue(const std::string& option) +{ + const std::unordered_map strToDT{{"fp32", nvinfer1::DataType::kFLOAT}, + {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8}, + {"int32", nvinfer1::DataType::kINT32}}; + const auto& dt = strToDT.find(option); + if (dt == strToDT.end()) + { + throw std::invalid_argument("Invalid DataType " + option); + } + return dt->second; +} + +template <> +nvinfer1::TensorFormats stringToValue(const std::string& option) +{ + std::vector optionStrings = splitToStringVec(option, '+'); + const std::unordered_map 
strToFmt{{"chw", nvinfer1::TensorFormat::kLINEAR}, + {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4}, + {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16}, + {"chw32", nvinfer1::TensorFormat::kCHW32}, {"dhwc8", nvinfer1::TensorFormat::kDHWC8}, + {"hwc", nvinfer1::TensorFormat::kHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR}, + {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}}; + nvinfer1::TensorFormats formats{}; + for (auto f : optionStrings) + { + const auto& tf = strToFmt.find(f); + if (tf == strToFmt.end()) + { + throw std::invalid_argument(std::string("Invalid TensorFormat ") + f); + } + formats |= 1U << static_cast(tf->second); + } + + return formats; +} + +template <> +IOFormat stringToValue(const std::string& option) +{ + IOFormat ioFormat{}; + const size_t colon = option.find(':'); + + if (colon == std::string::npos) + { + throw std::invalid_argument(std::string("Invalid IOFormat ") + option); + } + + ioFormat.first = stringToValue(option.substr(0, colon)); + ioFormat.second = stringToValue(option.substr(colon + 1)); + + return ioFormat; +} + +template +std::pair splitNameAndValue(const std::string& s) +{ + std::string tensorName; + std::string valueString; + // Split on the last : + std::vector nameRange{splitToStringVec(s, ':')}; + // Everything before the last : is the name + tensorName = nameRange[0]; + for (size_t i = 1; i < nameRange.size() - 1; i++) + { + tensorName += ":" + nameRange[i]; + } + // Value is the string element after the last : + valueString = nameRange[nameRange.size() - 1]; + return std::pair(tensorName, stringToValue(valueString)); +} + +template +void splitInsertKeyValue(const std::vector& kvList, T& map) +{ + for (const auto& kv : kvList) + { + map.insert(splitNameAndValue(kv)); + } +} + +const char* boolToEnabled(bool enable) +{ + return enable ? "Enabled" : "Disabled"; +} + +//! Check if input option exists in input arguments. +//! If it does: return its value, erase the argument and return true. +//! If it does not: return false. +template +bool getAndDelOption(Arguments& arguments, const std::string& option, T& value) +{ + const auto match = arguments.find(option); + if (match != arguments.end()) + { + value = stringToValue(match->second); + arguments.erase(match); + return true; + } + + return false; +} + +//! Check if input option exists in input arguments. +//! If it does: return false in value, erase the argument and return true. +//! If it does not: return false. +bool getAndDelNegOption(Arguments& arguments, const std::string& option, bool& value) +{ + bool dummy; + if (getAndDelOption(arguments, option, dummy)) + { + value = false; + return true; + } + return false; +} + +//! Check if input option exists in input arguments. +//! If it does: add all the matched arg values to values vector, erase the argument and return true. +//! If it does not: return false. 
+template +bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, std::vector& values) +{ + const auto match = arguments.equal_range(option); + if (match.first == match.second) + { + return false; + } + + auto addToValues = [&values](Arguments::value_type& argValue) {values.emplace_back(stringToValue(argValue.second));}; + std::for_each(match.first, match.second, addToValues); + arguments.erase(match.first, match.second); + + return true; +} + +void insertShapesBuild(std::unordered_map& shapes, nvinfer1::OptProfileSelector selector, const std::string& name, const std::vector& dims) +{ + shapes[name][static_cast(selector)] = dims; +} + +void insertShapesInference(std::unordered_map>& shapes, const std::string& name, const std::vector& dims) +{ + shapes[name] = dims; +} + +std::string removeSingleQuotationMarks(std::string& str) +{ + std::vector strList{splitToStringVec(str, '\'')}; + // Remove all the escaped single quotation marks + std::string retVal = ""; + // Do not really care about unterminated sequences + for (size_t i = 0; i < strList.size(); i++) + { + retVal += strList[i]; + } + return retVal; +} + +void getLayerPrecisions(Arguments& arguments, char const* argument, LayerPrecisions& layerPrecisions) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerPrecisions flag contains comma-separated layerName:precision pairs. + std::vector precisionList{splitToStringVec(list, ',')}; + for (auto const& s : precisionList) + { + auto namePrecisionPair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); + layerPrecisions[layerName] = namePrecisionPair.second; + } +} + +void getLayerOutputTypes(Arguments& arguments, char const* argument, LayerOutputTypes& layerOutputTypes) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerOutputTypes flag contains comma-separated layerName:types pairs. 
+ std::vector precisionList{splitToStringVec(list, ',')}; + for (auto const& s : precisionList) + { + auto namePrecisionPair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); + auto const typeStrings = splitToStringVec(namePrecisionPair.second, '+'); + std::vector typeVec(typeStrings.size(), nvinfer1::DataType::kFLOAT); + std::transform(typeStrings.begin(), typeStrings.end(), typeVec.begin(), stringToValue); + layerOutputTypes[layerName] = typeVec; + } +} + +bool getShapesBuild(Arguments& arguments, std::unordered_map& shapes, char const* argument, + nvinfer1::OptProfileSelector selector) +{ + std::string list; + bool retVal = getAndDelOption(arguments, argument, list); + std::vector shapeList{splitToStringVec(list, ',')}; + for (const auto& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesBuild(shapes, selector, tensorName, dims); + } + return retVal; +} + +bool getShapesInference(Arguments& arguments, std::unordered_map>& shapes, const char* argument) +{ + std::string list; + bool retVal = getAndDelOption(arguments, argument, list); + std::vector shapeList{splitToStringVec(list, ',')}; + for (const auto& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesInference(shapes, tensorName, dims); + } + return retVal; +} + +void processShapes(std::unordered_map& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +{ + // Only accept optShapes only or all three of minShapes, optShapes, maxShapes + if ( ((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes + || (minShapes && !maxShapes && optShapes) // both minShapes and optShapes + || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes + { + if (calib) + { + throw std::invalid_argument("Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); + } + else + { + throw std::invalid_argument("Must specify only --optShapes or all of --minShapes, --optShapes, --maxShapes"); + } + } + + // If optShapes only, expand optShapes to minShapes and maxShapes + if (optShapes && !minShapes && !maxShapes) + { + std::unordered_map newShapes; + for (auto& s : shapes) + { + insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + } + shapes = newShapes; + } +} + +template +void printShapes(std::ostream& os, const char* phase, const T& shapes) +{ + if (shapes.empty()) + { + os << "Input " << phase << " shapes: model" << std::endl; + } + else + { + for (const auto& s : shapes) + { + os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl; + } + } +} + +std::ostream& printBatch(std::ostream& os, int32_t maxBatch) +{ + if (maxBatch != maxBatchNotProvided) + { + os << maxBatch; + } + else + { + os << "explicit batch"; + } + return os; +} + +std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources 
disabledSources) +{ + if (!enabledSources && !disabledSources) + { + os << "Using default tactic sources"; + } + else + { + auto const addSource = [&](uint32_t source, std::string const& name) { + if (enabledSources & source) + { + os << name << " [ON], "; + } + else if (disabledSources & source) + { + os << name << " [OFF], "; + } + }; + + addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS), "cublas"); + addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT), "cublasLt"); +#if (NV_TENSORRT_MAJOR > 7) + addSource(1U << static_cast(nvinfer1::TacticSource::kCUDNN), "cudnn"); +#endif + } + return os; +} + +std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) +{ + os << "FP32"; + if (options.fp16) + { + os << "+FP16"; + } + if (options.int8) + { + os << "+INT8"; + } + if (options.precisionConstraints == PrecisionConstraints::kOBEY) + { + os << " (obey precision constraints)"; + } + if (options.precisionConstraints == PrecisionConstraints::kPREFER) + { + os << " (prefer precision constraints)"; + } + return os; +} + +std::ostream& printTimingCache(std::ostream& os, BuildOptions const& options) +{ + switch (options.timingCacheMode) + { + case TimingCacheMode::kGLOBAL: os << "global"; break; + case TimingCacheMode::kLOCAL: os << "local"; break; + case TimingCacheMode::kDISABLE: os << "disable"; break; + } + return os; +} + +std::ostream& printSparsity(std::ostream& os, BuildOptions const& options) +{ + switch (options.sparsity) + { + case SparsityFlag::kDISABLE: os << "Disabled"; break; + case SparsityFlag::kENABLE: os << "Enabled"; break; + case SparsityFlag::kFORCE: os << "Forced"; break; + } + + return os; +} + +std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options) +{ + auto const printValueOrDefault = [&os](double const val) { + if (val >= 0) + { + os << val << " MiB"; + } + else + { + os << "default"; + } + }; + os << "workspace: "; printValueOrDefault(options.workspace); os << ", "; + os << "dlaSRAM: "; printValueOrDefault(options.dlaSRAM); os << ", "; + os << "dlaLocalDRAM: "; printValueOrDefault(options.dlaLocalDRAM); os << ", "; + os << "dlaGlobalDRAM: "; printValueOrDefault(options.dlaGlobalDRAM); + return os; +} + +} // namespace + +Arguments argsToArgumentsMap(int32_t argc, char* argv[]) +{ + Arguments arguments; + for (int32_t i = 1; i < argc; ++i) + { + auto valuePtr = strchr(argv[i], '='); + if (valuePtr) + { + std::string value{valuePtr + 1}; + arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); + } + else + { + arguments.emplace(argv[i], ""); + } + } + return arguments; +} + +void BaseModelOptions::parse(Arguments& arguments) +{ + if (getAndDelOption(arguments, "--onnx", model)) + { + format = ModelFormat::kONNX; + } + else if (getAndDelOption(arguments, "--uff", model)) + { + format = ModelFormat::kUFF; + } + else if (getAndDelOption(arguments, "--model", model)) + { + format = ModelFormat::kCAFFE; + } +} + +void UffInput::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--uffNHWC", NHWC); + std::vector args; + if (getAndDelRepeatedOption(arguments, "--uffInput", args)) + { + for (const auto& i : args) + { + std::vector values{splitToStringVec(i, ',')}; + if (values.size() == 4) + { + nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])}; + inputs.emplace_back(values[0], dims); + } + else + { + throw std::invalid_argument(std::string("Invalid uffInput ") + i); + } + } + } +} + +void ModelOptions::parse(Arguments& arguments) +{ + 
baseModel.parse(arguments); + + switch (baseModel.format) + { + case ModelFormat::kCAFFE: + { + getAndDelOption(arguments, "--deploy", prototxt); + break; + } + case ModelFormat::kUFF: + { + uffInputs.parse(arguments); + if (uffInputs.inputs.empty()) + { + throw std::invalid_argument("Uff models require at least one input"); + } + break; + } + case ModelFormat::kONNX: + break; + case ModelFormat::kANY: + { + if (getAndDelOption(arguments, "--deploy", prototxt)) + { + baseModel.format = ModelFormat::kCAFFE; + } + break; + } + } + + // The --output flag should only be used with Caffe and UFF. It has no effect on ONNX. + std::vector outArgs; + if (getAndDelRepeatedOption(arguments, "--output", outArgs)) + { + for (const auto& o : outArgs) + { + for (auto& v : splitToStringVec(o, ',')) + { + outputs.emplace_back(std::move(v)); + } + } + } + if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF) + { + if (outputs.empty()) + { + throw std::invalid_argument("Caffe and Uff models require at least one output"); + } + } + else if (baseModel.format == ModelFormat::kONNX) + { + if (!outputs.empty()) + { + throw std::invalid_argument("The --output flag should not be used with ONNX models."); + } + } +} + +void BuildOptions::parse(Arguments& arguments) +{ + auto getFormats = [&arguments](std::vector& formatsVector, const char* argument) { + std::string list; + getAndDelOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) + { + formatsVector.push_back(stringToValue(f)); + } + }; + + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + + bool addedExplicitBatchFlag{false}; + getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag); + if (addedExplicitBatchFlag) + { + sample::gLogWarning << "--explicitBatch flag has been deprecated and has no effect!" << std::endl; + sample::gLogWarning << "Explicit batch dim is automatically enabled if input model is ONNX or if dynamic " + << "shapes are provided when the engine is built." << std::endl; + } + + bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + processShapes(shapes, minShapes, optShapes, maxShapes, false); + bool minShapesCalib + = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); + bool optShapesCalib + = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); + bool maxShapesCalib + = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); + processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); + + bool addedExplicitPrecisionFlag{false}; + getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag); + if (addedExplicitPrecisionFlag) + { + sample::gLogWarning << "--explicitPrecision flag has been deprecated and has no effect!" << std::endl; + } + + if (getAndDelOption(arguments, "--workspace", workspace)) + { + sample::gLogWarning << "--workspace flag has been deprecated by --memPoolSize flag." 
<< std::endl; + } + + std::string memPoolSizes; + getAndDelOption(arguments, "--memPoolSize", memPoolSizes); + std::vector memPoolSpecs{splitToStringVec(memPoolSizes, ',')}; + for (auto const& memPoolSpec : memPoolSpecs) + { + std::string memPoolName; + double memPoolSize; + std::tie(memPoolName, memPoolSize) = splitNameAndValue(memPoolSpec); + if (memPoolSize < 0) + { + throw std::invalid_argument(std::string("Negative memory pool size: ") + std::to_string(memPoolSize)); + } + if (memPoolName == "workspace") + { + workspace = memPoolSize; + } + else if (memPoolName == "dlaSRAM") + { + dlaSRAM = memPoolSize; + } + else if (memPoolName == "dlaLocalDRAM") + { + dlaLocalDRAM = memPoolSize; + } + else if (memPoolName == "dlaGlobalDRAM") + { + dlaGlobalDRAM = memPoolSize; + } + else if (!memPoolName.empty()) + { + throw std::invalid_argument(std::string("Unknown memory pool: ") + memPoolName); + } + } + + getAndDelOption(arguments, "--maxBatch", maxBatch); + getAndDelOption(arguments, "--minTiming", minTiming); + getAndDelOption(arguments, "--avgTiming", avgTiming); + + bool best{false}; + getAndDelOption(arguments, "--best", best); + if (best) + { + int8 = true; + fp16 = true; + } + + getAndDelOption(arguments, "--refit", refittable); + getAndDelNegOption(arguments, "--noTF32", tf32); + getAndDelOption(arguments, "--fp16", fp16); + getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--safe", safe); + getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--restricted", restricted); + + getAndDelOption(arguments, "--directIO", directIO); + + std::string precisionConstraintsString; + getAndDelOption(arguments, "--precisionConstraints", precisionConstraintsString); + if (!precisionConstraintsString.empty()) + { + const std::unordered_map precisionConstraintsMap + = {{"obey", PrecisionConstraints::kOBEY}, {"prefer", PrecisionConstraints::kPREFER}, + {"none", PrecisionConstraints::kNONE}}; + auto it = precisionConstraintsMap.find(precisionConstraintsString); + if (it == precisionConstraintsMap.end()) + { + throw std::invalid_argument(std::string("Unknown precision constraints: ") + precisionConstraintsString); + } + precisionConstraints = it->second; + } + else + { + precisionConstraints = PrecisionConstraints::kNONE; + } + + getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions); + getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes); + + if (layerPrecisions.empty() && layerOutputTypes.empty() && precisionConstraints != PrecisionConstraints::kNONE) + { + sample::gLogWarning << "When --precisionConstraints flag is set to \"obey\" or \"prefer\", please add " + << "--layerPrecision/--layerOutputTypes flags to set layer-wise precisions and output " + << "types." << std::endl; + } + else if ((!layerPrecisions.empty() || !layerOutputTypes.empty()) + && precisionConstraints == PrecisionConstraints::kNONE) + { + sample::gLogWarning << "--layerPrecision/--layerOutputTypes flags have no effect when --precisionConstraints " + << "flag is set to \"none\"." 
<< std::endl; + } + + std::string sparsityString; + getAndDelOption(arguments, "--sparsity", sparsityString); + if (sparsityString == "disable") + { + sparsity = SparsityFlag::kDISABLE; + } + else if (sparsityString == "enable") + { + sparsity = SparsityFlag::kENABLE; + } + else if (sparsityString == "force") + { + sparsity = SparsityFlag::kFORCE; + } + else if (!sparsityString.empty()) + { + throw std::invalid_argument(std::string("Unknown sparsity mode: ") + sparsityString); + } + + bool calibCheck = getAndDelOption(arguments, "--calib", calibration); + if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty()) + { + shapesCalib = shapes; + } + + std::string profilingVerbosityString; + if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString)) + { + sample::gLogWarning << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." << std::endl; + } + + getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString); + if (profilingVerbosityString == "layer_names_only") + { +#if (NV_TENSORRT_MAJOR > 7) + profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; +#else + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; +#endif + } + else if (profilingVerbosityString == "none") + { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE; + } +#if (NV_TENSORRT_MAJOR > 7) + else if (profilingVerbosityString == "detailed") + { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; + } +#endif + else if (profilingVerbosityString == "default") + { +#if (NV_TENSORRT_MAJOR > 7) + sample::gLogWarning << "--profilingVerbosity=default has been deprecated by " + "--profilingVerbosity=layer_names_only." + << std::endl; + profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; +#else + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; +#endif + } + else if (profilingVerbosityString == "verbose") + { +#if (NV_TENSORRT_MAJOR > 7) + sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated by --profilingVerbosity=detailed." 
+ << std::endl; + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; +#else + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; +#endif + } + else if (!profilingVerbosityString.empty()) + { + throw std::invalid_argument(std::string("Unknown profilingVerbosity: ") + profilingVerbosityString); + } + + if (getAndDelOption(arguments, "--loadEngine", engine)) + { + load = true; + } + if (getAndDelOption(arguments, "--saveEngine", engine)) + { + save = true; + } + if (load && save) + { + throw std::invalid_argument("Incompatible load and save engine options selected"); + } + + std::string tacticSourceArgs; + if (getAndDelOption(arguments, "--tacticSources", tacticSourceArgs)) + { + std::vector tacticList = splitToStringVec(tacticSourceArgs, ','); + for (auto& t : tacticList) + { + bool enable{false}; + if (t.front() == '+') + { + enable = true; + } + else if (t.front() != '-') + { + throw std::invalid_argument( + "Tactic source must be prefixed with + or -, indicating whether it should be enabled or disabled " + "respectively."); + } + t.erase(0, 1); + + const auto toUpper = [](std::string& sourceName) { + std::transform( + sourceName.begin(), sourceName.end(), sourceName.begin(), [](char c) { return std::toupper(c); }); + return sourceName; + }; + + nvinfer1::TacticSource source{}; + t = toUpper(t); + if (t == "CUBLAS") + { + source = nvinfer1::TacticSource::kCUBLAS; + } + else if (t == "CUBLASLT" || t == "CUBLAS_LT") + { + source = nvinfer1::TacticSource::kCUBLAS_LT; + } +#if (NV_TENSORRT_MAJOR > 7) + else if (t == "CUDNN") + { + source = nvinfer1::TacticSource::kCUDNN; + } +#endif + else + { + throw std::invalid_argument(std::string("Unknown tactic source: ") + t); + } + + uint32_t sourceBit = 1U << static_cast(source); + + if (enable) + { + enabledTactics |= sourceBit; + } + else + { + disabledTactics |= sourceBit; + } + + if (enabledTactics & disabledTactics) + { + throw std::invalid_argument(std::string("Cannot enable and disable ") + t); + } + } + } + + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } +} + +void SystemOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--device", device); + getAndDelOption(arguments, "--useDLACore", DLACore); + getAndDelOption(arguments, "--allowGPUFallback", fallback); + std::string pluginName; + while (getAndDelOption(arguments, "--plugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +} + +void InferenceOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--streams", streams); + getAndDelOption(arguments, "--iterations", iterations); + getAndDelOption(arguments, "--duration", duration); + getAndDelOption(arguments, "--warmUp", warmup); + getAndDelOption(arguments, "--sleepTime", sleep); + getAndDelOption(arguments, "--idleTime", idle); + bool exposeDMA{false}; + if (getAndDelOption(arguments, "--exposeDMA", exposeDMA)) + { + overlap = !exposeDMA; + } + getAndDelOption(arguments, "--noDataTransfers", skipTransfers); + getAndDelOption(arguments, "--useManagedMemory", useManaged); + getAndDelOption(arguments, "--useSpinWait", spin); + getAndDelOption(arguments, "--threads", threads); + getAndDelOption(arguments, "--useCudaGraph", graph); + 
getAndDelOption(arguments, "--separateProfileRun", rerun); + getAndDelOption(arguments, "--buildOnly", skip); + getAndDelOption(arguments, "--timeDeserialize", timeDeserialize); + getAndDelOption(arguments, "--timeRefit", timeRefit); + + std::string list; + getAndDelOption(arguments, "--loadInputs", list); + std::vector inputsList{splitToStringVec(list, ',')}; + splitInsertKeyValue(inputsList, inputs); + + getShapesInference(arguments, shapes, "--shapes"); + getAndDelOption(arguments, "--batch", batch); +} + +void ReportingOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--percentile", percentile); + getAndDelOption(arguments, "--avgRuns", avgs); + getAndDelOption(arguments, "--verbose", verbose); + getAndDelOption(arguments, "--dumpRefit", refit); + getAndDelOption(arguments, "--dumpOutput", output); + getAndDelOption(arguments, "--dumpProfile", profile); + getAndDelOption(arguments, "--dumpLayerInfo", layerInfo); + getAndDelOption(arguments, "--exportTimes", exportTimes); + getAndDelOption(arguments, "--exportOutput", exportOutput); + getAndDelOption(arguments, "--exportProfile", exportProfile); + getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo); + if (percentile < 0 || percentile > 100) + { + throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + } +} + +bool parseHelp(Arguments& arguments) +{ + bool helpLong{false}; + bool helpShort{false}; + getAndDelOption(arguments, "--help", helpLong); + getAndDelOption(arguments, "-h", helpShort); + return helpLong || helpShort; +} + +void AllOptions::parse(Arguments& arguments) +{ + model.parse(arguments); + build.parse(arguments); + system.parse(arguments); + inference.parse(arguments); + + // Use explicitBatch when input model is ONNX or when dynamic shapes are used. + const bool isOnnx{model.baseModel.format == ModelFormat::kONNX}; + const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()}; + const bool detectedExplicitBatch = isOnnx || hasDynamicShapes; + + // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim. + const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided}; + const bool batchWasSet{inference.batch != batchNotProvided}; + if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet)) + { + throw std::invalid_argument( + "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes " + "are provided. Please use --optShapes and --shapes to set input shapes instead."); + } + + // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values. + if (!detectedExplicitBatch) + { + // If batch is not set, set it to default value. + if (!batchWasSet) + { + inference.batch = defaultBatch; + } + // If maxBatch is not set, set it to be equal to batch. + if (!maxBatchWasSet) + { + build.maxBatch = inference.batch; + } + // MaxBatch should not be less than batch. + if (build.maxBatch < inference.batch) + { + throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch) + + " is less than inference batch " + std::to_string(inference.batch)); + } + } + + if (build.shapes.empty() && !inference.shapes.empty()) + { + // If --shapes are provided but --optShapes are not, assume that optShapes is the same as shapes. 
+ for (auto& s : inference.shapes) + { + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second); + } + } + else if (!build.shapes.empty() && inference.shapes.empty()) + { + // If --optShapes are provided but --shapes are not, assume that shapes is the same as optShapes. + for (auto& s : build.shapes) + { + insertShapesInference( + inference.shapes, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + } + } + + reporting.parse(arguments); + helps = parseHelp(arguments); + + if (!helps) + { + if (!build.load && model.baseModel.format == ModelFormat::kANY) + { + throw std::invalid_argument("Model missing or format not recognized"); + } + if (build.safe && system.DLACore >= 0) + { + auto checkSafeDLAFormats = [](std::vector const& fmt) { + return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [](IOFormat const& pair) { + bool supported{false}; + bool const isLINEAR{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kLINEAR)}; + bool const isCHW4{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW4)}; + bool const isCHW32{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW32)}; + bool const isCHW16{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW16)}; + supported |= pair.first == nvinfer1::DataType::kINT8 && (isLINEAR || isCHW4 || isCHW32); + supported |= pair.first == nvinfer1::DataType::kHALF && (isLINEAR || isCHW4 || isCHW16); + return supported; + }); + }; + if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.outputFormats)) + { + throw std::invalid_argument( + "I/O formats for safe DLA capability are restricted to fp16/int8:linear, fp16:chw16 or int8:chw32"); + } + if (system.fallback) + { + throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability"); + } + } + } +} + +void SafeBuilderOptions::parse(Arguments& arguments) +{ + auto getFormats = [&arguments](std::vector& formatsVector, const char* argument) { + std::string list; + getAndDelOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) + { + formatsVector.push_back(stringToValue(f)); + } + }; + + getAndDelOption(arguments, "--serialized", serialized); + getAndDelOption(arguments, "--onnx", onnxModelFile); + getAndDelOption(arguments, "--help", help); + getAndDelOption(arguments, "-h", help); + getAndDelOption(arguments, "--verbose", verbose); + getAndDelOption(arguments, "-v", verbose); + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--calib", calibFile); + getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--std", standard); + std::string pluginName; + while (getAndDelOption(arguments, "--plugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +} + +std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) +{ + os << "=== Model Options ===" << std::endl; + + os << "Format: "; + switch (options.format) + { + case ModelFormat::kCAFFE: + { + os << "Caffe"; + break; + } + case ModelFormat::kONNX: + { + os << "ONNX"; + break; + } + case ModelFormat::kUFF: + { + os << "UFF"; + break; + } + case ModelFormat::kANY: + os << "*"; + break; + } + os << 
std::endl << "Model: " << options.model << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, const UffInput& input) +{ + os << "Uff Inputs Layout: " << (input.NHWC ? "NHWC" : "NCHW") << std::endl; + for (const auto& i : input.inputs) + { + os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ModelOptions& options) +{ + os << options.baseModel; + switch (options.baseModel.format) + { + case ModelFormat::kCAFFE: + { + os << "Prototxt: " << options.prototxt << std::endl; + break; + } + case ModelFormat::kUFF: + { + os << options.uffInputs; + break; + } + case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case + case ModelFormat::kANY: + break; + } + + os << "Output:"; + for (const auto& o : options.outputs) + { + os << " " << o; + } + os << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) +{ + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + { + os << "fp32"; + break; + } + case nvinfer1::DataType::kHALF: + { + os << "fp16"; + break; + } + case nvinfer1::DataType::kINT8: + { + os << "int8"; + break; + } + case nvinfer1::DataType::kINT32: + { + os << "int32"; + break; + } + case nvinfer1::DataType::kBOOL: + { + os << "bool"; + break; + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, IOFormat const& format) +{ + os << format.first << ":"; + + for (int32_t f = 0; f < nvinfer1::EnumMax(); ++f) + { + if ((1U << f) & format.second) + { + if (f) + { + os << "+"; + } + switch (nvinfer1::TensorFormat(f)) + { + case nvinfer1::TensorFormat::kLINEAR: + { + os << "chw"; + break; + } + case nvinfer1::TensorFormat::kCHW2: + { + os << "chw2"; + break; + } + case nvinfer1::TensorFormat::kHWC8: + { + os << "hwc8"; + break; + } +#if (NV_TENSORRT_MAJOR > 7) + case nvinfer1::TensorFormat::kHWC16: + { + os << "hwc16"; + break; + } +#endif + case nvinfer1::TensorFormat::kCHW4: + { + os << "chw4"; + break; + } + case nvinfer1::TensorFormat::kCHW16: + { + os << "chw16"; + break; + } + case nvinfer1::TensorFormat::kCHW32: + { + os << "chw32"; + break; + } + case nvinfer1::TensorFormat::kDHWC8: + { + os << "dhwc8"; + break; + } + case nvinfer1::TensorFormat::kCDHW32: + { + os << "cdhw32"; + break; + } + case nvinfer1::TensorFormat::kHWC: + { + os << "hwc"; + break; + } + case nvinfer1::TensorFormat::kDLA_LINEAR: + { + os << "dla_linear"; + break; + } + case nvinfer1::TensorFormat::kDLA_HWC4: + { + os << "dla_hwc4"; + break; + } + } + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) +{ + int32_t i = 0; + for (const auto& d : dims) + { + if (!d.size()) + { + break; + } + os << (i ? "+" : "") << d; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, LayerPrecisions const& layerPrecisions) +{ + int32_t i = 0; + for (auto const& layerPrecision : layerPrecisions) + { + os << (i ? 
"," : "") << layerPrecision.first << ":" << layerPrecision.second; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const BuildOptions& options) +{ + // clang-format off + os << "=== Build Options ===" << std::endl << + + "Max batch: "; printBatch(os, options.maxBatch) << std::endl << + "Memory Pools: "; printMemoryPools(os, options) << std::endl << + "minTiming: " << options.minTiming << std::endl << + "avgTiming: " << options.avgTiming << std::endl << + "Precision: "; printPrecision(os, options) << std::endl << + "LayerPrecisions: " << options.layerPrecisions << std::endl << + "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << + "Refit: " << boolToEnabled(options.refittable) << std::endl << + "Sparsity: "; printSparsity(os, options) << std::endl << + "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << + "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << + "Save engine: " << (options.save ? options.engine : "") << std::endl << + "Load engine: " << (options.load ? options.engine : "") << std::endl << + "Profiling verbosity: " << static_cast(options.profilingVerbosity) << std::endl << + "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << + "timingCacheMode: "; printTimingCache(os, options) << std::endl << + "timingCacheFile: " << options.timingCacheFile << std::endl; + // clang-format on + + auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { + if (formats.empty()) + { + os << direction << "s format: fp32:CHW" << std::endl; + } + else + { + for(const auto& f : formats) + { + os << direction << ": " << f << std::endl; + } + } + }; + + printIOFormats(os, "Input(s)", options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + printShapes(os, "build", options.shapes); + printShapes(os, "calibration", options.shapesCalib); + + return os; +} + +std::ostream& operator<<(std::ostream& os, const SystemOptions& options) +{ + // clang-format off + os << "=== System Options ===" << std::endl << + + "Device: " << options.device << std::endl << + "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << + (options.DLACore != -1 && options.fallback ? 
"(With GPU fallback)" : "") << std::endl; + os << "Plugins:"; + + for (const auto& p : options.plugins) + { + os << " " << p; + } + os << std::endl; + + return os; + // clang-format on +} + +std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) +{ +// clang-format off + os << "=== Inference Options ===" << std::endl << + + "Batch: "; + if (options.batch && options.shapes.empty()) + { + os << options.batch << std::endl; + } + else + { + os << "Explicit" << std::endl; + } + printShapes(os, "inference", options.shapes); + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Idle time: " << options.idle << "ms" << std::endl << + "Streams: " << options.streams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << + "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << + "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << + "Skip inference: " << boolToEnabled(options.skip) << std::endl; + +// clang-format on + os << "Inputs:" << std::endl; + for (const auto& input : options.inputs) + { + os << input.first << "<-" << input.second << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) +{ +// clang-format off + os << "=== Reporting Options ===" << std::endl << + + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentile: " << options.percentile << std::endl << + "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export output to JSON file: " << options.exportOutput << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; +// clang-format on + + return os; +} + +std::ostream& operator<<(std::ostream& os, const AllOptions& options) +{ + os << options.model << options.build << options.system << options.inference << options.reporting << std::endl; + return os; +} + +std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) +{ + auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { + if (formats.empty()) + { + os << direction << "s format: fp32:CHW" << std::endl; + } + else + { + for(const auto& f : formats) + { + os << direction << ": " << f << std::endl; + } + } + }; + + os << "=== Build Options ===" << std::endl; + os << "Model ONNX: " << options.onnxModelFile << std::endl; + + os << "Precision: FP16"; + if (options.int8) + { + os << " + INT8"; + } + os << std::endl; + os << "Calibration file: " << options.calibFile << std::endl; + os << "Serialized Network: " << options.serialized << std::endl; + + printIOFormats(os, "Input(s)", options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + + os << "Plugins:"; + for (const auto& 
p : options.plugins) + { + os << " " << p; + } + os << std::endl; + return os; +} + +void BaseModelOptions::help(std::ostream& os) +{ +// clang-format off + os << " --uff= UFF model" << std::endl << + " --onnx= ONNX model" << std::endl << + " --model= Caffe model (default = no model, random weights used)" << std::endl; +// clang-format on +} + +void UffInput::help(std::ostream& os) +{ +// clang-format off + os << " --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " + "multiple times; at least one is required for UFF models" << std::endl << + " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << + "X,Y,Z=H,W,C order in --uffInput)" << std::endl; +// clang-format on +} + +void ModelOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== Model Options ===" << std::endl; + BaseModelOptions::help(os); + os << " --deploy= Caffe prototxt file" << std::endl << + " --output=[,]* Output names (it can be specified multiple times); at least one output " + "is required for UFF and Caffe" << std::endl; + UffInput::help(os); +// clang-format on +} + +void BuildOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== Build Options ===" "\n" + " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" + " This option should not be used when the input model is ONNX or when dynamic shapes are provided." "\n" + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" + " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" + " Note: All three of min, opt and max shapes must be supplied." "\n" + " However, if only opt shapes is supplied then it will be expanded so" "\n" + " that min shapes and max shapes are set to the same values as opt shapes." "\n" + " Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." "\n" + " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" + " Each input shape is supplied as a key-value pair where key is the input name and" "\n" + " value is the dimensions (including the batch dimension) to be used for that input." "\n" + " Each key-value pair has the key and value separated using a colon (:)." "\n" + " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" + " See --outputIOFormats help for the grammar of type and format list." "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " inputs following the same order as network inputs ID (even if only one input" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." 
"\n" + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " outputs following the same order as network outputs ID (even if only one output" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + " IO Formats: spec ::= IOfmt[\",\"spec]" "\n" + " IOfmt ::= type:fmt" "\n" + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" + " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" "\n" + " --workspace=N Set workspace size in MiB." "\n" + " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" + " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" + " Pool constraint: poolspec ::= poolfmt[\",\"poolspec]" "\n" + " poolfmt ::= pool:sizeInMiB" "\n" + " pool ::= \"workspace\"|\"dlaSRAM\"|\"dlaLocalDRAM\"|\"dlaGlobalDRAM\"" "\n" + " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" + " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " + << defaultMinTiming << ")" "\n" + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" "\n" + " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" + " and weights within the engine." "\n" + " --sparsity=spec Control sparsity (default = disabled). " "\n" + " Sparsity: spec ::= \"disable\", \"enable\", \"force\"" "\n" + " Note: Description about each of these options is as below" "\n" + " disable = do not enable sparse tactics in the builder (this is the default)" "\n" + " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" + " considered if the weights have the right sparsity pattern)" "\n" + " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" + " a sparsity pattern (even if you loaded a model yourself)" "\n" + " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" + " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" + " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" + " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" + " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" + " Precision Constaints: spec ::= \"none\" | \"obey\" | \"prefer\"" "\n" + " none = no constraints" "\n" + " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" + " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" + " otherwise" "\n" + " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" + " \"obey\" or \"prefer\". (default = none)" "\n" + " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" + " layerName to specify the default precision for all the unspecified layers." 
"\n" + " Per-layer precision spec ::= layerPrecision[\",\"spec]" "\n" + " layerPrecision ::= layerName\":\"precision" "\n" + " precision ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" + " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" + " \"obey\" or \"prefer\". (default = none)" "\n" + " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" + " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" + " one output, then multiple types separated by \"+\" can be provided for this layer." "\n" + " Per-layer output type spec ::= layerOutputTypes[\",\"spec]" "\n" + " layerOutputTypes ::= layerName\":\"type" "\n" + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"[\"+\"type]" "\n" + " --calib= Read INT8 calibration cache file" "\n" + " --safe Enable build safety certified engine" "\n" + " --consistency Perform consistency checking on safety certified engine" "\n" + " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" + " --saveEngine= Save the serialized engine" "\n" + " --loadEngine= Load a serialized engine" "\n" + " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" + " tactic sources (default = all available tactics)." "\n" + " Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics." "\n" + " Tactic Sources: tactics ::= [\",\"tactic]" "\n" + " tactic ::= (+|-)lib" "\n" + " lib ::= \"CUBLAS\"|\"CUBLAS_LT\"|\"CUDNN\"" "\n" + " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" + " --timingCacheFile= Save/load the serialized global timing cache" "\n" + ; +// clang-format on + os << std::flush; +} + +void SystemOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== System Options ===" << std::endl << + " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << + " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " + "(default = disabled)" << std::endl; + os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; +// clang-format on +} + +void InferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Inference Options ===" << std::endl << + " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << + " shapes are provided when the engine is built." << std::endl << + " --shapes=spec Set input shapes for dynamic shapes inference inputs." << std::endl << + " Note: Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." << std::endl << + " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << + " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << + " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << + " Each key-value pair has the key and value separated using a colon (:)." 
<< std::endl <<
+          "                              Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl <<
+          "  --loadInputs=spec           Load input values from files (default = generate random inputs). Input names can be "
+          "wrapped with single quotes (ex: 'Input:0')" << std::endl <<
+          "                              Input values spec ::= Ival[\",\"spec]" << std::endl <<
+          "                                           Ival  ::= name\":\"file" << std::endl <<
+          "  --iterations=N              Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl <<
+          "  --warmUp=N                  Run for N milliseconds to warm up before measuring performance (default = "
+          << defaultWarmUp << ")" << std::endl <<
+          "  --duration=N                Run performance measurements for at least N seconds wallclock time (default = "
+          << defaultDuration << ")" << std::endl <<
+          "  --sleepTime=N               Delay inference start with a gap of N milliseconds between launch and compute "
+          "(default = " << defaultSleep << ")" << std::endl <<
+          "  --idleTime=N                Sleep N milliseconds between two continuous iterations "
+          "(default = " << defaultIdle << ")" << std::endl <<
+          "  --streams=N                 Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl <<
+          "  --exposeDMA                 Serialize DMA transfers to and from device (default = disabled)." << std::endl <<
+          "  --noDataTransfers           Disable DMA transfers to and from device (default = enabled)." << std::endl <<
+          "  --useManagedMemory          Use managed memory instead of separate host and device allocations (default = disabled)." << std::endl <<
+          "  --useSpinWait               Actively synchronize on GPU events. This option may decrease synchronization time but "
+          "increase CPU usage and power (default = disabled)" << std::endl <<
+          "  --threads                   Enable multithreading to drive engines with independent threads"
+          " or speed up refitting (default = disabled) " << std::endl <<
+          "  --useCudaGraph              Use CUDA graph to capture engine execution and then launch inference (default = disabled)." << std::endl <<
+          "                              This flag may be ignored if the graph capture fails." << std::endl <<
+          "  --timeDeserialize           Time the amount of time it takes to deserialize the network and exit." << std::endl <<
+          "  --timeRefit                 Time the amount of time it takes to refit the engine before inference."
<< std::endl <<
+          "  --separateProfileRun        Do not attach the profiler in the benchmark run; if profiling is enabled, a second "
+          "profile run will be executed (default = disabled)" << std::endl <<
+          "  --buildOnly                 Skip inference perf measurement (default = disabled)" << std::endl;
+    // clang-format on
+}
+
+void ReportingOptions::help(std::ostream& os)
+{
+// clang-format off
+    os << "=== Reporting Options ===" << std::endl <<
+          "  --verbose                   Use verbose logging (default = false)" << std::endl <<
+          "  --avgRuns=N                 Report performance measurements averaged over N consecutive "
+          "iterations (default = " << defaultAvgRuns << ")" << std::endl <<
+          "  --percentile=P              Report performance for the P percentage (0<=P<=100, 0 "
+          "representing max perf, and 100 representing min perf) (default"
+          " = " << defaultPercentile << "%)" << std::endl <<
+          "  --dumpRefit                 Print the refittable layers and weights from a refittable "
+          "engine" << std::endl <<
+          "  --dumpOutput                Print the output tensor(s) of the last inference iteration "
+          "(default = disabled)" << std::endl <<
+          "  --dumpProfile               Print profile information per layer (default = disabled)" << std::endl <<
+          "  --dumpLayerInfo             Print layer information of the engine to console "
+          "(default = disabled)" << std::endl <<
+          "  --exportTimes=<file>        Write the timing results in a json file (default = disabled)" << std::endl <<
+          "  --exportOutput=<file>       Write the output tensors to a json file (default = disabled)" << std::endl <<
+          "  --exportProfile=<file>      Write the profile information per layer in a json file "
+          "(default = disabled)" << std::endl <<
+          "  --exportLayerInfo=<file>    Write the layer information of the engine in a json file "
+          "(default = disabled)" << std::endl;
+// clang-format on
+}
+
+void helpHelp(std::ostream& os)
+{
+// clang-format off
+    os << "=== Help ===" << std::endl <<
+          "  --help, -h                  Print this message" << std::endl;
+// clang-format on
+}
+
+void AllOptions::help(std::ostream& os)
+{
+    ModelOptions::help(os);
+    os << std::endl;
+    BuildOptions::help(os);
+    os << std::endl;
+    InferenceOptions::help(os);
+    os << std::endl;
+// clang-format off
+    os << "=== Build and Inference Batch Options ===" << std::endl <<
+          "                              When using implicit batch, the max batch size of the engine, if not given, " << std::endl <<
+          "                              is set to the inference batch size;" << std::endl <<
+          "                              when using explicit batch, if shapes are specified only for inference, they " << std::endl <<
+          "                              will be used also as min/opt/max in the build profile; if shapes are " << std::endl <<
+          "                              specified only for the build, the opt shapes will be used also for inference;" << std::endl <<
+          "                              if both are specified, they must be compatible; and if explicit batch is " << std::endl <<
+          "                              enabled but neither is specified, the model must provide complete static" << std::endl <<
+          "                              dimensions, including batch size, for all inputs" << std::endl <<
+          "                              Using ONNX models automatically forces explicit batch." << std::endl <<
+          std::endl;
+    // clang-format on
+    ReportingOptions::help(os);
+    os << std::endl;
+    SystemOptions::help(os);
+    os << std::endl;
+    helpHelp(os);
+}
+
+void SafeBuilderOptions::printHelp(std::ostream& os)
+{
+// clang-format off
+    os << "=== Mandatory ===" << std::endl <<
+          "  --onnx=<file>               ONNX model" << std::endl <<
+          " " << std::endl <<
+          "=== Optional ===" << std::endl <<
+          "  --inputIOFormats=spec       Type and format of each of the input tensors (default = all inputs in fp32:chw)" << std::endl <<
+          "                              See --outputIOFormats help for the grammar of type and format list."
<< std::endl <<
+          "                              Note: If this option is specified, please set comma-separated types and formats for all" << std::endl <<
+          "                                    inputs following the same order as network inputs ID (even if only one input" << std::endl <<
+          "                                    needs specifying IO format) or set the type and format once for broadcasting." << std::endl <<
+          "  --outputIOFormats=spec      Type and format of each of the output tensors (default = all outputs in fp32:chw)" << std::endl <<
+          "                              Note: If this option is specified, please set comma-separated types and formats for all" << std::endl <<
+          "                                    outputs following the same order as network outputs ID (even if only one output" << std::endl <<
+          "                                    needs specifying IO format) or set the type and format once for broadcasting." << std::endl <<
+          "                              IO Formats: spec  ::= IOfmt[\",\"spec]" << std::endl <<
+          "                                          IOfmt ::= type:fmt" << std::endl <<
+          "                                          type  ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl <<
+          "                                          fmt   ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" << std::endl <<
+          "  --int8                      Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl <<
+          "  --consistency               Enable consistency check for serialized engine, (default = disabled)" << std::endl <<
+          "  --std                       Build standard serialized engine, (default = disabled)" << std::endl <<
+          "  --calib=<file>              Read INT8 calibration cache file" << std::endl <<
+          "  --serialized=<file>         Save the serialized network" << std::endl <<
+          "  --plugins                   Plugin library (.so) to load (can be specified multiple times)" << std::endl <<
+          "  --verbose or -v             Use verbose logging (default = false)" << std::endl <<
+          "  --help or -h                Print this message" << std::endl <<
+          " " << std::endl;
+// clang-format on
+}
+
+} // namespace sample
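For illustration only, a hypothetical invocation of a tool built on these options, combining the dynamic-shape and IO-format specs documented above (the binary and file names are placeholders, not part of this patch):

    ./trt_sample --onnx=model.onnx \
                 --minShapes=input0:1x3x256x256 --optShapes=input0:8x3x256x256 --maxShapes=input0:16x3x256x256 \
                 --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16 --saveEngine=model.engine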
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h
new file mode 100644
index 00000000..8975e1ea
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_OPTIONS_H
+#define TRT_SAMPLE_OPTIONS_H
+
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "NvInfer.h"
+
+namespace sample
+{
+
+// Build default params
+constexpr int32_t maxBatchNotProvided{0};
+constexpr int32_t defaultMinTiming{1};
+constexpr int32_t defaultAvgTiming{8};
+
+// System default params
+constexpr int32_t defaultDevice{0};
+
+// Inference default params
+constexpr int32_t defaultBatch{1};
+constexpr int32_t batchNotProvided{0};
+constexpr int32_t defaultStreams{1};
+constexpr int32_t defaultIterations{10};
+constexpr float defaultWarmUp{200.F};
+constexpr float defaultDuration{3.F};
+constexpr float defaultSleep{};
+constexpr float defaultIdle{};
+
+// Reporting default params
+constexpr int32_t defaultAvgRuns{10};
+constexpr float defaultPercentile{99};
+
+enum class PrecisionConstraints
+{
+    kNONE,
+    kOBEY,
+    kPREFER
+};
+
+enum class ModelFormat
+{
+    kANY,
+    kCAFFE,
+    kONNX,
+    kUFF
+};
+
+enum class SparsityFlag
+{
+    kDISABLE,
+    kENABLE,
+    kFORCE
+};
+
+enum class TimingCacheMode
+{
+    kDISABLE,
+    kLOCAL,
+    kGLOBAL
+};
+
+using Arguments = std::unordered_multimap<std::string, std::string>;
+
+using IOFormat = std::pair<nvinfer1::DataType, nvinfer1::TensorFormats>;
+
+using ShapeRange = std::array<std::vector<int32_t>, nvinfer1::EnumMax<nvinfer1::OptProfileSelector>()>;
+
+using LayerPrecisions = std::unordered_map<std::string, nvinfer1::DataType>;
+using LayerOutputTypes = std::unordered_map<std::string, std::vector<nvinfer1::DataType>>;
+
+struct Options
+{
+    virtual void parse(Arguments& arguments) = 0;
+};
+
+struct BaseModelOptions : public Options
+{
+    ModelFormat format{ModelFormat::kANY};
+    std::string model;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct UffInput : public Options
+{
+    std::vector<std::pair<std::string, nvinfer1::Dims3>> inputs;
+    bool NHWC{false};
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct ModelOptions : public Options
+{
+    BaseModelOptions baseModel;
+    std::string prototxt;
+    std::vector<std::string> outputs;
+    UffInput uffInputs;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct BuildOptions : public Options
+{
+    int32_t maxBatch{maxBatchNotProvided};
+    double workspace{-1.0};
+    double dlaSRAM{-1.0};
+    double dlaLocalDRAM{-1.0};
+    double dlaGlobalDRAM{-1.0};
+    int32_t minTiming{defaultMinTiming};
+    int32_t avgTiming{defaultAvgTiming};
+    bool tf32{true};
+    bool fp16{false};
+    bool int8{false};
+    bool directIO{false};
+    PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE};
+    LayerPrecisions layerPrecisions;
+    LayerOutputTypes layerOutputTypes;
+    bool safe{false};
+    bool consistency{false};
+    bool restricted{false};
+    bool save{false};
+    bool load{false};
+    bool refittable{false};
+    SparsityFlag sparsity{SparsityFlag::kDISABLE};
+#if (NV_TENSORRT_MAJOR > 7)
+    nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY};
+#else
+    nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kDEFAULT};
+#endif
+    std::string engine;
+    std::string calibration;
+    std::unordered_map<std::string, ShapeRange> shapes;
+    std::unordered_map<std::string, ShapeRange> shapesCalib;
+    std::vector<IOFormat> inputFormats;
+    std::vector<IOFormat> outputFormats;
+    nvinfer1::TacticSources enabledTactics{0};
+    nvinfer1::TacticSources disabledTactics{0};
+    TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL};
+    std::string timingCacheFile{};
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct SystemOptions : public Options
+{
+    int32_t device{defaultDevice};
+    int32_t DLACore{-1};
+    bool fallback{false};
+    std::vector<std::string> plugins;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct InferenceOptions : public Options
+{
+    int32_t batch{batchNotProvided};
+    int32_t iterations{defaultIterations};
+    int32_t streams{defaultStreams};
+    float warmup{defaultWarmUp};
+    float duration{defaultDuration};
+    float sleep{defaultSleep};
+    float idle{defaultIdle};
+    bool overlap{true};
+    bool skipTransfers{false};
+    bool useManaged{false};
+    bool spin{false};
+    bool threads{false};
+    bool graph{false};
+    bool skip{false};
+    bool rerun{false};
+    bool timeDeserialize{false};
+    bool timeRefit{false};
+    std::unordered_map<std::string, std::string> inputs;
+    std::unordered_map<std::string, std::vector<int32_t>> shapes;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct ReportingOptions : public Options
+{
+    bool verbose{false};
+    int32_t avgs{defaultAvgRuns};
+    float percentile{defaultPercentile};
+    bool refit{false};
+    bool output{false};
+    bool profile{false};
+    bool layerInfo{false};
+    std::string exportTimes;
+    std::string exportOutput;
+    std::string exportProfile;
+    std::string exportLayerInfo;
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+struct SafeBuilderOptions : public Options
+{
+    std::string serialized{};
+    std::string onnxModelFile{};
+    bool help{false};
+    bool verbose{false};
+    std::vector<IOFormat> inputFormats;
+    std::vector<IOFormat> outputFormats;
+    bool int8{false};
+    std::string calibFile{};
+    std::vector<std::string> plugins;
+    bool consistency{false};
+    bool standard{false};
+
+    void parse(Arguments& arguments) override;
+
+    static void printHelp(std::ostream& out);
+};
+
+struct AllOptions : public Options
+{
+    ModelOptions model;
+    BuildOptions build;
+    SystemOptions system;
+    InferenceOptions inference;
+    ReportingOptions reporting;
+    bool helps{false};
+
+    void parse(Arguments& arguments) override;
+
+    static void help(std::ostream& out);
+};
+
+Arguments argsToArgumentsMap(int32_t argc, char* argv[]);
+
+bool parseHelp(Arguments& arguments);
+
+void helpHelp(std::ostream& out);
+
+// Functions to print options
+
+std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const UffInput& input);
+
+std::ostream& operator<<(std::ostream& os, const IOFormat& format);
+
+std::ostream& operator<<(std::ostream& os, const ShapeRange& dims);
+
+std::ostream& operator<<(std::ostream& os, const ModelOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const BuildOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const SystemOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const InferenceOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const ReportingOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const AllOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options);
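A minimal sketch of how these declarations are meant to compose (a hypothetical driver, not part of the patch; it assumes the definitions from sampleOptions.cpp above):

    #include <iostream>
    #include "sampleOptions.h"

    int main(int argc, char* argv[])
    {
        sample::Arguments args = sample::argsToArgumentsMap(argc, argv);
        sample::AllOptions options;
        if (sample::parseHelp(args))
        {
            sample::AllOptions::help(std::cout);
            return 0;
        }
        options.parse(args);  // consumes recognized flags from the multimap
        std::cout << options; // operator<< above prints the resolved options
        return 0;
    }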
"x" : "") << dims.d[i]; + } + return os; +} +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole role) +{ + switch (role) + { + case nvinfer1::WeightsRole::kKERNEL: + { + os << "Kernel"; + break; + } + case nvinfer1::WeightsRole::kBIAS: + { + os << "Bias"; + break; + } + case nvinfer1::WeightsRole::kSHIFT: + { + os << "Shift"; + break; + } + case nvinfer1::WeightsRole::kSCALE: + { + os << "Scale"; + break; + } + case nvinfer1::WeightsRole::kCONSTANT: + { + os << "Constant"; + break; + } +#if (NV_TENSORRT_MAJOR > 7) + case nvinfer1::WeightsRole::kANY: + { + os << "Any"; + break; + } +#endif + } + + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const std::vector& vec) +{ + for (int32_t i = 0, e = static_cast(vec.size()); i < e; ++i) + { + os << (i ? "x" : "") << vec[i]; + } + return os; +} + +} // namespace sample + +#endif // TRT_SAMPLES_OPTIONS_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp new file mode 100644 index 00000000..a92938c5 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp @@ -0,0 +1,445 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sampleInference.h" +#include "sampleOptions.h" +#include "sampleReporting.h" + +namespace sample +{ + +namespace +{ + +//! +//! \brief Find percentile in an ascending sequence of timings +//! \note percentile must be in [0, 100]. Otherwise, an exception is thrown. +//! +template +float findPercentile(float percentile, std::vector const& timings, T const& toFloat) +{ + int32_t const all = static_cast(timings.size()); + int32_t const exclude = static_cast((1 - percentile / 100) * all); + if (timings.empty()) + { + return std::numeric_limits::infinity(); + } + if (percentile < 0.0f || percentile > 100.0f) + { + throw std::runtime_error("percentile is not in [0, 100]!"); + } + return toFloat(timings[std::max(all - 1 - exclude, 0)]); +} + +//! +//! \brief Find median in a sorted sequence of timings +//! +template +float findMedian(std::vector const& timings, T const& toFloat) +{ + if (timings.empty()) + { + return std::numeric_limits::infinity(); + } + + int32_t const m = timings.size() / 2; + if (timings.size() % 2) + { + return toFloat(timings[m]); + } + + return (toFloat(timings[m - 1]) + toFloat(timings[m])) / 2; +} + +//! +//! \brief Find coefficient of variance (which is std / mean) in a sorted sequence of timings given the mean +//! 
+//!
+//! \brief Find coefficient of variance (which is std / mean) in a sorted sequence of timings given the mean
+//!
+template <typename T>
+float findCoeffOfVariance(std::vector<InferenceTime> const& timings, T const& toFloat, float mean)
+{
+    if (timings.empty())
+    {
+        return 0;
+    }
+
+    if (mean == 0.F)
+    {
+        return std::numeric_limits<float>::infinity();
+    }
+
+    auto const metricAccumulator = [toFloat, mean](float acc, InferenceTime const& a) {
+        float const diff = toFloat(a) - mean;
+        return acc + diff * diff;
+    };
+    float const variance = std::accumulate(timings.begin(), timings.end(), 0.F, metricAccumulator) / timings.size();
+
+    return std::sqrt(variance) / mean * 100.F;
+}
+
+inline InferenceTime traceToTiming(const InferenceTrace& a)
+{
+    return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart),
+        (a.d2hEnd - a.d2hStart), (a.d2hEnd - a.h2dStart));
+}
+
+} // namespace
+
+void printProlog(int32_t warmups, int32_t timings, float warmupMs, float benchTimeMs, std::ostream& os)
+{
+    os << "Warmup completed " << warmups << " queries over " << warmupMs << " ms" << std::endl;
+    os << "Timing trace has " << timings << " queries over " << benchTimeMs / 1000 << " s" << std::endl;
+}
+
+void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os)
+{
+    int32_t count = 0;
+    InferenceTime sum;
+
+    os << std::endl;
+    os << "=== Trace details ===" << std::endl;
+    os << "Trace averages of " << runsPerAvg << " runs:" << std::endl;
+    for (auto const& t : timings)
+    {
+        sum += t;
+
+        if (++count == runsPerAvg)
+        {
+            // clang-format off
+            os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg
+               << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " << sum.e2e / runsPerAvg
+               << " ms, enqueue " << sum.enq / runsPerAvg << " ms)" << std::endl;
+            // clang-format on
+            count = 0;
+            sum.enq = 0;
+            sum.h2d = 0;
+            sum.compute = 0;
+            sum.d2h = 0;
+            sum.e2e = 0;
+        }
+    }
+}
+
+void printMetricExplanations(std::ostream& os)
+{
+    os << std::endl;
+    os << "=== Explanations of the performance metrics ===" << std::endl;
+    os << "Total Host Walltime: the host walltime from when the first query (after warmups) is enqueued to when the "
+          "last query is completed."
+       << std::endl;
+    os << "GPU Compute Time: the GPU latency to execute the kernels for a query." << std::endl;
+    os << "Total GPU Compute Time: the summation of the GPU Compute Time of all the queries. If this is significantly "
+          "shorter than Total Host Walltime, the GPU may be under-utilized because of host-side overheads or data "
+          "transfers."
+       << std::endl;
+    os << "Throughput: the observed throughput computed by dividing the number of queries by the Total Host Walltime. "
+          "If this is significantly lower than the reciprocal of GPU Compute Time, the GPU may be under-utilized "
+          "because of host-side overheads or data transfers."
+       << std::endl;
+    os << "Enqueue Time: the host latency to enqueue a query. If this is longer than GPU Compute Time, the GPU may be "
+          "under-utilized."
+       << std::endl;
+    os << "H2D Latency: the latency for host-to-device data transfers for input tensors of a single query."
+       << std::endl;
+    os << "D2H Latency: the latency for device-to-host data transfers for output tensors of a single query."
+       << std::endl;
+    os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a "
+          "single query."
+       << std::endl;
+    os << "End-to-End Host Latency: the duration from when the H2D of a query is called to when the D2H of the same "
+          "query is completed, which includes the latency to wait for the completion of the previous query. This is "
+          "the latency of a query if multiple queries are enqueued consecutively."
+       << std::endl;
+}
+
+PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings,
+    std::function<float(InferenceTime const&)> metricGetter, float percentile)
+{
+    auto const metricComparator
+        = [metricGetter](InferenceTime const& a, InferenceTime const& b) { return metricGetter(a) < metricGetter(b); };
+    auto const metricAccumulator = [metricGetter](float acc, InferenceTime const& a) { return acc + metricGetter(a); };
+    std::vector<InferenceTime> newTimings = timings;
+    std::sort(newTimings.begin(), newTimings.end(), metricComparator);
+    PerformanceResult result;
+    result.min = metricGetter(newTimings.front());
+    result.max = metricGetter(newTimings.back());
+    result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, metricAccumulator) / newTimings.size();
+    result.median = findMedian(newTimings, metricGetter);
+    result.percentile = findPercentile(percentile, newTimings, metricGetter);
+    result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean);
+    return result;
+}
+
+void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize,
+    std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose)
+{
+    float const throughput = batchSize * timings.size() / walltimeMs * 1000;
+
+    auto const getLatency = [](InferenceTime const& t) { return t.latency(); };
+    auto const latencyResult = getPerformanceResult(timings, getLatency, percentile);
+
+    auto const getEndToEnd = [](InferenceTime const& t) { return t.e2e; };
+    auto const e2eLatencyResult = getPerformanceResult(timings, getEndToEnd, percentile);
+
+    auto const getEnqueue = [](InferenceTime const& t) { return t.enq; };
+    auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentile);
+
+    auto const getH2d = [](InferenceTime const& t) { return t.h2d; };
+    auto const h2dResult = getPerformanceResult(timings, getH2d, percentile);
+
+    auto const getCompute = [](InferenceTime const& t) { return t.compute; };
+    auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentile);
+
+    auto const getD2h = [](InferenceTime const& t) { return t.d2h; };
+    auto const d2hResult = getPerformanceResult(timings, getD2h, percentile);
+
+    auto const toPerfString = [percentile](const PerformanceResult& r) {
+        std::stringstream s;
+        s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean << " ms, "
+          << "median = " << r.median << " ms, percentile(" << percentile << "%) = " << r.percentile << " ms";
+        return s.str();
+    };
+
+    osInfo << std::endl;
+    osInfo << "=== Performance summary ===" << std::endl;
+    osInfo << "Throughput: " << throughput << " qps" << std::endl;
+    osInfo << "Latency: " << toPerfString(latencyResult) << std::endl;
+    osInfo << "End-to-End Host Latency: " << toPerfString(e2eLatencyResult) << std::endl;
+    osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl;
+    osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl;
+    osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl;
+    osInfo << "D2H Latency: " << toPerfString(d2hResult) << std::endl;
+    osInfo << "Total Host Walltime: " << walltimeMs / 1000 << " s" << std::endl;
+    osInfo << "Total GPU Compute Time: " << gpuComputeResult.mean *
+        timings.size() / 1000 << " s" << std::endl;
+
+    // Report warnings if the throughput is bound by other factors than GPU Compute Time.
+    constexpr float kENQUEUE_BOUND_REPORTING_THRESHOLD{0.8F};
+    if (enqueueResult.median > kENQUEUE_BOUND_REPORTING_THRESHOLD * gpuComputeResult.median)
+    {
+        osWarning
+            << "* Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized."
+            << std::endl;
+        osWarning << "  If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the "
+                     "throughput."
+                  << std::endl;
+    }
+    if (h2dResult.median >= gpuComputeResult.median)
+    {
+        osWarning << "* Throughput may be bound by host-to-device transfers for the inputs rather than GPU Compute and "
+                     "the GPU may be under-utilized."
+                  << std::endl;
+        osWarning << "  Add --noDataTransfers flag to disable data transfers." << std::endl;
+    }
+    if (d2hResult.median >= gpuComputeResult.median)
+    {
+        osWarning << "* Throughput may be bound by device-to-host transfers for the outputs rather than GPU Compute "
+                     "and the GPU may be under-utilized."
+                  << std::endl;
+        osWarning << "  Add --noDataTransfers flag to disable data transfers." << std::endl;
+    }
+
+    // Report warnings if the GPU Compute Time is unstable.
+    constexpr float kUNSTABLE_PERF_REPORTING_THRESHOLD{1.0F};
+    if (gpuComputeResult.coeffVar > kUNSTABLE_PERF_REPORTING_THRESHOLD)
+    {
+        osWarning << "* GPU compute time is unstable, with coefficient of variance = " << gpuComputeResult.coeffVar
+                  << "%." << std::endl;
+        osWarning << "  If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the "
+                  << "stability." << std::endl;
+    }
+
+    // Explain what the metrics mean.
+    osInfo << "Explanations of the performance metrics are printed in the verbose logs." << std::endl;
+    printMetricExplanations(osVerbose);
+
+    osInfo << std::endl;
+}
+
+void printPerformanceReport(std::vector<InferenceTrace> const& trace, const ReportingOptions& reporting, float warmupMs,
+    int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose)
+{
+    auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; };
+    auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup);
+    int32_t const warmups = noWarmup - trace.begin();
+    float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart;
+    // when implicit batch used, batchSize = options.inference.batch, which is parsed through --batch
+    // when explicit batch used, batchSize = options.inference.batch = 0
+    // treat inference with explicit batch as a single query and report the throughput
+    batchSize = batchSize ? batchSize : 1;
+    printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, warmupMs, benchTime, osInfo);
+
+    std::vector<InferenceTime> timings(trace.size() - warmups);
+    std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming);
+    printTiming(timings, reporting.avgs, osInfo);
+    printEpilog(timings, benchTime, reporting.percentile, batchSize, osInfo, osWarning, osVerbose);
+
+    if (!reporting.exportTimes.empty())
+    {
+        exportJSONTrace(trace, reporting.exportTimes);
+    }
+}
+
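For reference, a single record of the exported trace might look like this (illustrative values only; the keys match exportJSONTrace below):

    [  { "startEnqMs" : 0.01, "endEnqMs" : 0.05, "startH2dMs" : 0.05, "endH2dMs" : 0.31,
         "startComputeMs" : 0.31, "endComputeMs" : 2.02, "startD2hMs" : 2.02, "endD2hMs" : 2.20,
         "h2dMs" : 0.26, "computeMs" : 1.71, "d2hMs" : 0.18, "latencyMs" : 2.15, "endToEndMs" : 2.15 } ]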
+//! Printed format:
+//! [ value, ...]
+//! value ::= { "start enq : time, "end enq" : time, "start h2d" : time, "end h2d" : time, "start compute" : time,
+//!             "end compute" : time, "start d2h" : time, "end d2h" : time, "h2d" : time, "compute" : time,
+//!             "d2h" : time, "latency" : time, "end to end" : time }
+//!
+void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName)
+{
+    std::ofstream os(fileName, std::ofstream::trunc);
+    os << "[" << std::endl;
+    char const* sep = "  ";
+    for (auto const& t : trace)
+    {
+        InferenceTime const it(traceToTiming(t));
+        os << sep << "{ ";
+        sep = ", ";
+        // clang-format off
+        os << "\"startEnqMs\" : "     << t.enqStart     << sep << "\"endEnqMs\" : "     << t.enqEnd     << sep
+           << "\"startH2dMs\" : "     << t.h2dStart     << sep << "\"endH2dMs\" : "     << t.h2dEnd     << sep
+           << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep
+           << "\"startD2hMs\" : "     << t.d2hStart     << sep << "\"endD2hMs\" : "     << t.d2hEnd     << sep
+           << "\"h2dMs\" : "          << it.h2d         << sep << "\"computeMs\" : "    << it.compute   << sep
+           << "\"d2hMs\" : "          << it.d2h         << sep << "\"latencyMs\" : "    << it.latency() << sep
+           << "\"endToEndMs\" : "     << it.e2e << " }" << std::endl;
+        // clang-format on
+    }
+    os << "]" << std::endl;
+}
+
+void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept
+{
+    if (mIterator == mLayers.end())
+    {
+        bool const first = !mLayers.empty() && mLayers.begin()->name == layerName;
+        mUpdatesCount += mLayers.empty() || first;
+        if (first)
+        {
+            mIterator = mLayers.begin();
+        }
+        else
+        {
+            mLayers.emplace_back();
+            mLayers.back().name = layerName;
+            mIterator = mLayers.end() - 1;
+        }
+    }
+
+    mIterator->timeMs += timeMs;
+    ++mIterator;
+}
+
+void Profiler::print(std::ostream& os) const noexcept
+{
+    std::string const nameHdr("Layer");
+    std::string const timeHdr("   Time (ms)");
+    std::string const avgHdr("   Avg. Time (ms)");
+    std::string const percentageHdr("   Time %");
+
+    float const totalTimeMs = getTotalTime();
+
+    auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { return a.name.size() < b.name.size(); };
+    auto const longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer);
+    auto const nameLength = std::max(longestName->name.size() + 1, nameHdr.size());
+    auto const timeLength = timeHdr.size();
+    auto const avgLength = avgHdr.size();
+    auto const percentageLength = percentageHdr.size();
+
+    os << std::endl
+       << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl
+       << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr << std::endl;
+
+    for (auto const& p : mLayers)
+    {
+        // clang-format off
+        os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs
+           << std::setw(avgLength) << std::fixed << std::setprecision(4) << p.timeMs / mUpdatesCount
+           << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100
+           << std::endl;
+    }
+    {
+        os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2)
+           << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount
+           << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl;
+        // clang-format on
+    }
+    os << std::endl;
+}
+
+void Profiler::exportJSONProfile(std::string const& fileName) const noexcept
+{
+    std::ofstream os(fileName, std::ofstream::trunc);
+    os << "[" << std::endl << "  { \"count\" : " << mUpdatesCount << " }" << std::endl;
+
+    auto const totalTimeMs = getTotalTime();
+
+    for (auto const& l : mLayers)
+    {
+        // clang-format off
+        os << ", {" << " \"name\" : \""      << l.name << "\""
+              ", \"timeMs\" : "     << l.timeMs
+           << ", \"averageMs\" : "  << l.timeMs / mUpdatesCount
+           << ", \"percentage\" : " << l.timeMs /
+              totalTimeMs * 100
+           << " }" << std::endl;
+        // clang-format on
+    }
+    os << "]" << std::endl;
+}
+
+void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os)
+{
+    os << "Input Tensors:" << std::endl;
+    bindings.dumpInputs(context, os);
+}
+
+void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os)
+{
+    os << "Output Tensors:" << std::endl;
+    bindings.dumpOutputs(context, os);
+}
+
+void exportJSONOutput(
+    nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch)
+{
+    std::ofstream os(fileName, std::ofstream::trunc);
+    std::string sep = "  ";
+    auto const output = bindings.getOutputBindings();
+    os << "[" << std::endl;
+    for (auto const& binding : output)
+    {
+        // clang-format off
+        os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl;
+        sep = ", ";
+        os << "  " << sep << "\"dimensions\" : \"";
+        bindings.dumpBindingDimensions(binding.second, context, os);
+        os << "\"" << std::endl;
+        os << "  " << sep << "\"values\" : [ ";
+        bindings.dumpBindingValues(context, binding.second, os, sep, batch);
+        os << " ]" << std::endl << "  }" << std::endl;
+        // clang-format on
+    }
+    os << "]" << std::endl;
+}
+
+} // namespace sample
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h
new file mode 100644
index 00000000..5f730987
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_REPORTING_H
+#define TRT_SAMPLE_REPORTING_H
+
+#include <functional>
+#include <iostream>
+
+#include "NvInfer.h"
+
+#include "sampleOptions.h"
+#include "sampleUtils.h"
+
+namespace sample
+{
+
+//!
+//! \struct InferenceTime
+//! \brief Measurement times in milliseconds
+//!
+struct InferenceTime
+{
+    InferenceTime(float q, float i, float c, float o, float e)
+        : enq(q)
+        , h2d(i)
+        , compute(c)
+        , d2h(o)
+        , e2e(e)
+    {
+    }
+
+    InferenceTime() = default;
+    InferenceTime(InferenceTime const&) = default;
+    InferenceTime(InferenceTime&&) = default;
+    InferenceTime& operator=(InferenceTime const&) = default;
+    InferenceTime& operator=(InferenceTime&&) = default;
+    ~InferenceTime() = default;
+
+    float enq{0};     // Enqueue
+    float h2d{0};     // Host to Device
+    float compute{0}; // Compute
+    float d2h{0};     // Device to Host
+    float e2e{0};     // end to end
+
+    // ideal latency
+    float latency() const
+    {
+        return h2d + compute + d2h;
+    }
+};
+
+//!
+//! \struct InferenceTrace
+//! \brief Measurement points in milliseconds
+//!
+struct InferenceTrace
+{
+    InferenceTrace(int32_t s, float es, float ee, float is, float ie, float cs, float ce, float os, float oe)
+        : stream(s)
+        , enqStart(es)
+        , enqEnd(ee)
+        , h2dStart(is)
+        , h2dEnd(ie)
+        , computeStart(cs)
+        , computeEnd(ce)
+        , d2hStart(os)
+        , d2hEnd(oe)
+    {
+    }
+
+    InferenceTrace() = default;
+    InferenceTrace(InferenceTrace const&) = default;
+    InferenceTrace(InferenceTrace&&) = default;
+    InferenceTrace& operator=(InferenceTrace const&) = default;
+    InferenceTrace& operator=(InferenceTrace&&) = default;
+    ~InferenceTrace() = default;
+
+    int32_t stream{0};
+    float enqStart{0};
+    float enqEnd{0};
+    float h2dStart{0};
+    float h2dEnd{0};
+    float computeStart{0};
+    float computeEnd{0};
+    float d2hStart{0};
+    float d2hEnd{0};
+};
+
+inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b)
+{
+    return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h, a.e2e + b.e2e);
+}
+
+inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b)
+{
+    return a = a + b;
+}
+
+//!
+//! \struct PerformanceResult
+//! \brief Performance result of a performance metric
+//!
+struct PerformanceResult
+{
+    float min{0};
+    float max{0};
+    float mean{0};
+    float median{0};
+    float percentile{0};
+    float coeffVar{0}; // coefficient of variation
+};
+
+//!
+//! \brief Print benchmarking time and number of traces collected
+//!
+void printProlog(int32_t warmups, int32_t timings, float warmupMs, float walltime, std::ostream& os);
+
+//!
+//! \brief Print a timing trace
+//!
+void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os);
+
+//!
+//! \brief Print the performance summary of a trace
+//!
+void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize,
+    std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose);
+
+//!
+//! \brief Get the result of a specific performance metric from a trace
+//!
+PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings,
+    std::function<float(InferenceTime const&)> metricGetter, float percentile);
+
+//!
+//! \brief Print the explanations of the performance metrics printed in printEpilog() function.
+//!
+void printMetricExplanations(std::ostream& os);
+
+//!
+//! \brief Print and summarize a timing trace
+//!
+void printPerformanceReport(std::vector<InferenceTrace> const& trace, ReportingOptions const& reporting, float warmupMs,
+    int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose);
+
+//!
+//! \brief Export a timing trace to JSON file
+//!
+void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName);
+
+//!
+//! \brief Print input tensors to stream
+//!
+void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os);
+
+//!
+//! \brief Print output tensors to stream
+//!
+void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os);
+
+//!
+//! \brief Export output tensors to JSON file
+//!
+void exportJSONOutput(
+    nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch);
+
+//!
+//! \struct LayerProfile
+//! \brief Layer profile information
+//!
+struct LayerProfile
+{
+    std::string name;
+    float timeMs{0};
+};
+
+//!
+//! \class Profiler
+//! \brief Collect per-layer profile information, assuming times are reported in the same order
+//!
+class Profiler : public nvinfer1::IProfiler
+{
+
+public:
+    void reportLayerTime(char const* layerName, float timeMs) noexcept override;
+
+    void print(std::ostream& os) const noexcept;
+
+    //!
+    //! \brief Export a profile to JSON file
+    //!
+    void exportJSONProfile(std::string const& fileName) const noexcept;
+
+private:
+    float getTotalTime() const noexcept
+    {
+        auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs; };
+        return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime);
+    }
+
+    std::vector<LayerProfile> mLayers;
+    std::vector<LayerProfile>::iterator mIterator{mLayers.begin()};
+    int32_t mUpdatesCount{0};
+};
+
+} // namespace sample
+
+#endif // TRT_SAMPLE_REPORTING_H
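A minimal usage sketch for this Profiler (it assumes an existing engine, execution context, device buffers and CUDA stream; enqueueV2 is the pre-TensorRT-10 API these deprecated commons target, and profiled enqueues run effectively synchronously):

    sample::Profiler profiler;
    context->setProfiler(&profiler);   // IExecutionContext takes a raw IProfiler*
    for (int i = 0; i < 10; ++i)
    {
        context->enqueueV2(buffers, stream, nullptr);
    }
    cudaStreamSynchronize(stream);     // layer times are reported once kernels finish
    profiler.print(std::cout);
    profiler.exportJSONProfile("profile.json");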
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h
new file mode 100644
index 00000000..1509a7fc
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h
@@ -0,0 +1,543 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_UTILS_H
+#define TRT_SAMPLE_UTILS_H
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <random>
+#include <unordered_map>
+#include <vector>
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+
+#include "NvInfer.h"
+
+#include "common.h"
+#include "logger.h"
+#include "sampleDevice.h"
+#include "sampleOptions.h"
+
+namespace sample
+{
+
+inline int dataTypeSize(nvinfer1::DataType dataType)
+{
+    switch (dataType)
+    {
+    case nvinfer1::DataType::kINT32:
+    case nvinfer1::DataType::kFLOAT: return 4;
+    case nvinfer1::DataType::kHALF: return 2;
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kINT8: return 1;
+    }
+    return 0;
+}
+
+template <typename T>
+inline T roundUp(T m, T n)
+{
+    return ((m + n - 1) / n) * n;
+}
+
+inline int volume(const nvinfer1::Dims& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int>());
+}
+
+//! comps is the number of components in a vector. Ignored if vecDim < 0.
+inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch)
+{
+    int maxNbElems = 1;
+    for (int i = 0; i < dims.nbDims; ++i)
+    {
+        // Get effective length of axis.
+        int d = dims.d[i];
+        // Any dimension is 0, it is an empty tensor.
+        if (d == 0)
+        {
+            return 0;
+        }
+        if (i == vecDim)
+        {
+            d = samplesCommon::divUp(d, comps);
+        }
+        maxNbElems = std::max(maxNbElems, d * strides.d[i]);
+    }
+    return static_cast<int64_t>(maxNbElems) * batch * (vecDim < 0 ? 1 : comps);
+}
+
+inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch)
+{
+    if (vecDim != -1)
+    {
+        dims.d[vecDim] = roundUp(dims.d[vecDim], comps);
+    }
+    return volume(dims) * std::max(batch, 1);
+}
+
+inline nvinfer1::Dims toDims(const std::vector<int32_t>& vec)
+{
+    int limit = static_cast<int>(nvinfer1::Dims::MAX_DIMS);
+    if (static_cast<int>(vec.size()) > limit)
+    {
+        sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl;
+    }
+    // Pick first nvinfer1::Dims::MAX_DIMS elements
+    nvinfer1::Dims dims{std::min(static_cast<int>(vec.size()), limit), {}};
+    std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
+    return dims;
+}
+
+template <typename T>
+inline void fillBuffer(void* buffer, int64_t volume, T min, T max)
+{
+    T* typedBuffer = static_cast<T*>(buffer);
+    std::default_random_engine engine;
+    if (std::is_integral<T>::value)
+    {
+        std::uniform_int_distribution<int32_t> distribution(min, max);
+        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
+        std::generate(typedBuffer, typedBuffer + volume, generator);
+    }
+    else
+    {
+        std::uniform_real_distribution<float> distribution(min, max);
+        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
+        std::generate(typedBuffer, typedBuffer + volume, generator);
+    }
+}
+
+// Specialization needed for custom type __half
+template <typename H>
+inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max)
+{
+    H* typedBuffer = static_cast<H*>(buffer);
+    std::default_random_engine engine;
+    std::uniform_real_distribution<float> distribution(min, max);
+    auto generator = [&engine, &distribution]() { return static_cast<H>(distribution(engine)); };
+    std::generate(typedBuffer, typedBuffer + volume, generator);
+}
+template <>
+inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max)
+{
+    fillBufferHalf(buffer, volume, min, max);
+}
+
+template <typename T>
+inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const nvinfer1::Dims& dims,
+    const nvinfer1::Dims& strides, int32_t vectorDim, int32_t spv)
+{
+    const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int64_t>());
+    const T* typedBuffer = static_cast<const T*>(buffer);
+    std::string sep;
+    for (int64_t v = 0; v < volume; ++v)
+    {
+        int64_t curV = v;
+        int32_t dataOffset = 0;
+        for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex)
+        {
+            int32_t dimVal = curV % dims.d[dimIndex];
+            if (dimIndex == vectorDim)
+            {
+                dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv;
+            }
+            else
+            {
+                dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ?
+                    1 : spv);
+            }
+            curV /= dims.d[dimIndex];
+            ASSERT(curV >= 0);
+        }
+
+        os << sep << typedBuffer[dataOffset];
+        sep = separator;
+    }
+}
+
+inline void loadFromFile(std::string const& fileName, char* dst, size_t size)
+{
+    ASSERT(dst);
+
+    std::ifstream file(fileName, std::ios::in | std::ios::binary);
+    if (file.is_open())
+    {
+        file.read(dst, size);
+        file.close();
+    }
+    else
+    {
+        std::stringstream msg;
+        msg << "Cannot open file " << fileName << "!";
+        throw std::invalid_argument(msg.str());
+    }
+}
+
+struct Binding
+{
+    bool isInput{false};
+    std::unique_ptr<IMirroredBuffer> buffer;
+    int64_t volume{0};
+    nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT};
+
+    void fill(const std::string& fileName)
+    {
+        loadFromFile(fileName, static_cast<char*>(buffer->getHostBuffer()), buffer->getSize());
+    }
+
+    void fill()
+    {
+        switch (dataType)
+        {
+        case nvinfer1::DataType::kBOOL:
+        {
+            fillBuffer<bool>(buffer->getHostBuffer(), volume, 0, 1);
+            break;
+        }
+        case nvinfer1::DataType::kINT32:
+        {
+            fillBuffer<int32_t>(buffer->getHostBuffer(), volume, -128, 127);
+            break;
+        }
+        case nvinfer1::DataType::kINT8:
+        {
+            fillBuffer<int8_t>(buffer->getHostBuffer(), volume, -128, 127);
+            break;
+        }
+        case nvinfer1::DataType::kFLOAT:
+        {
+            fillBuffer<float>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+            break;
+        }
+        case nvinfer1::DataType::kHALF:
+        {
+            fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
+            break;
+        }
+        }
+    }
+
+    void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv,
+        const std::string separator = " ") const
+    {
+        switch (dataType)
+        {
+        case nvinfer1::DataType::kBOOL:
+        {
+            dumpBuffer<bool>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        case nvinfer1::DataType::kINT32:
+        {
+            dumpBuffer<int32_t>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        case nvinfer1::DataType::kINT8:
+        {
+            dumpBuffer<int8_t>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        case nvinfer1::DataType::kFLOAT:
+        {
+            dumpBuffer<float>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        case nvinfer1::DataType::kHALF:
+        {
+            dumpBuffer<__half>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
+            break;
+        }
+        }
+    }
+};
+
+class Bindings
+{
+public:
+    Bindings() = delete;
+    explicit Bindings(bool useManaged)
+        : mUseManaged(useManaged)
+    {
+    }
+
+    void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType,
+        const std::string& fileName = "")
+    {
+        while (mBindings.size() <= static_cast<size_t>(b))
+        {
+            mBindings.emplace_back();
+            mDevicePointers.emplace_back();
+        }
+        mNames[name] = b;
+        if (mBindings[b].buffer == nullptr)
+        {
+            if (mUseManaged)
+                mBindings[b].buffer.reset(new UnifiedMirroredBuffer);
+            else
+                mBindings[b].buffer.reset(new DiscreteMirroredBuffer);
+        }
+        mBindings[b].isInput = isInput;
+        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
+        // even for empty tensors, so allocate a dummy byte.
+        if (volume == 0)
+            mBindings[b].buffer->allocate(1);
+        else
+            mBindings[b].buffer->allocate(static_cast<size_t>(volume) * static_cast<size_t>(dataTypeSize(dataType)));
+
+        mBindings[b].volume = volume;
+        mBindings[b].dataType = dataType;
+        mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer();
+        if (isInput)
+        {
+            if (fileName.empty())
+                fill(b);
+            else
+                fill(b, fileName);
+        }
+    }
+
+    void** getDeviceBuffers()
+    {
+        return mDevicePointers.data();
+    }
+
+    void transferInputToDevice(TrtCudaStream& stream)
+    {
+        for (auto& b : mNames)
+        {
+            if (mBindings[b.second].isInput)
+                mBindings[b.second].buffer->hostToDevice(stream);
+        }
+    }
+
+    void transferOutputToHost(TrtCudaStream& stream)
+    {
+        for (auto& b : mNames)
+        {
+            if (!mBindings[b.second].isInput)
+                mBindings[b.second].buffer->deviceToHost(stream);
+        }
+    }
+
+    void fill(int binding, const std::string& fileName)
+    {
+        mBindings[binding].fill(fileName);
+    }
+
+    void fill(int binding)
+    {
+        mBindings[binding].fill();
+    }
+
+    void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const
+    {
+        const auto dims = context.getBindingDimensions(binding);
+        // Do not add a newline terminator, because the caller may be outputting a JSON string.
+        os << dims;
+    }
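A sketch of how a Bindings object is typically populated and used around one inference (assumes a deserialized engine, the pre-TensorRT-10 binding-index API used throughout this file, and TrtCudaStream from sampleDevice.h):

    sample::Bindings bindings(false /*useManaged*/);
    const nvinfer1::ICudaEngine& engine = context->getEngine();
    for (int b = 0; b < engine.getNbBindings(); ++b)
    {
        const auto dims = context->getBindingDimensions(b);
        bindings.addBinding(b, engine.getBindingName(b), engine.bindingIsInput(b),
            sample::volume(dims), engine.getBindingDataType(b));
    }
    sample::TrtCudaStream stream;
    bindings.transferInputToDevice(stream);
    context->enqueueV2(bindings.getDeviceBuffers(), stream.get(), nullptr);
    bindings.transferOutputToHost(stream);
    stream.synchronize();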
+    void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os,
+        const std::string& separator = " ", int32_t batch = 1) const
+    {
+        nvinfer1::Dims dims = context.getBindingDimensions(binding);
+        nvinfer1::Dims strides = context.getStrides(binding);
+        int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding);
+        const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding);
+
+        if (context.getEngine().hasImplicitBatchDimension())
+        {
+            auto insertN = [](nvinfer1::Dims& d, int32_t bs) {
+                const int32_t nbDims = d.nbDims;
+                ASSERT(nbDims < nvinfer1::Dims::MAX_DIMS);
+                std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]);
+                d.d[0] = bs;
+                d.nbDims = nbDims + 1;
+            };
+            int32_t batchStride = 0;
+            for (int32_t i = 0; i < strides.nbDims; ++i)
+            {
+                if (strides.d[i] * dims.d[i] > batchStride)
+                {
+                    batchStride = strides.d[i] * dims.d[i];
+                }
+            }
+            insertN(dims, batch);
+            insertN(strides, batchStride);
+            vectorDim = (vectorDim == -1) ? -1 : vectorDim + 1;
+        }
+
+        mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator);
+    }
+
+    void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
+    {
+        auto isInput = [](const Binding& b) { return b.isInput; };
+        dumpBindings(context, isInput, os);
+    }
+
+    void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
+    {
+        auto isOutput = [](const Binding& b) { return !b.isInput; };
+        dumpBindings(context, isOutput, os);
+    }
+
+    void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const
+    {
+        auto all = [](const Binding& /*b*/) { return true; };
+        dumpBindings(context, all, os);
+    }
+
+    void dumpBindings(
+        const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const
+    {
+        for (const auto& n : mNames)
+        {
+            const auto binding = n.second;
+            if (predicate(mBindings[binding]))
+            {
+                os << n.first << ": (";
+                dumpBindingDimensions(binding, context, os);
+                os << ")" << std::endl;
+
+                dumpBindingValues(context, binding, os);
+                os << std::endl;
+            }
+        }
+    }
+
+    std::unordered_map<std::string, int> getInputBindings() const
+    {
+        auto isInput = [](const Binding& b) { return b.isInput; };
+        return getBindings(isInput);
+    }
+
+    std::unordered_map<std::string, int> getOutputBindings() const
+    {
+        auto isOutput = [](const Binding& b) { return !b.isInput; };
+        return getBindings(isOutput);
+    }
+
+    std::unordered_map<std::string, int> getBindings() const
+    {
+        auto all = [](const Binding& /*b*/) { return true; };
+        return getBindings(all);
+    }
+
+    std::unordered_map<std::string, int> getBindings(bool (*predicate)(const Binding& b)) const
+    {
+        std::unordered_map<std::string, int> bindings;
+        for (const auto& n : mNames)
+        {
+            const auto binding = n.second;
+            if (predicate(mBindings[binding]))
+                bindings.insert(n);
+        }
+        return bindings;
+    }
+
+private:
+    std::unordered_map<std::string, int> mNames;
+    std::vector<Binding> mBindings;
+    std::vector<void*> mDevicePointers;
+    bool mUseManaged{false};
+};
+
+template <typename T>
+struct TrtDestroyer
+{
+    void operator()(T* t)
+    {
+        //t->destroy();
+        delete t;
+    }
+};
+
+template <typename T>
+using TrtUniquePtr = std::unique_ptr<T, TrtDestroyer<T>>;
+
+inline bool broadcastIOFormats(const std::vector<IOFormat>& formats, size_t nbBindings, bool isInput = true)
+{
+    bool broadcast = formats.size() == 1;
+    bool validFormatsCount = broadcast || (formats.size() == nbBindings);
+    if (!formats.empty() && !validFormatsCount)
+    {
+        if (isInput)
+        {
+            throw std::invalid_argument(
+                "The number of inputIOFormats must match network's inputs or be one for broadcasting.");
+        }
+        else
+        {
+            throw std::invalid_argument(
+                "The number of outputIOFormats must match network's outputs or be one for broadcasting.");
+        }
+    }
+    return broadcast;
+}
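The timing-cache helpers below are designed for a round trip like the following sketch (assumes a valid nvinfer1::IBuilderConfig* config; an empty blob simply creates a fresh cache, and the file name is a placeholder):

    auto blob = sample::loadTimingCacheFile("model.timing.cache");
    std::unique_ptr<nvinfer1::ITimingCache> cache(
        config->createTimingCache(blob.data(), blob.size()));
    config->setTimingCache(*cache, false /*ignoreMismatch*/);
    // ... build the engine with this config ...
    std::unique_ptr<nvinfer1::IHostMemory> serialized(cache->serialize());
    sample::saveTimingCacheFile("model.timing.cache", serialized.get());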
+inline std::vector<char> loadTimingCacheFile(const std::string inFileName)
+{
+    std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
+    if (!iFile)
+    {
+        sample::gLogWarning << "Could not read timing cache from: " << inFileName
+                            << ". A new timing cache will be generated and written." << std::endl;
+        return std::vector<char>();
+    }
+    iFile.seekg(0, std::ifstream::end);
+    size_t fsize = iFile.tellg();
+    iFile.seekg(0, std::ifstream::beg);
+    std::vector<char> content(fsize);
+    iFile.read(content.data(), fsize);
+    iFile.close();
+    sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl;
+    return content;
+}
+
+inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob)
+{
+    std::ofstream oFile(outFileName, std::ios::out | std::ios::binary);
+    if (!oFile)
+    {
+        sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl;
+        return;
+    }
+    oFile.write((char*) blob->data(), blob->size());
+    oFile.close();
+    sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl;
+}
+
+inline int32_t getCudaDriverVersion()
+{
+    int32_t version{-1};
+    cudaCheck(cudaDriverGetVersion(&version));
+    return version;
+}
+
+inline int32_t getCudaRuntimeVersion()
+{
+    int32_t version{-1};
+    cudaCheck(cudaRuntimeGetVersion(&version));
+    return version;
+}
+
+} // namespace sample
+
+#endif // TRT_SAMPLE_UTILS_H
diff --git a/src/Detector/tensorrt_yolo/yolo.cpp b/src/Detector/tensorrt_yolo/yolo.cpp
index a60d3dc4..4ee202b6 100644
--- a/src/Detector/tensorrt_yolo/yolo.cpp
+++ b/src/Detector/tensorrt_yolo/yolo.cpp
@@ -78,7 +78,31 @@ Yolo::Yolo(const NetworkInfo& networkInfo, const InferParams& inferParams)
     assert(m_Engine != nullptr);
     m_Context = m_Engine->createExecutionContext();
     assert(m_Context != nullptr);
+
+    auto numBindings = m_Engine->getNbIOTensors();
+    //std::cout << "** Bindings: " << numBindings << " **" << std::endl;
+    for (int32_t i = 0; i < numBindings; ++i)
+    {
+        std::string bindName = m_Engine->getIOTensorName(i);
+        m_tensorNames.emplace(bindName, i);
+        nvinfer1::Dims dim = m_Engine->getTensorShape(bindName.c_str());
+
+        std::cout << i << ": name: " << bindName;
+        std::cout << ", size: ";
+        for (int j = 0; j < dim.nbDims; ++j)
+        {
+            std::cout << dim.d[j];
+            if (j < dim.nbDims - 1)
+                std::cout << "x";
+        }
+        std::cout << std::endl;
+
+        if (m_InputBlobName == bindName)
+            m_InputBindingIndex = i;
+    }
+#if (NV_TENSORRT_MAJOR < 9)
     m_InputBindingIndex = m_Engine->getBindingIndex(m_InputBlobName.c_str());
+#endif
     assert(m_InputBindingIndex != -1);
     assert(m_BatchSize <= static_cast<uint32_t>(m_Engine->getMaxBatchSize()));
     allocateBuffers();
@@ -464,7 +488,14 @@ void Yolo::createYOLOEngine(const nvinfer1::DataType dataType, Int8EntropyCalibr
 
     // Build the engine
     std::cout << "Building the TensorRT Engine..." << std::endl;
-    m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config);
+#if (NV_TENSORRT_MAJOR < 9)
+    m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config);
+#else
+    nvinfer1::IRuntime* inferRuntime = nvinfer1::createInferRuntime(m_Logger);
+    nvinfer1::IHostMemory* serialNetwork = m_Builder->buildSerializedNetwork(*m_Network, *config);
+    m_Engine = inferRuntime->deserializeCudaEngine(serialNetwork->data(), serialNetwork->size());
+    delete inferRuntime;
+#endif
     assert(m_Engine != nullptr);
     std::cout << "Building complete!" << std::endl;
@@ -942,7 +973,15 @@ void Yolo::create_engine_yolov5(const nvinfer1::DataType dataType, Int8EntropyCa
 #endif
     // Build the engine
     std::cout << "Building the TensorRT Engine..." << std::endl;
+#if (NV_TENSORRT_MAJOR < 9)
     m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config);
+#else
+    nvinfer1::IRuntime* inferRuntime = nvinfer1::createInferRuntime(m_Logger);
+    nvinfer1::IHostMemory* serialNetwork = m_Builder->buildSerializedNetwork(*m_Network, *config);
+    m_Engine = inferRuntime->deserializeCudaEngine(serialNetwork->data(), serialNetwork->size());
+    delete inferRuntime;
+#endif
+
     assert(m_Engine != nullptr);
     std::cout << "Building complete!" << std::endl;
@@ -987,7 +1026,8 @@ void Yolo::doInference(const unsigned char* input, const uint32_t batchSize)
                                    batchSize * m_InputSize * sizeof(float),
                                    cudaMemcpyHostToDevice, m_CudaStream));
-    m_Context->enqueue(batchSize, m_DeviceBuffers.data(), m_CudaStream, nullptr);
+    //m_Context->enqueueV3(batchSize, m_DeviceBuffers.data(), m_CudaStream, nullptr);
+    m_Context->enqueueV3(m_CudaStream);
     for (auto& tensor : m_OutputTensors)
     {
         NV_CUDA_CHECK(cudaMemcpyAsync(tensor.hostBuffer, m_DeviceBuffers.at(tensor.bindingIndex),
@@ -1249,8 +1289,7 @@ void Yolo::parse_cfg_blocks_v5(const std::vector
-    m_DeviceBuffers.resize(m_Engine->getNbBindings(), nullptr);
+    m_DeviceBuffers.resize(m_Engine->getNbIOTensors(), nullptr);
     assert(m_InputBindingIndex != -1 && "Invalid input binding index");
-    NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(m_InputBindingIndex),
-                             m_BatchSize * m_InputSize * sizeof(float)));
+    NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(m_InputBindingIndex), m_BatchSize * m_InputSize * sizeof(float)));
     for (auto& tensor : m_OutputTensors)
     {
+#if (NV_TENSORRT_MAJOR < 9)
         tensor.bindingIndex = m_Engine->getBindingIndex(tensor.blobName.c_str());
+#else
+        auto it = m_tensorNames.find(tensor.blobName);
+        tensor.bindingIndex = (it != std::end(m_tensorNames)) ? it->second : -1;
+#endif
         assert((tensor.bindingIndex != -1) && "Invalid output binding index");
-        NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(tensor.bindingIndex),
-                                 m_BatchSize * tensor.volume * sizeof(float)));
-        NV_CUDA_CHECK(
-            cudaMallocHost(&tensor.hostBuffer, tensor.volume * m_BatchSize * sizeof(float)));
+        NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(tensor.bindingIndex), m_BatchSize * tensor.volume * sizeof(float)));
+        NV_CUDA_CHECK(cudaMallocHost((void**)&tensor.hostBuffer, tensor.volume * m_BatchSize * sizeof(float)));
     }
 }
diff --git a/src/Detector/tensorrt_yolo/yolo.h b/src/Detector/tensorrt_yolo/yolo.h
index be347d19..4cfdba16 100644
--- a/src/Detector/tensorrt_yolo/yolo.h
+++ b/src/Detector/tensorrt_yolo/yolo.h
@@ -158,6 +158,7 @@ class Yolo
     std::vector<void*> m_DeviceBuffers;
     int m_InputBindingIndex = -1;
     cudaStream_t m_CudaStream = nullptr;
+    std::map<std::string, int32_t> m_tensorNames;
 
     virtual std::vector<BBoxInfo> decodeTensor(const int imageIdx, const int imageH, const int imageW,
                                                const TensorInfo& tensor) = 0;
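One caveat about the enqueueV3 migration above: in TensorRT 10 device buffers are bound by tensor name rather than positional index, so each address has to be registered before the call. A sketch of the expected pattern, using the m_tensorNames map introduced in this patch (its exact placement inside doInference is an assumption, since this hunk does not show where the addresses are set):

    // Hypothetical binding step before m_Context->enqueueV3(m_CudaStream):
    for (const auto& [name, index] : m_tensorNames)
    {
        m_Context->setTensorAddress(name.c_str(), m_DeviceBuffers[index]);
    }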